Message ID | 20201201082142.649007-1-pcc@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | arm/hvf: Optimize and simplify WFI handling | expand |
Hi Peter, On 01.12.20 09:21, Peter Collingbourne wrote: > Sleep on WFx until the VTIMER is due but allow ourselves to be woken > up on IPI. > > Signed-off-by: Peter Collingbourne <pcc@google.com> Thanks a bunch! > --- > Alexander Graf wrote: >> I would love to take a patch from you here :). I'll still be stuck for a >> while with the sysreg sync rework that Peter asked for before I can look >> at WFI again. > Okay, here's a patch :) It's a relatively straightforward adaptation > of what we have in our fork, which can now boot Android to GUI while > remaining at around 4% CPU when idle. > > I'm not set up to boot a full Linux distribution at the moment so I > tested it on upstream QEMU by running a recent mainline Linux kernel > with a rootfs containing an init program that just does sleep(5) > and verified that the qemu process remains at low CPU usage during > the sleep. This was on top of your v2 plus the last patch of your v1 > since it doesn't look like you have a replacement for that logic yet. > > accel/hvf/hvf-cpus.c | 5 +-- > include/sysemu/hvf_int.h | 3 +- > target/arm/hvf/hvf.c | 94 +++++++++++----------------------------- > 3 files changed, 28 insertions(+), 74 deletions(-) > > diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c > index 4360f64671..b2c8fb57f6 100644 > --- a/accel/hvf/hvf-cpus.c > +++ b/accel/hvf/hvf-cpus.c > @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu) > sigact.sa_handler = dummy_signal; > sigaction(SIG_IPI, &sigact, NULL); > > - pthread_sigmask(SIG_BLOCK, NULL, &set); > - sigdelset(&set, SIG_IPI); > - pthread_sigmask(SIG_SETMASK, &set, NULL); > + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask); > + sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI); What will this do to the x86 hvf implementation? We're now not unblocking SIG_IPI again for that, right? 
> > #ifdef __aarch64__ > r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); > diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h > index c56baa3ae8..13adf6ea77 100644 > --- a/include/sysemu/hvf_int.h > +++ b/include/sysemu/hvf_int.h > @@ -62,8 +62,7 @@ extern HVFState *hvf_state; > struct hvf_vcpu_state { > uint64_t fd; > void *exit; > - struct timespec ts; > - bool sleeping; > + sigset_t unblock_ipi_mask; > }; > > void assert_hvf_ok(hv_return_t ret); > diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c > index 8fe10966d2..60a361ff38 100644 > --- a/target/arm/hvf/hvf.c > +++ b/target/arm/hvf/hvf.c > @@ -2,6 +2,7 @@ > * QEMU Hypervisor.framework support for Apple Silicon > > * Copyright 2020 Alexander Graf <agraf@csgraf.de> > + * Copyright 2020 Google LLC > * > * This work is licensed under the terms of the GNU GPL, version 2 or later. > * See the COPYING file in the top-level directory. > @@ -18,6 +19,7 @@ > #include "sysemu/hw_accel.h" > > #include <Hypervisor/Hypervisor.h> > +#include <mach/mach_time.h> > > #include "exec/address-spaces.h" > #include "hw/irq.h" > @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) > > void hvf_kick_vcpu_thread(CPUState *cpu) > { > - if (cpu->hvf->sleeping) { > - /* > - * When sleeping, make sure we always send signals. Also, clear the > - * timespec, so that an IPI that arrives between setting hvf->sleeping > - * and the nanosleep syscall still aborts the sleep. > - */ > - cpu->thread_kicked = false; > - cpu->hvf->ts = (struct timespec){ }; > - cpus_kick_thread(cpu); > - } else { > - hv_vcpus_exit(&cpu->hvf->fd, 1); > - } > + cpus_kick_thread(cpu); > + hv_vcpus_exit(&cpu->hvf->fd, 1); This means your first WFI will almost always return immediately due to a pending signal, because there probably was an IRQ pending before on the same CPU, no? 
> } > > static int hvf_inject_interrupts(CPUState *cpu) > @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu) > uint64_t syndrome = hvf_exit->exception.syndrome; > uint32_t ec = syn_get_ec(syndrome); > > + qemu_mutex_lock_iothread(); Is there a particular reason you're moving the iothread lock out again from the individual bits? I would really like to keep a notion of fast path exits. > switch (exit_reason) { > case HV_EXIT_REASON_EXCEPTION: > /* This is the main one, handle below. */ > break; > case HV_EXIT_REASON_VTIMER_ACTIVATED: > - qemu_mutex_lock_iothread(); > current_cpu = cpu; > qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1); > qemu_mutex_unlock_iothread(); > continue; > case HV_EXIT_REASON_CANCELED: > /* we got kicked, no exit to process */ > + qemu_mutex_unlock_iothread(); > continue; > default: > assert(0); > @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu) > uint32_t srt = (syndrome >> 16) & 0x1f; > uint64_t val = 0; > > - qemu_mutex_lock_iothread(); > current_cpu = cpu; > > DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x " > @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu) > hvf_set_reg(cpu, srt, val); > } > > - qemu_mutex_unlock_iothread(); > - > advance_pc = true; > break; > } > @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu) > case EC_WFX_TRAP: > if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & > (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > - uint64_t cval, ctl, val, diff, now; > + uint64_t cval; > > - /* Set up a local timer for vtimer if necessary ... 
*/ > - r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl); > - assert_hvf_ok(r); > r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval); > assert_hvf_ok(r); > > - asm volatile("mrs %0, cntvct_el0" : "=r"(val)); > - diff = cval - val; > - > - now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / > - gt_cntfrq_period_ns(arm_cpu); > - > - /* Timer disabled or masked, just wait for long */ > - if (!(ctl & 1) || (ctl & 2)) { > - diff = (120 * NANOSECONDS_PER_SECOND) / > - gt_cntfrq_period_ns(arm_cpu); > + int64_t ticks_to_sleep = cval - mach_absolute_time(); > + if (ticks_to_sleep < 0) { > + break; This will loop at 100% for Windows, which configures the vtimer as cval=0 ctl=7, so with IRQ mask bit set. Alex > } > > - if (diff < INT64_MAX) { > - uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); > - struct timespec *ts = &cpu->hvf->ts; > - > - *ts = (struct timespec){ > - .tv_sec = ns / NANOSECONDS_PER_SECOND, > - .tv_nsec = ns % NANOSECONDS_PER_SECOND, > - }; > - > - /* > - * Waking up easily takes 1ms, don't go to sleep for smaller > - * time periods than 2ms. > - */ > - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to return. Without logic like this, super short WFIs will hurt performance quite badly. Alex > - advance_pc = true; > - break; > - } > - > - /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */ > - cpu->hvf->sleeping = true; > - smp_mb(); > - > - /* Bail out if we received an IRQ meanwhile */ > - if (cpu->thread_kicked || (cpu->interrupt_request & > - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > - cpu->hvf->sleeping = false; > - break; > - } > - > - /* nanosleep returns on signal, so we wake up on kick. 
*/ > - nanosleep(ts, NULL); > - > - /* Out of sleep - either naturally or because of a kick */ > - cpu->hvf->sleeping = false; > - } > + uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz; > + uint64_t nanos = > + (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) * > + 1000000000 / arm_cpu->gt_cntfrq_hz; > + struct timespec ts = { seconds, nanos }; > + > + /* > + * Use pselect to sleep so that other threads can IPI us while > + * we're sleeping. > + */ > + qatomic_mb_set(&cpu->thread_kicked, false); > + qemu_mutex_unlock_iothread(); > + pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask); > + qemu_mutex_lock_iothread(); > > advance_pc = true; > } > break; > case EC_AA64_HVC: > cpu_synchronize_state(cpu); > - qemu_mutex_lock_iothread(); > current_cpu = cpu; > if (arm_is_psci_call(arm_cpu, EXCP_HVC)) { > arm_handle_psci_call(arm_cpu); > @@ -562,11 +520,9 @@ int hvf_vcpu_exec(CPUState *cpu) > DPRINTF("unknown HVC! %016llx", env->xregs[0]); > env->xregs[0] = -1; > } > - qemu_mutex_unlock_iothread(); > break; > case EC_AA64_SMC: > cpu_synchronize_state(cpu); > - qemu_mutex_lock_iothread(); > current_cpu = cpu; > if (arm_is_psci_call(arm_cpu, EXCP_SMC)) { > arm_handle_psci_call(arm_cpu); > @@ -575,7 +531,6 @@ int hvf_vcpu_exec(CPUState *cpu) > env->xregs[0] = -1; > env->pc += 4; > } > - qemu_mutex_unlock_iothread(); > break; > default: > cpu_synchronize_state(cpu); > @@ -594,6 +549,7 @@ int hvf_vcpu_exec(CPUState *cpu) > r = hv_vcpu_set_reg(cpu->hvf->fd, HV_REG_PC, pc); > assert_hvf_ok(r); > } > + qemu_mutex_unlock_iothread(); > } while (ret == 0); > > qemu_mutex_lock_iothread();
On 01.12.20 09:21, Peter Collingbourne wrote: > Sleep on WFx until the VTIMER is due but allow ourselves to be woken > up on IPI. > > Signed-off-by: Peter Collingbourne <pcc@google.com> > --- > Alexander Graf wrote: >> I would love to take a patch from you here :). I'll still be stuck for a >> while with the sysreg sync rework that Peter asked for before I can look >> at WFI again. > Okay, here's a patch :) It's a relatively straightforward adaptation > of what we have in our fork, which can now boot Android to GUI while > remaining at around 4% CPU when idle. > > I'm not set up to boot a full Linux distribution at the moment so I > tested it on upstream QEMU by running a recent mainline Linux kernel > with a rootfs containing an init program that just does sleep(5) > and verified that the qemu process remains at low CPU usage during > the sleep. This was on top of your v2 plus the last patch of your v1 > since it doesn't look like you have a replacement for that logic yet. How about something like this instead? 
Alex diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c index 4360f64671..50384013ea 100644 --- a/accel/hvf/hvf-cpus.c +++ b/accel/hvf/hvf-cpus.c @@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu) cpu->hvf = g_malloc0(sizeof(*cpu->hvf)); /* init cpu signals */ - sigset_t set; struct sigaction sigact; memset(&sigact, 0, sizeof(sigact)); sigact.sa_handler = dummy_signal; sigaction(SIG_IPI, &sigact, NULL); - pthread_sigmask(SIG_BLOCK, NULL, &set); - sigdelset(&set, SIG_IPI); - pthread_sigmask(SIG_SETMASK, &set, NULL); + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask); + sigdelset(&cpu->hvf->sigmask, SIG_IPI); + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL); + + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi); + sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI); #ifdef __aarch64__ r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h index c56baa3ae8..6e237f2db0 100644 --- a/include/sysemu/hvf_int.h +++ b/include/sysemu/hvf_int.h @@ -62,8 +62,9 @@ extern HVFState *hvf_state; struct hvf_vcpu_state { uint64_t fd; void *exit; - struct timespec ts; bool sleeping; + sigset_t sigmask; + sigset_t sigmask_ipi; }; void assert_hvf_ok(hv_return_t ret); diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c index 0c01a03725..350b845e6e 100644 --- a/target/arm/hvf/hvf.c +++ b/target/arm/hvf/hvf.c @@ -320,20 +320,24 @@ int hvf_arch_init_vcpu(CPUState *cpu) void hvf_kick_vcpu_thread(CPUState *cpu) { - if (cpu->hvf->sleeping) { - /* - * When sleeping, make sure we always send signals. Also, clear the - * timespec, so that an IPI that arrives between setting hvf->sleeping - * and the nanosleep syscall still aborts the sleep. 
- */ - cpu->thread_kicked = false; - cpu->hvf->ts = (struct timespec){ }; + if (qatomic_read(&cpu->hvf->sleeping)) { + /* When sleeping, send a signal to get out of pselect */ cpus_kick_thread(cpu); } else { hv_vcpus_exit(&cpu->hvf->fd, 1); } } +static void hvf_block_sig_ipi(CPUState *cpu) +{ + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL); +} + +static void hvf_unblock_sig_ipi(CPUState *cpu) +{ + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL); +} + static int hvf_inject_interrupts(CPUState *cpu) { if (cpu->interrupt_request & CPU_INTERRUPT_FIQ) { @@ -354,6 +358,7 @@ int hvf_vcpu_exec(CPUState *cpu) ARMCPU *arm_cpu = ARM_CPU(cpu); CPUARMState *env = &arm_cpu->env; hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit; + const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ; hv_return_t r; int ret = 0; @@ -491,8 +496,8 @@ int hvf_vcpu_exec(CPUState *cpu) break; } case EC_WFX_TRAP: - if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { + if (!(syndrome & WFX_IS_WFE) && + !(cpu->interrupt_request & irq_mask)) { uint64_t cval, ctl, val, diff, now; /* Set up a local timer for vtimer if necessary ... */ @@ -515,9 +520,7 @@ int hvf_vcpu_exec(CPUState *cpu) if (diff < INT64_MAX) { uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); - struct timespec *ts = &cpu->hvf->ts; - - *ts = (struct timespec){ + struct timespec ts = { .tv_sec = ns / NANOSECONDS_PER_SECOND, .tv_nsec = ns % NANOSECONDS_PER_SECOND, }; @@ -526,27 +529,31 @@ int hvf_vcpu_exec(CPUState *cpu) * Waking up easily takes 1ms, don't go to sleep for smaller * time periods than 2ms. */ - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { + if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) { advance_pc = true; break; } + /* block SIG_IPI for the sleep */ + hvf_block_sig_ipi(cpu); + cpu->thread_kicked = false; + /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. 
*/ - cpu->hvf->sleeping = true; - smp_mb(); + qatomic_set(&cpu->hvf->sleeping, true); - /* Bail out if we received an IRQ meanwhile */ - if (cpu->thread_kicked || (cpu->interrupt_request & - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { - cpu->hvf->sleeping = false; + /* Bail out if we received a kick meanwhile */ + if (qatomic_read(&cpu->interrupt_request) & irq_mask) { + qatomic_set(&cpu->hvf->sleeping, false); + hvf_unblock_sig_ipi(cpu); break; } - /* nanosleep returns on signal, so we wake up on kick. */ - nanosleep(ts, NULL); + /* pselect returns on kick signal and consumes it */ + pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask); /* Out of sleep - either naturally or because of a kick */ - cpu->hvf->sleeping = false; + qatomic_set(&cpu->hvf->sleeping, false); + hvf_unblock_sig_ipi(cpu); } advance_pc = true;
On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote: > > Hi Peter, > > On 01.12.20 09:21, Peter Collingbourne wrote: > > Sleep on WFx until the VTIMER is due but allow ourselves to be woken > > up on IPI. > > > > Signed-off-by: Peter Collingbourne <pcc@google.com> > > > Thanks a bunch! > > > > --- > > Alexander Graf wrote: > >> I would love to take a patch from you here :). I'll still be stuck for a > >> while with the sysreg sync rework that Peter asked for before I can look > >> at WFI again. > > Okay, here's a patch :) It's a relatively straightforward adaptation > > of what we have in our fork, which can now boot Android to GUI while > > remaining at around 4% CPU when idle. > > > > I'm not set up to boot a full Linux distribution at the moment so I > > tested it on upstream QEMU by running a recent mainline Linux kernel > > with a rootfs containing an init program that just does sleep(5) > > and verified that the qemu process remains at low CPU usage during > > the sleep. This was on top of your v2 plus the last patch of your v1 > > since it doesn't look like you have a replacement for that logic yet. > > > > accel/hvf/hvf-cpus.c | 5 +-- > > include/sysemu/hvf_int.h | 3 +- > > target/arm/hvf/hvf.c | 94 +++++++++++----------------------------- > > 3 files changed, 28 insertions(+), 74 deletions(-) > > > > diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c > > index 4360f64671..b2c8fb57f6 100644 > > --- a/accel/hvf/hvf-cpus.c > > +++ b/accel/hvf/hvf-cpus.c > > @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu) > > sigact.sa_handler = dummy_signal; > > sigaction(SIG_IPI, &sigact, NULL); > > > > - pthread_sigmask(SIG_BLOCK, NULL, &set); > > - sigdelset(&set, SIG_IPI); > > - pthread_sigmask(SIG_SETMASK, &set, NULL); > > + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask); > > + sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI); > > > What will this do to the x86 hvf implementation? 
We're now not > unblocking SIG_IPI again for that, right? Yes and that was the case before your patch series. > > > > #ifdef __aarch64__ > > r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); > > diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h > > index c56baa3ae8..13adf6ea77 100644 > > --- a/include/sysemu/hvf_int.h > > +++ b/include/sysemu/hvf_int.h > > @@ -62,8 +62,7 @@ extern HVFState *hvf_state; > > struct hvf_vcpu_state { > > uint64_t fd; > > void *exit; > > - struct timespec ts; > > - bool sleeping; > > + sigset_t unblock_ipi_mask; > > }; > > > > void assert_hvf_ok(hv_return_t ret); > > diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c > > index 8fe10966d2..60a361ff38 100644 > > --- a/target/arm/hvf/hvf.c > > +++ b/target/arm/hvf/hvf.c > > @@ -2,6 +2,7 @@ > > * QEMU Hypervisor.framework support for Apple Silicon > > > > * Copyright 2020 Alexander Graf <agraf@csgraf.de> > > + * Copyright 2020 Google LLC > > * > > * This work is licensed under the terms of the GNU GPL, version 2 or later. > > * See the COPYING file in the top-level directory. > > @@ -18,6 +19,7 @@ > > #include "sysemu/hw_accel.h" > > > > #include <Hypervisor/Hypervisor.h> > > +#include <mach/mach_time.h> > > > > #include "exec/address-spaces.h" > > #include "hw/irq.h" > > @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) > > > > void hvf_kick_vcpu_thread(CPUState *cpu) > > { > > - if (cpu->hvf->sleeping) { > > - /* > > - * When sleeping, make sure we always send signals. Also, clear the > > - * timespec, so that an IPI that arrives between setting hvf->sleeping > > - * and the nanosleep syscall still aborts the sleep. 
> > - */ > > - cpu->thread_kicked = false; > > - cpu->hvf->ts = (struct timespec){ }; > > - cpus_kick_thread(cpu); > > - } else { > > - hv_vcpus_exit(&cpu->hvf->fd, 1); > > - } > > + cpus_kick_thread(cpu); > > + hv_vcpus_exit(&cpu->hvf->fd, 1); > > > This means your first WFI will almost always return immediately due to a > pending signal, because there probably was an IRQ pending before on the > same CPU, no? That's right. Any approach involving the "sleeping" field would need to be implemented carefully to avoid races that may result in missed wakeups so for simplicity I just decided to send both kinds of wakeups. In particular the approach in the updated patch you sent is racy and I'll elaborate more in the reply to that patch. > > } > > > > static int hvf_inject_interrupts(CPUState *cpu) > > @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu) > > uint64_t syndrome = hvf_exit->exception.syndrome; > > uint32_t ec = syn_get_ec(syndrome); > > > > + qemu_mutex_lock_iothread(); > > > Is there a particular reason you're moving the iothread lock out again > from the individual bits? I would really like to keep a notion of fast > path exits. We still need to lock at least once no matter the exit reason to check the interrupts so I don't think it's worth it to try and avoid locking like this. It also makes the implementation easier to reason about and therefore more likely to be correct. In our implementation we just stay locked the whole time unless we're in hv_vcpu_run() or pselect(). > > switch (exit_reason) { > > case HV_EXIT_REASON_EXCEPTION: > > /* This is the main one, handle below. 
*/ > > break; > > case HV_EXIT_REASON_VTIMER_ACTIVATED: > > - qemu_mutex_lock_iothread(); > > current_cpu = cpu; > > qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1); > > qemu_mutex_unlock_iothread(); > > continue; > > case HV_EXIT_REASON_CANCELED: > > /* we got kicked, no exit to process */ > > + qemu_mutex_unlock_iothread(); > > continue; > > default: > > assert(0); > > @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu) > > uint32_t srt = (syndrome >> 16) & 0x1f; > > uint64_t val = 0; > > > > - qemu_mutex_lock_iothread(); > > current_cpu = cpu; > > > > DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x " > > @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu) > > hvf_set_reg(cpu, srt, val); > > } > > > > - qemu_mutex_unlock_iothread(); > > - > > advance_pc = true; > > break; > > } > > @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu) > > case EC_WFX_TRAP: > > if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & > > (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > > - uint64_t cval, ctl, val, diff, now; > > + uint64_t cval; > > > > - /* Set up a local timer for vtimer if necessary ... */ > > - r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl); > > - assert_hvf_ok(r); > > r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval); > > assert_hvf_ok(r); > > > > - asm volatile("mrs %0, cntvct_el0" : "=r"(val)); > > - diff = cval - val; > > - > > - now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / > > - gt_cntfrq_period_ns(arm_cpu); > > - > > - /* Timer disabled or masked, just wait for long */ > > - if (!(ctl & 1) || (ctl & 2)) { > > - diff = (120 * NANOSECONDS_PER_SECOND) / > > - gt_cntfrq_period_ns(arm_cpu); > > + int64_t ticks_to_sleep = cval - mach_absolute_time(); > > + if (ticks_to_sleep < 0) { > > + break; > > > This will loop at 100% for Windows, which configures the vtimer as > cval=0 ctl=7, so with IRQ mask bit set. 
Okay, but the 120s is kind of arbitrary so we should just sleep until we get a signal. That can be done by passing null as the timespec argument to pselect(). > > > Alex > > > > } > > > > - if (diff < INT64_MAX) { > > - uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); > > - struct timespec *ts = &cpu->hvf->ts; > > - > > - *ts = (struct timespec){ > > - .tv_sec = ns / NANOSECONDS_PER_SECOND, > > - .tv_nsec = ns % NANOSECONDS_PER_SECOND, > > - }; > > - > > - /* > > - * Waking up easily takes 1ms, don't go to sleep for smaller > > - * time periods than 2ms. > > - */ > > - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { > > > I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to > return. Without logic like this, super short WFIs will hurt performance > quite badly. I don't think that's accurate. According to this benchmark it's a few hundred nanoseconds at most. pcc@pac-mini /tmp> cat pselect.c #include <signal.h> #include <sys/select.h> int main() { sigset_t mask, orig_mask; pthread_sigmask(SIG_SETMASK, 0, &mask); sigaddset(&mask, SIGUSR1); pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); for (int i = 0; i != 1000000; ++i) { struct timespec ts = { 0, 1 }; pselect(0, 0, 0, 0, &ts, &orig_mask); } } pcc@pac-mini /tmp> time ./pselect
On Tue, Dec 1, 2020 at 8:26 AM Alexander Graf <agraf@csgraf.de> wrote: > > > On 01.12.20 09:21, Peter Collingbourne wrote: > > Sleep on WFx until the VTIMER is due but allow ourselves to be woken > > up on IPI. > > > > Signed-off-by: Peter Collingbourne <pcc@google.com> > > --- > > Alexander Graf wrote: > >> I would love to take a patch from you here :). I'll still be stuck for a > >> while with the sysreg sync rework that Peter asked for before I can look > >> at WFI again. > > Okay, here's a patch :) It's a relatively straightforward adaptation > > of what we have in our fork, which can now boot Android to GUI while > > remaining at around 4% CPU when idle. > > > > I'm not set up to boot a full Linux distribution at the moment so I > > tested it on upstream QEMU by running a recent mainline Linux kernel > > with a rootfs containing an init program that just does sleep(5) > > and verified that the qemu process remains at low CPU usage during > > the sleep. This was on top of your v2 plus the last patch of your v1 > > since it doesn't look like you have a replacement for that logic yet. > > > How about something like this instead? 
> > > Alex > > > diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c > index 4360f64671..50384013ea 100644 > --- a/accel/hvf/hvf-cpus.c > +++ b/accel/hvf/hvf-cpus.c > @@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu) > cpu->hvf = g_malloc0(sizeof(*cpu->hvf)); > > /* init cpu signals */ > - sigset_t set; > struct sigaction sigact; > > memset(&sigact, 0, sizeof(sigact)); > sigact.sa_handler = dummy_signal; > sigaction(SIG_IPI, &sigact, NULL); > > - pthread_sigmask(SIG_BLOCK, NULL, &set); > - sigdelset(&set, SIG_IPI); > - pthread_sigmask(SIG_SETMASK, &set, NULL); > + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask); > + sigdelset(&cpu->hvf->sigmask, SIG_IPI); > + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL); > + > + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi); > + sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI); There's no reason to unblock SIG_IPI while not in pselect and it can easily lead to missed wakeups. The whole point of pselect is so that you can guarantee that only one part of your program sees signals without a possibility of them being missed. > > #ifdef __aarch64__ > r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t > **)&cpu->hvf->exit, NULL); > diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h > index c56baa3ae8..6e237f2db0 100644 > --- a/include/sysemu/hvf_int.h > +++ b/include/sysemu/hvf_int.h > @@ -62,8 +62,9 @@ extern HVFState *hvf_state; > struct hvf_vcpu_state { > uint64_t fd; > void *exit; > - struct timespec ts; > bool sleeping; > + sigset_t sigmask; > + sigset_t sigmask_ipi; > }; > > void assert_hvf_ok(hv_return_t ret); > diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c > index 0c01a03725..350b845e6e 100644 > --- a/target/arm/hvf/hvf.c > +++ b/target/arm/hvf/hvf.c > @@ -320,20 +320,24 @@ int hvf_arch_init_vcpu(CPUState *cpu) > > void hvf_kick_vcpu_thread(CPUState *cpu) > { > - if (cpu->hvf->sleeping) { > - /* > - * When sleeping, make sure we always send signals. 
Also, clear the > - * timespec, so that an IPI that arrives between setting > hvf->sleeping > - * and the nanosleep syscall still aborts the sleep. > - */ > - cpu->thread_kicked = false; > - cpu->hvf->ts = (struct timespec){ }; > + if (qatomic_read(&cpu->hvf->sleeping)) { > + /* When sleeping, send a signal to get out of pselect */ > cpus_kick_thread(cpu); > } else { > hv_vcpus_exit(&cpu->hvf->fd, 1); > } > } > > +static void hvf_block_sig_ipi(CPUState *cpu) > +{ > + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL); > +} > + > +static void hvf_unblock_sig_ipi(CPUState *cpu) > +{ > + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL); > +} > + > static int hvf_inject_interrupts(CPUState *cpu) > { > if (cpu->interrupt_request & CPU_INTERRUPT_FIQ) { > @@ -354,6 +358,7 @@ int hvf_vcpu_exec(CPUState *cpu) > ARMCPU *arm_cpu = ARM_CPU(cpu); > CPUARMState *env = &arm_cpu->env; > hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit; > + const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ; > hv_return_t r; > int ret = 0; > > @@ -491,8 +496,8 @@ int hvf_vcpu_exec(CPUState *cpu) > break; > } > case EC_WFX_TRAP: > - if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & > - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > + if (!(syndrome & WFX_IS_WFE) && > + !(cpu->interrupt_request & irq_mask)) { > uint64_t cval, ctl, val, diff, now; I don't think the access to cpu->interrupt_request is safe because it is done while not under the iothread lock. That's why to avoid these types of issues I would prefer to hold the lock almost all of the time. > /* Set up a local timer for vtimer if necessary ... 
*/ > @@ -515,9 +520,7 @@ int hvf_vcpu_exec(CPUState *cpu) > > if (diff < INT64_MAX) { > uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); > - struct timespec *ts = &cpu->hvf->ts; > - > - *ts = (struct timespec){ > + struct timespec ts = { > .tv_sec = ns / NANOSECONDS_PER_SECOND, > .tv_nsec = ns % NANOSECONDS_PER_SECOND, > }; > @@ -526,27 +529,31 @@ int hvf_vcpu_exec(CPUState *cpu) > * Waking up easily takes 1ms, don't go to sleep > for smaller > * time periods than 2ms. > */ > - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { > + if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) { > advance_pc = true; > break; > } > > + /* block SIG_IPI for the sleep */ > + hvf_block_sig_ipi(cpu); > + cpu->thread_kicked = false; > + > /* Set cpu->hvf->sleeping so that we get a SIG_IPI > signal. */ > - cpu->hvf->sleeping = true; > - smp_mb(); > + qatomic_set(&cpu->hvf->sleeping, true); This doesn't protect against races because another thread could call hvf_kick_vcpu_thread() at any time between when we return from hv_vcpu_run() and when we set sleeping = true and we would miss the wakeup (due to hvf_kick_vcpu_thread() seeing sleeping = false and calling hv_vcpus_exit() instead of pthread_kill()). I don't think it can be fixed by setting sleeping to true earlier either because no matter how early you move it, there will always be a window where we are going to pselect() but sleeping is false, resulting in a missed wakeup. Peter > > - /* Bail out if we received an IRQ meanwhile */ > - if (cpu->thread_kicked || (cpu->interrupt_request & > - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > - cpu->hvf->sleeping = false; > + /* Bail out if we received a kick meanwhile */ > + if (qatomic_read(&cpu->interrupt_request) & irq_mask) { > + qatomic_set(&cpu->hvf->sleeping, false); > + hvf_unblock_sig_ipi(cpu); > break; > } > > - /* nanosleep returns on signal, so we wake up on > kick.
*/ > - nanosleep(ts, NULL); > + /* pselect returns on kick signal and consumes it */ > + pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask); > > /* Out of sleep - either naturally or because of a > kick */ > - cpu->hvf->sleeping = false; > + qatomic_set(&cpu->hvf->sleeping, false); > + hvf_unblock_sig_ipi(cpu); > } > > advance_pc = true; >
On 01.12.20 19:59, Peter Collingbourne wrote: > On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote: >> Hi Peter, >> >> On 01.12.20 09:21, Peter Collingbourne wrote: >>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken >>> up on IPI. >>> >>> Signed-off-by: Peter Collingbourne <pcc@google.com> >> >> Thanks a bunch! >> >> >>> --- >>> Alexander Graf wrote: >>>> I would love to take a patch from you here :). I'll still be stuck for a >>>> while with the sysreg sync rework that Peter asked for before I can look >>>> at WFI again. >>> Okay, here's a patch :) It's a relatively straightforward adaptation >>> of what we have in our fork, which can now boot Android to GUI while >>> remaining at around 4% CPU when idle. >>> >>> I'm not set up to boot a full Linux distribution at the moment so I >>> tested it on upstream QEMU by running a recent mainline Linux kernel >>> with a rootfs containing an init program that just does sleep(5) >>> and verified that the qemu process remains at low CPU usage during >>> the sleep. This was on top of your v2 plus the last patch of your v1 >>> since it doesn't look like you have a replacement for that logic yet. 
>>> >>> accel/hvf/hvf-cpus.c | 5 +-- >>> include/sysemu/hvf_int.h | 3 +- >>> target/arm/hvf/hvf.c | 94 +++++++++++----------------------------- >>> 3 files changed, 28 insertions(+), 74 deletions(-) >>> >>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c >>> index 4360f64671..b2c8fb57f6 100644 >>> --- a/accel/hvf/hvf-cpus.c >>> +++ b/accel/hvf/hvf-cpus.c >>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu) >>> sigact.sa_handler = dummy_signal; >>> sigaction(SIG_IPI, &sigact, NULL); >>> >>> - pthread_sigmask(SIG_BLOCK, NULL, &set); >>> - sigdelset(&set, SIG_IPI); >>> - pthread_sigmask(SIG_SETMASK, &set, NULL); >>> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask); >>> + sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI); >> >> What will this do to the x86 hvf implementation? We're now not >> unblocking SIG_IPI again for that, right? > Yes and that was the case before your patch series. The way I understand Roman, he wanted to unblock the IPI signal on x86: https://patchwork.kernel.org/project/qemu-devel/patch/20201126215017.41156-3-agraf@csgraf.de/#23807021 I agree that at this point it's not a problem though to break it again. I'm not quite sure how to merge your patches within my patch set though, given they basically revert half of my previously introduced code... 
> >>> #ifdef __aarch64__ >>> r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); >>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h >>> index c56baa3ae8..13adf6ea77 100644 >>> --- a/include/sysemu/hvf_int.h >>> +++ b/include/sysemu/hvf_int.h >>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state; >>> struct hvf_vcpu_state { >>> uint64_t fd; >>> void *exit; >>> - struct timespec ts; >>> - bool sleeping; >>> + sigset_t unblock_ipi_mask; >>> }; >>> >>> void assert_hvf_ok(hv_return_t ret); >>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c >>> index 8fe10966d2..60a361ff38 100644 >>> --- a/target/arm/hvf/hvf.c >>> +++ b/target/arm/hvf/hvf.c >>> @@ -2,6 +2,7 @@ >>> * QEMU Hypervisor.framework support for Apple Silicon >>> >>> * Copyright 2020 Alexander Graf <agraf@csgraf.de> >>> + * Copyright 2020 Google LLC >>> * >>> * This work is licensed under the terms of the GNU GPL, version 2 or later. >>> * See the COPYING file in the top-level directory. >>> @@ -18,6 +19,7 @@ >>> #include "sysemu/hw_accel.h" >>> >>> #include <Hypervisor/Hypervisor.h> >>> +#include <mach/mach_time.h> >>> >>> #include "exec/address-spaces.h" >>> #include "hw/irq.h" >>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) >>> >>> void hvf_kick_vcpu_thread(CPUState *cpu) >>> { >>> - if (cpu->hvf->sleeping) { >>> - /* >>> - * When sleeping, make sure we always send signals. Also, clear the >>> - * timespec, so that an IPI that arrives between setting hvf->sleeping >>> - * and the nanosleep syscall still aborts the sleep. >>> - */ >>> - cpu->thread_kicked = false; >>> - cpu->hvf->ts = (struct timespec){ }; >>> - cpus_kick_thread(cpu); >>> - } else { >>> - hv_vcpus_exit(&cpu->hvf->fd, 1); >>> - } >>> + cpus_kick_thread(cpu); >>> + hv_vcpus_exit(&cpu->hvf->fd, 1); >> >> This means your first WFI will almost always return immediately due to a >> pending signal, because there probably was an IRQ pending before on the >> same CPU, no? > That's right. 
Any approach involving the "sleeping" field would need > to be implemented carefully to avoid races that may result in missed > wakeups so for simplicity I just decided to send both kinds of > wakeups. In particular the approach in the updated patch you sent is > racy and I'll elaborate more in the reply to that patch. > >>> } >>> >>> static int hvf_inject_interrupts(CPUState *cpu) >>> @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu) >>> uint64_t syndrome = hvf_exit->exception.syndrome; >>> uint32_t ec = syn_get_ec(syndrome); >>> >>> + qemu_mutex_lock_iothread(); >> >> Is there a particular reason you're moving the iothread lock out again >> from the individual bits? I would really like to keep a notion of fast >> path exits. > We still need to lock at least once no matter the exit reason to check > the interrupts so I don't think it's worth it to try and avoid locking > like this. It also makes the implementation easier to reason about and > therefore more likely to be correct. In our implementation we just > stay locked the whole time unless we're in hv_vcpu_run() or pselect(). > >>> switch (exit_reason) { >>> case HV_EXIT_REASON_EXCEPTION: >>> /* This is the main one, handle below. 
*/ >>> break; >>> case HV_EXIT_REASON_VTIMER_ACTIVATED: >>> - qemu_mutex_lock_iothread(); >>> current_cpu = cpu; >>> qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1); >>> qemu_mutex_unlock_iothread(); >>> continue; >>> case HV_EXIT_REASON_CANCELED: >>> /* we got kicked, no exit to process */ >>> + qemu_mutex_unlock_iothread(); >>> continue; >>> default: >>> assert(0); >>> @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu) >>> uint32_t srt = (syndrome >> 16) & 0x1f; >>> uint64_t val = 0; >>> >>> - qemu_mutex_lock_iothread(); >>> current_cpu = cpu; >>> >>> DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x " >>> @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu) >>> hvf_set_reg(cpu, srt, val); >>> } >>> >>> - qemu_mutex_unlock_iothread(); >>> - >>> advance_pc = true; >>> break; >>> } >>> @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu) >>> case EC_WFX_TRAP: >>> if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & >>> (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { >>> - uint64_t cval, ctl, val, diff, now; >>> + uint64_t cval; >>> >>> - /* Set up a local timer for vtimer if necessary ... */ >>> - r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl); >>> - assert_hvf_ok(r); >>> r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval); >>> assert_hvf_ok(r); >>> >>> - asm volatile("mrs %0, cntvct_el0" : "=r"(val)); >>> - diff = cval - val; >>> - >>> - now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / >>> - gt_cntfrq_period_ns(arm_cpu); >>> - >>> - /* Timer disabled or masked, just wait for long */ >>> - if (!(ctl & 1) || (ctl & 2)) { >>> - diff = (120 * NANOSECONDS_PER_SECOND) / >>> - gt_cntfrq_period_ns(arm_cpu); >>> + int64_t ticks_to_sleep = cval - mach_absolute_time(); >>> + if (ticks_to_sleep < 0) { >>> + break; >> >> This will loop at 100% for Windows, which configures the vtimer as >> cval=0 ctl=7, so with IRQ mask bit set. 
> Okay, but the 120s is kind of arbitrary so we should just sleep until > we get a signal. That can be done by passing null as the timespec > argument to pselect(). The reason I capped it at 120s was so that if I do hit a race, you don't break everything forever. Only for 2 minutes :). > >> >> Alex >> >> >>> } >>> >>> - if (diff < INT64_MAX) { >>> - uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); >>> - struct timespec *ts = &cpu->hvf->ts; >>> - >>> - *ts = (struct timespec){ >>> - .tv_sec = ns / NANOSECONDS_PER_SECOND, >>> - .tv_nsec = ns % NANOSECONDS_PER_SECOND, >>> - }; >>> - >>> - /* >>> - * Waking up easily takes 1ms, don't go to sleep for smaller >>> - * time periods than 2ms. >>> - */ >>> - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { >> >> I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to >> return. Without logic like this, super short WFIs will hurt performance >> quite badly. > I don't think that's accurate. According to this benchmark it's a few > hundred nanoseconds at most. > > pcc@pac-mini /tmp> cat pselect.c > #include <signal.h> > #include <sys/select.h> > > int main() { > sigset_t mask, orig_mask; > pthread_sigmask(SIG_SETMASK, 0, &mask); > sigaddset(&mask, SIGUSR1); > pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); > > for (int i = 0; i != 1000000; ++i) { > struct timespec ts = { 0, 1 }; > pselect(0, 0, 0, 0, &ts, &orig_mask); > } > } > pcc@pac-mini /tmp> time ./pselect > > ________________________________________________________ > Executed in 179.87 millis fish external > usr time 77.68 millis 57.00 micros 77.62 millis > sys time 101.37 millis 852.00 micros 100.52 millis > > Besides, all that you're really saving here is the single pselect > call. There are no doubt more expensive syscalls involved in exiting > and entering the VCPU that would dominate here. I would expect that such a super low ts value has a short-circuit path in the kernel as well. 
Where things start to fall apart is when you're at a threshold where rescheduling might be ok, but then you need to take all of the additional task switch overhead into account. Try to adapt your test code a bit: #include <signal.h> #include <sys/select.h> int main() { sigset_t mask, orig_mask; pthread_sigmask(SIG_SETMASK, 0, &mask); sigaddset(&mask, SIGUSR1); pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); for (int i = 0; i != 10000; ++i) { #define SCALE_MS 1000000 struct timespec ts = { 0, SCALE_MS / 10 }; pselect(0, 0, 0, 0, &ts, &orig_mask); } } % time ./pselect ./pselect 0.00s user 0.01s system 1% cpu 1.282 total You're suddenly seeing 300µs overhead per pselect call then. When I measured actual enter/exit times in QEMU, I saw much bigger differences between "time I want to sleep for" and "time I did sleep" even when just capturing the virtual time before and after the nanosleep/pselect call. Alex
On 01.12.20 21:03, Peter Collingbourne wrote: > On Tue, Dec 1, 2020 at 8:26 AM Alexander Graf <agraf@csgraf.de> wrote: >> >> On 01.12.20 09:21, Peter Collingbourne wrote: >>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken >>> up on IPI. >>> >>> Signed-off-by: Peter Collingbourne <pcc@google.com> >>> --- >>> Alexander Graf wrote: >>>> I would love to take a patch from you here :). I'll still be stuck for a >>>> while with the sysreg sync rework that Peter asked for before I can look >>>> at WFI again. >>> Okay, here's a patch :) It's a relatively straightforward adaptation >>> of what we have in our fork, which can now boot Android to GUI while >>> remaining at around 4% CPU when idle. >>> >>> I'm not set up to boot a full Linux distribution at the moment so I >>> tested it on upstream QEMU by running a recent mainline Linux kernel >>> with a rootfs containing an init program that just does sleep(5) >>> and verified that the qemu process remains at low CPU usage during >>> the sleep. This was on top of your v2 plus the last patch of your v1 >>> since it doesn't look like you have a replacement for that logic yet. >> >> How about something like this instead? 
>> >> >> Alex >> >> >> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c >> index 4360f64671..50384013ea 100644 >> --- a/accel/hvf/hvf-cpus.c >> +++ b/accel/hvf/hvf-cpus.c >> @@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu) >> cpu->hvf = g_malloc0(sizeof(*cpu->hvf)); >> >> /* init cpu signals */ >> - sigset_t set; >> struct sigaction sigact; >> >> memset(&sigact, 0, sizeof(sigact)); >> sigact.sa_handler = dummy_signal; >> sigaction(SIG_IPI, &sigact, NULL); >> >> - pthread_sigmask(SIG_BLOCK, NULL, &set); >> - sigdelset(&set, SIG_IPI); >> - pthread_sigmask(SIG_SETMASK, &set, NULL); >> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask); >> + sigdelset(&cpu->hvf->sigmask, SIG_IPI); >> + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL); >> + >> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi); >> + sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI); > There's no reason to unblock SIG_IPI while not in pselect and it can > easily lead to missed wakeups. The whole point of pselect is so that > you can guarantee that only one part of your program sees signals > without a possibility of them being missed. Hm, I think I'm starting to agree with you here :). We can probably just leave SIG_IPI masked at all times and only unmask it in pselect(). The worst thing that can happen is a premature wakeup if an IPI came in while hvf->sleeping was set, but we either had not entered pselect() yet and bailed out, or had already finished pselect() execution.
> >> #ifdef __aarch64__ >> r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t >> **)&cpu->hvf->exit, NULL); >> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h >> index c56baa3ae8..6e237f2db0 100644 >> --- a/include/sysemu/hvf_int.h >> +++ b/include/sysemu/hvf_int.h >> @@ -62,8 +62,9 @@ extern HVFState *hvf_state; >> struct hvf_vcpu_state { >> uint64_t fd; >> void *exit; >> - struct timespec ts; >> bool sleeping; >> + sigset_t sigmask; >> + sigset_t sigmask_ipi; >> }; >> >> void assert_hvf_ok(hv_return_t ret); >> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c >> index 0c01a03725..350b845e6e 100644 >> --- a/target/arm/hvf/hvf.c >> +++ b/target/arm/hvf/hvf.c >> @@ -320,20 +320,24 @@ int hvf_arch_init_vcpu(CPUState *cpu) >> >> void hvf_kick_vcpu_thread(CPUState *cpu) >> { >> - if (cpu->hvf->sleeping) { >> - /* >> - * When sleeping, make sure we always send signals. Also, clear the >> - * timespec, so that an IPI that arrives between setting >> hvf->sleeping >> - * and the nanosleep syscall still aborts the sleep. 
>> - */ >> - cpu->thread_kicked = false; >> - cpu->hvf->ts = (struct timespec){ }; >> + if (qatomic_read(&cpu->hvf->sleeping)) { >> + /* When sleeping, send a signal to get out of pselect */ >> cpus_kick_thread(cpu); >> } else { >> hv_vcpus_exit(&cpu->hvf->fd, 1); >> } >> } >> >> +static void hvf_block_sig_ipi(CPUState *cpu) >> +{ >> + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL); >> +} >> + >> +static void hvf_unblock_sig_ipi(CPUState *cpu) >> +{ >> + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL); >> +} >> + >> static int hvf_inject_interrupts(CPUState *cpu) >> { >> if (cpu->interrupt_request & CPU_INTERRUPT_FIQ) { >> @@ -354,6 +358,7 @@ int hvf_vcpu_exec(CPUState *cpu) >> ARMCPU *arm_cpu = ARM_CPU(cpu); >> CPUARMState *env = &arm_cpu->env; >> hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit; >> + const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ; >> hv_return_t r; >> int ret = 0; >> >> @@ -491,8 +496,8 @@ int hvf_vcpu_exec(CPUState *cpu) >> break; >> } >> case EC_WFX_TRAP: >> - if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & >> - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { >> + if (!(syndrome & WFX_IS_WFE) && >> + !(cpu->interrupt_request & irq_mask)) { >> uint64_t cval, ctl, val, diff, now; > I don't think the access to cpu->interrupt_request is safe because it > is done while not under the iothread lock. That's why to avoid these > types of issues I would prefer to hold the lock almost all of the > time. In this branch, that's not a problem yet. On stale values, we either don't sleep (which is ok), or we go into the sleep path, and reevaluate cpu->interrupt_request atomically again after setting hvf->sleeping. > >> /* Set up a local timer for vtimer if necessary ... 
*/ >> @@ -515,9 +520,7 @@ int hvf_vcpu_exec(CPUState *cpu) >> >> if (diff < INT64_MAX) { >> uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); >> - struct timespec *ts = &cpu->hvf->ts; >> - >> - *ts = (struct timespec){ >> + struct timespec ts = { >> .tv_sec = ns / NANOSECONDS_PER_SECOND, >> .tv_nsec = ns % NANOSECONDS_PER_SECOND, >> }; >> @@ -526,27 +529,31 @@ int hvf_vcpu_exec(CPUState *cpu) >> * Waking up easily takes 1ms, don't go to sleep >> for smaller >> * time periods than 2ms. >> */ >> - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { >> + if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) { >> advance_pc = true; >> break; >> } >> >> + /* block SIG_IPI for the sleep */ >> + hvf_block_sig_ipi(cpu); >> + cpu->thread_kicked = false; >> + >> /* Set cpu->hvf->sleeping so that we get a SIG_IPI >> signal. */ >> - cpu->hvf->sleeping = true; >> - smp_mb(); >> + qatomic_set(&cpu->hvf->sleeping, true); > This doesn't protect against races because another thread could call > hvf_kick_vcpu_thread() at any time between when we return from > hv_vcpu_run() and when we set sleeping = true and we would miss the > wakeup (due to hvf_kick_vcpu_thread() seeing sleeping = false and > calling hv_vcpus_exit() instead of pthread_kill()). I don't think it > can be fixed by setting sleeping to true earlier either because no > matter how early you move it, there will always be a window where we > are going to pselect() but sleeping is false, resulting in a missed > wakeup. I don't follow. If anyone was sending us an IPI, it's because they want to notify us about an update to cpu->interrupt_request, right? In that case, the atomic read of that field below will catch it and bail out of the sleep sequence.
> > Peter > >> - /* Bail out if we received an IRQ meanwhile */ >> - if (cpu->thread_kicked || (cpu->interrupt_request & >> - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { >> - cpu->hvf->sleeping = false; >> + /* Bail out if we received a kick meanwhile */ >> + if (qatomic_read(&cpu->interrupt_request) & irq_mask) { >> + qatomic_set(&cpu->hvf->sleeping, false); ^^^ Alex >> + hvf_unblock_sig_ipi(cpu); >> break; >> } >> >> - /* nanosleep returns on signal, so we wake up on >> kick. */ >> - nanosleep(ts, NULL); >> + /* pselect returns on kick signal and consumes it */ >> + pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask); >> >> /* Out of sleep - either naturally or because of a >> kick */ >> - cpu->hvf->sleeping = false; >> + qatomic_set(&cpu->hvf->sleeping, false); >> + hvf_unblock_sig_ipi(cpu); >> } >> >> advance_pc = true; >>
On 01.12.20 23:09, Alexander Graf wrote: > > On 01.12.20 21:03, Peter Collingbourne wrote: >> On Tue, Dec 1, 2020 at 8:26 AM Alexander Graf <agraf@csgraf.de> wrote: >>> >>> On 01.12.20 09:21, Peter Collingbourne wrote: >>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken >>>> up on IPI. >>>> >>>> Signed-off-by: Peter Collingbourne <pcc@google.com> >>>> --- >>>> Alexander Graf wrote: >>>>> I would love to take a patch from you here :). I'll still be stuck >>>>> for a >>>>> while with the sysreg sync rework that Peter asked for before I >>>>> can look >>>>> at WFI again. >>>> Okay, here's a patch :) It's a relatively straightforward adaptation >>>> of what we have in our fork, which can now boot Android to GUI while >>>> remaining at around 4% CPU when idle. >>>> >>>> I'm not set up to boot a full Linux distribution at the moment so I >>>> tested it on upstream QEMU by running a recent mainline Linux kernel >>>> with a rootfs containing an init program that just does sleep(5) >>>> and verified that the qemu process remains at low CPU usage during >>>> the sleep. This was on top of your v2 plus the last patch of your v1 >>>> since it doesn't look like you have a replacement for that logic yet. >>> >>> How about something like this instead? 
>>> >>> >>> Alex >>> >>> >>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c >>> index 4360f64671..50384013ea 100644 >>> --- a/accel/hvf/hvf-cpus.c >>> +++ b/accel/hvf/hvf-cpus.c >>> @@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu) >>> cpu->hvf = g_malloc0(sizeof(*cpu->hvf)); >>> >>> /* init cpu signals */ >>> - sigset_t set; >>> struct sigaction sigact; >>> >>> memset(&sigact, 0, sizeof(sigact)); >>> sigact.sa_handler = dummy_signal; >>> sigaction(SIG_IPI, &sigact, NULL); >>> >>> - pthread_sigmask(SIG_BLOCK, NULL, &set); >>> - sigdelset(&set, SIG_IPI); >>> - pthread_sigmask(SIG_SETMASK, &set, NULL); >>> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask); >>> + sigdelset(&cpu->hvf->sigmask, SIG_IPI); >>> + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL); >>> + >>> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi); >>> + sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI); >> There's no reason to unblock SIG_IPI while not in pselect and it can >> easily lead to missed wakeups. The whole point of pselect is so that >> you can guarantee that only one part of your program sees signals >> without a possibility of them being missed. > > > Hm, I think I start to agree with you here :). We can probably just > leave SIG_IPI masked at all times and only unmask on pselect. The > worst thing that will happen is a premature wakeup if we did get an > IPI incoming while hvf->sleeping is set, but were either not running > pselect() yet and bailed out or already finished pselect() execution. How about this one? Do you really think it's still racy? 
Alex diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c index 4360f64671..e10fca622d 100644 --- a/accel/hvf/hvf-cpus.c +++ b/accel/hvf/hvf-cpus.c @@ -337,16 +337,17 @@ static int hvf_init_vcpu(CPUState *cpu) cpu->hvf = g_malloc0(sizeof(*cpu->hvf)); /* init cpu signals */ - sigset_t set; struct sigaction sigact; memset(&sigact, 0, sizeof(sigact)); sigact.sa_handler = dummy_signal; sigaction(SIG_IPI, &sigact, NULL); - pthread_sigmask(SIG_BLOCK, NULL, &set); - sigdelset(&set, SIG_IPI); - pthread_sigmask(SIG_SETMASK, &set, NULL); + /* Remember unmasked IPI mask for pselect(), leave masked normally */ + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi); + sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI); + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL); + sigdelset(&cpu->hvf->sigmask_ipi, SIG_IPI); #ifdef __aarch64__ r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h index c56baa3ae8..8d7d4a6226 100644 --- a/include/sysemu/hvf_int.h +++ b/include/sysemu/hvf_int.h @@ -62,8 +62,8 @@ extern HVFState *hvf_state; struct hvf_vcpu_state { uint64_t fd; void *exit; - struct timespec ts; bool sleeping; + sigset_t sigmask_ipi; }; void assert_hvf_ok(hv_return_t ret); diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c index 0c01a03725..a255a1a7d3 100644 --- a/target/arm/hvf/hvf.c +++ b/target/arm/hvf/hvf.c @@ -320,14 +320,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) void hvf_kick_vcpu_thread(CPUState *cpu) { - if (cpu->hvf->sleeping) { - /* - * When sleeping, make sure we always send signals. Also, clear the - * timespec, so that an IPI that arrives between setting hvf->sleeping - * and the nanosleep syscall still aborts the sleep. 
- */ - cpu->thread_kicked = false; - cpu->hvf->ts = (struct timespec){ }; + if (qatomic_read(&cpu->hvf->sleeping)) { + /* When sleeping, send a signal to get out of pselect */ cpus_kick_thread(cpu); } else { hv_vcpus_exit(&cpu->hvf->fd, 1); @@ -354,6 +348,7 @@ int hvf_vcpu_exec(CPUState *cpu) ARMCPU *arm_cpu = ARM_CPU(cpu); CPUARMState *env = &arm_cpu->env; hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit; + const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ; hv_return_t r; int ret = 0; @@ -491,8 +486,8 @@ int hvf_vcpu_exec(CPUState *cpu) break; } case EC_WFX_TRAP: - if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { + if (!(syndrome & WFX_IS_WFE) && + !(cpu->interrupt_request & irq_mask)) { uint64_t cval, ctl, val, diff, now; /* Set up a local timer for vtimer if necessary ... */ @@ -515,9 +510,7 @@ int hvf_vcpu_exec(CPUState *cpu) if (diff < INT64_MAX) { uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); - struct timespec *ts = &cpu->hvf->ts; - - *ts = (struct timespec){ + struct timespec ts = { .tv_sec = ns / NANOSECONDS_PER_SECOND, .tv_nsec = ns % NANOSECONDS_PER_SECOND, }; @@ -526,27 +519,27 @@ int hvf_vcpu_exec(CPUState *cpu) * Waking up easily takes 1ms, don't go to sleep for smaller * time periods than 2ms. */ - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { + if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) { advance_pc = true; break; } + cpu->thread_kicked = false; + /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. 
*/ - cpu->hvf->sleeping = true; - smp_mb(); + qatomic_set(&cpu->hvf->sleeping, true); - /* Bail out if we received an IRQ meanwhile */ - if (cpu->thread_kicked || (cpu->interrupt_request & - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { - cpu->hvf->sleeping = false; + /* Bail out if we received a kick meanwhile */ + if (qatomic_read(&cpu->interrupt_request) & irq_mask) { + qatomic_set(&cpu->hvf->sleeping, false); break; } - /* nanosleep returns on signal, so we wake up on kick. */ - nanosleep(ts, NULL); + /* pselect returns on kick signal and consumes it */ + pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask_ipi); /* Out of sleep - either naturally or because of a kick */ - cpu->hvf->sleeping = false; + qatomic_set(&cpu->hvf->sleeping, false); } advance_pc = true;
On Tue, Dec 1, 2020 at 2:09 PM Alexander Graf <agraf@csgraf.de> wrote: > > > On 01.12.20 21:03, Peter Collingbourne wrote: > > On Tue, Dec 1, 2020 at 8:26 AM Alexander Graf <agraf@csgraf.de> wrote: > >> > >> On 01.12.20 09:21, Peter Collingbourne wrote: > >>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken > >>> up on IPI. > >>> > >>> Signed-off-by: Peter Collingbourne <pcc@google.com> > >>> --- > >>> Alexander Graf wrote: > >>>> I would love to take a patch from you here :). I'll still be stuck for a > >>>> while with the sysreg sync rework that Peter asked for before I can look > >>>> at WFI again. > >>> Okay, here's a patch :) It's a relatively straightforward adaptation > >>> of what we have in our fork, which can now boot Android to GUI while > >>> remaining at around 4% CPU when idle. > >>> > >>> I'm not set up to boot a full Linux distribution at the moment so I > >>> tested it on upstream QEMU by running a recent mainline Linux kernel > >>> with a rootfs containing an init program that just does sleep(5) > >>> and verified that the qemu process remains at low CPU usage during > >>> the sleep. This was on top of your v2 plus the last patch of your v1 > >>> since it doesn't look like you have a replacement for that logic yet. > >> > >> How about something like this instead? 
> >> > >> > >> Alex > >> > >> > >> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c > >> index 4360f64671..50384013ea 100644 > >> --- a/accel/hvf/hvf-cpus.c > >> +++ b/accel/hvf/hvf-cpus.c > >> @@ -337,16 +337,18 @@ static int hvf_init_vcpu(CPUState *cpu) > >> cpu->hvf = g_malloc0(sizeof(*cpu->hvf)); > >> > >> /* init cpu signals */ > >> - sigset_t set; > >> struct sigaction sigact; > >> > >> memset(&sigact, 0, sizeof(sigact)); > >> sigact.sa_handler = dummy_signal; > >> sigaction(SIG_IPI, &sigact, NULL); > >> > >> - pthread_sigmask(SIG_BLOCK, NULL, &set); > >> - sigdelset(&set, SIG_IPI); > >> - pthread_sigmask(SIG_SETMASK, &set, NULL); > >> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask); > >> + sigdelset(&cpu->hvf->sigmask, SIG_IPI); > >> + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL); > >> + > >> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->sigmask_ipi); > >> + sigaddset(&cpu->hvf->sigmask_ipi, SIG_IPI); > > There's no reason to unblock SIG_IPI while not in pselect and it can > > easily lead to missed wakeups. The whole point of pselect is so that > > you can guarantee that only one part of your program sees signals > > without a possibility of them being missed. > > > Hm, I think I start to agree with you here :). We can probably just > leave SIG_IPI masked at all times and only unmask on pselect. The worst > thing that will happen is a premature wakeup if we did get an IPI > incoming while hvf->sleeping is set, but were either not running > pselect() yet and bailed out or already finished pselect() execution. Ack. 
> > > >> #ifdef __aarch64__ > >> r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t > >> **)&cpu->hvf->exit, NULL); > >> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h > >> index c56baa3ae8..6e237f2db0 100644 > >> --- a/include/sysemu/hvf_int.h > >> +++ b/include/sysemu/hvf_int.h > >> @@ -62,8 +62,9 @@ extern HVFState *hvf_state; > >> struct hvf_vcpu_state { > >> uint64_t fd; > >> void *exit; > >> - struct timespec ts; > >> bool sleeping; > >> + sigset_t sigmask; > >> + sigset_t sigmask_ipi; > >> }; > >> > >> void assert_hvf_ok(hv_return_t ret); > >> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c > >> index 0c01a03725..350b845e6e 100644 > >> --- a/target/arm/hvf/hvf.c > >> +++ b/target/arm/hvf/hvf.c > >> @@ -320,20 +320,24 @@ int hvf_arch_init_vcpu(CPUState *cpu) > >> > >> void hvf_kick_vcpu_thread(CPUState *cpu) > >> { > >> - if (cpu->hvf->sleeping) { > >> - /* > >> - * When sleeping, make sure we always send signals. Also, clear the > >> - * timespec, so that an IPI that arrives between setting > >> hvf->sleeping > >> - * and the nanosleep syscall still aborts the sleep. 
> >> - */ > >> - cpu->thread_kicked = false; > >> - cpu->hvf->ts = (struct timespec){ }; > >> + if (qatomic_read(&cpu->hvf->sleeping)) { > >> + /* When sleeping, send a signal to get out of pselect */ > >> cpus_kick_thread(cpu); > >> } else { > >> hv_vcpus_exit(&cpu->hvf->fd, 1); > >> } > >> } > >> > >> +static void hvf_block_sig_ipi(CPUState *cpu) > >> +{ > >> + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask_ipi, NULL); > >> +} > >> + > >> +static void hvf_unblock_sig_ipi(CPUState *cpu) > >> +{ > >> + pthread_sigmask(SIG_SETMASK, &cpu->hvf->sigmask, NULL); > >> +} > >> + > >> static int hvf_inject_interrupts(CPUState *cpu) > >> { > >> if (cpu->interrupt_request & CPU_INTERRUPT_FIQ) { > >> @@ -354,6 +358,7 @@ int hvf_vcpu_exec(CPUState *cpu) > >> ARMCPU *arm_cpu = ARM_CPU(cpu); > >> CPUARMState *env = &arm_cpu->env; > >> hv_vcpu_exit_t *hvf_exit = cpu->hvf->exit; > >> + const uint32_t irq_mask = CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ; > >> hv_return_t r; > >> int ret = 0; > >> > >> @@ -491,8 +496,8 @@ int hvf_vcpu_exec(CPUState *cpu) > >> break; > >> } > >> case EC_WFX_TRAP: > >> - if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & > >> - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > >> + if (!(syndrome & WFX_IS_WFE) && > >> + !(cpu->interrupt_request & irq_mask)) { > >> uint64_t cval, ctl, val, diff, now; > > I don't think the access to cpu->interrupt_request is safe because it > > is done while not under the iothread lock. That's why to avoid these > > types of issues I would prefer to hold the lock almost all of the > > time. > > > In this branch, that's not a problem yet. On stale values, we either > don't sleep (which is ok), or we go into the sleep path, and reevaluate > cpu->interrupt_request atomically again after setting hvf->sleeping. Okay, this may be a "benign race" (and it may be helped a little by the M1's sequential consistency extension) but this is the sort of thing that I'd prefer not to rely on. At least it should be an atomic read. 
> > > >> /* Set up a local timer for vtimer if necessary ... */ > >> @@ -515,9 +520,7 @@ int hvf_vcpu_exec(CPUState *cpu) > >> > >> if (diff < INT64_MAX) { > >> uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); > >> - struct timespec *ts = &cpu->hvf->ts; > >> - > >> - *ts = (struct timespec){ > >> + struct timespec ts = { > >> .tv_sec = ns / NANOSECONDS_PER_SECOND, > >> .tv_nsec = ns % NANOSECONDS_PER_SECOND, > >> }; > >> @@ -526,27 +529,31 @@ int hvf_vcpu_exec(CPUState *cpu) > >> * Waking up easily takes 1ms, don't go to sleep > >> for smaller > >> * time periods than 2ms. > >> */ > >> - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { > >> + if (!ts.tv_sec && (ts.tv_nsec < (SCALE_MS * 2))) { > >> advance_pc = true; > >> break; > >> } > >> > >> + /* block SIG_IPI for the sleep */ > >> + hvf_block_sig_ipi(cpu); > >> + cpu->thread_kicked = false; > >> + > >> /* Set cpu->hvf->sleeping so that we get a SIG_IPI > >> signal. */ > >> - cpu->hvf->sleeping = true; > >> - smp_mb(); > >> + qatomic_set(&cpu->hvf->sleeping, true); > > This doesn't protect against races because another thread could call > > hvf_kick_vcpu_thread() at any time between when we return from > > hv_vcpu_run() and when we set sleeping = true and we would miss the > > wakeup (due to hvf_kick_vcpu_thread() seeing sleeping = false and > > calling hv_vcpus_exit() instead of pthread_kill()). I don't think it > > can be fixed by setting sleeping to true earlier either because no > > matter how early you move it, there will always be a window where we > > are going to pselect() but sleeping is false, resulting in a missed > > wakeup. > > > I don't follow. If anyone was sending us an IPI, it's because they want > to notify us about an update to cpu->interrupt_request, right? In that > case, the atomic read of that field below will catch it and bail out of > the sleep sequence. I think there are other possible reasons for an IPI, e.g. halted being set to 1, or I/O events.
Now we could check for halted below and maybe some of the others but the code will be subtle and it seems like a game of whack-a-mole to get them all. This is an example of what I was talking about when I said that an approach that relies on the sleeping field will be difficult to get right. I would strongly prefer to start with a simple approach and maybe we can consider a more complicated one later. Peter > > > > > > Peter > > > >> - /* Bail out if we received an IRQ meanwhile */ > >> - if (cpu->thread_kicked || (cpu->interrupt_request & > >> - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > >> - cpu->hvf->sleeping = false; > >> + /* Bail out if we received a kick meanwhile */ > >> + if (qatomic_read(&cpu->interrupt_request) & irq_mask) { > >> + qatomic_set(&cpu->hvf->sleeping, false); > > > ^^^ > > > Alex > > > >> + hvf_unblock_sig_ipi(cpu); > >> break; > >> } > >> > >> - /* nanosleep returns on signal, so we wake up on > >> kick. */ > >> - nanosleep(ts, NULL); > >> + /* pselect returns on kick signal and consumes it */ > >> + pselect(0, 0, 0, 0, &ts, &cpu->hvf->sigmask); > >> > >> /* Out of sleep - either naturally or because of a > >> kick */ > >> - cpu->hvf->sleeping = false; > >> + qatomic_set(&cpu->hvf->sleeping, false); > >> + hvf_unblock_sig_ipi(cpu); > >> } > >> > >> advance_pc = true; > >>
On Tue, Dec 1, 2020 at 2:04 PM Alexander Graf <agraf@csgraf.de> wrote: > > > On 01.12.20 19:59, Peter Collingbourne wrote: > > On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote: > >> Hi Peter, > >> > >> On 01.12.20 09:21, Peter Collingbourne wrote: > >>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken > >>> up on IPI. > >>> > >>> Signed-off-by: Peter Collingbourne <pcc@google.com> > >> > >> Thanks a bunch! > >> > >> > >>> --- > >>> Alexander Graf wrote: > >>>> I would love to take a patch from you here :). I'll still be stuck for a > >>>> while with the sysreg sync rework that Peter asked for before I can look > >>>> at WFI again. > >>> Okay, here's a patch :) It's a relatively straightforward adaptation > >>> of what we have in our fork, which can now boot Android to GUI while > >>> remaining at around 4% CPU when idle. > >>> > >>> I'm not set up to boot a full Linux distribution at the moment so I > >>> tested it on upstream QEMU by running a recent mainline Linux kernel > >>> with a rootfs containing an init program that just does sleep(5) > >>> and verified that the qemu process remains at low CPU usage during > >>> the sleep. This was on top of your v2 plus the last patch of your v1 > >>> since it doesn't look like you have a replacement for that logic yet. 
> >>> > >>> accel/hvf/hvf-cpus.c | 5 +-- > >>> include/sysemu/hvf_int.h | 3 +- > >>> target/arm/hvf/hvf.c | 94 +++++++++++----------------------------- > >>> 3 files changed, 28 insertions(+), 74 deletions(-) > >>> > >>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c > >>> index 4360f64671..b2c8fb57f6 100644 > >>> --- a/accel/hvf/hvf-cpus.c > >>> +++ b/accel/hvf/hvf-cpus.c > >>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu) > >>> sigact.sa_handler = dummy_signal; > >>> sigaction(SIG_IPI, &sigact, NULL); > >>> > >>> - pthread_sigmask(SIG_BLOCK, NULL, &set); > >>> - sigdelset(&set, SIG_IPI); > >>> - pthread_sigmask(SIG_SETMASK, &set, NULL); > >>> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask); > >>> + sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI); > >> > >> What will this do to the x86 hvf implementation? We're now not > >> unblocking SIG_IPI again for that, right? > > Yes and that was the case before your patch series. > > > The way I understand Roman, he wanted to unblock the IPI signal on x86: > > https://patchwork.kernel.org/project/qemu-devel/patch/20201126215017.41156-3-agraf@csgraf.de/#23807021 > > I agree that at this point it's not a problem though to break it again. > I'm not quite sure how to merge your patches within my patch set though, > given they basically revert half of my previously introduced code... 
> > > > > >>> #ifdef __aarch64__ > >>> r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); > >>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h > >>> index c56baa3ae8..13adf6ea77 100644 > >>> --- a/include/sysemu/hvf_int.h > >>> +++ b/include/sysemu/hvf_int.h > >>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state; > >>> struct hvf_vcpu_state { > >>> uint64_t fd; > >>> void *exit; > >>> - struct timespec ts; > >>> - bool sleeping; > >>> + sigset_t unblock_ipi_mask; > >>> }; > >>> > >>> void assert_hvf_ok(hv_return_t ret); > >>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c > >>> index 8fe10966d2..60a361ff38 100644 > >>> --- a/target/arm/hvf/hvf.c > >>> +++ b/target/arm/hvf/hvf.c > >>> @@ -2,6 +2,7 @@ > >>> * QEMU Hypervisor.framework support for Apple Silicon > >>> > >>> * Copyright 2020 Alexander Graf <agraf@csgraf.de> > >>> + * Copyright 2020 Google LLC > >>> * > >>> * This work is licensed under the terms of the GNU GPL, version 2 or later. > >>> * See the COPYING file in the top-level directory. > >>> @@ -18,6 +19,7 @@ > >>> #include "sysemu/hw_accel.h" > >>> > >>> #include <Hypervisor/Hypervisor.h> > >>> +#include <mach/mach_time.h> > >>> > >>> #include "exec/address-spaces.h" > >>> #include "hw/irq.h" > >>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) > >>> > >>> void hvf_kick_vcpu_thread(CPUState *cpu) > >>> { > >>> - if (cpu->hvf->sleeping) { > >>> - /* > >>> - * When sleeping, make sure we always send signals. Also, clear the > >>> - * timespec, so that an IPI that arrives between setting hvf->sleeping > >>> - * and the nanosleep syscall still aborts the sleep. 
> >>> - */ > >>> - cpu->thread_kicked = false; > >>> - cpu->hvf->ts = (struct timespec){ }; > >>> - cpus_kick_thread(cpu); > >>> - } else { > >>> - hv_vcpus_exit(&cpu->hvf->fd, 1); > >>> - } > >>> + cpus_kick_thread(cpu); > >>> + hv_vcpus_exit(&cpu->hvf->fd, 1); > >> > >> This means your first WFI will almost always return immediately due to a > >> pending signal, because there probably was an IRQ pending before on the > >> same CPU, no? > > That's right. Any approach involving the "sleeping" field would need > > to be implemented carefully to avoid races that may result in missed > > wakeups so for simplicity I just decided to send both kinds of > > wakeups. In particular the approach in the updated patch you sent is > > racy and I'll elaborate more in the reply to that patch. > > > >>> } > >>> > >>> static int hvf_inject_interrupts(CPUState *cpu) > >>> @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu) > >>> uint64_t syndrome = hvf_exit->exception.syndrome; > >>> uint32_t ec = syn_get_ec(syndrome); > >>> > >>> + qemu_mutex_lock_iothread(); > >> > >> Is there a particular reason you're moving the iothread lock out again > >> from the individual bits? I would really like to keep a notion of fast > >> path exits. > > We still need to lock at least once no matter the exit reason to check > > the interrupts so I don't think it's worth it to try and avoid locking > > like this. It also makes the implementation easier to reason about and > > therefore more likely to be correct. In our implementation we just > > stay locked the whole time unless we're in hv_vcpu_run() or pselect(). > > > >>> switch (exit_reason) { > >>> case HV_EXIT_REASON_EXCEPTION: > >>> /* This is the main one, handle below. 
*/ > >>> break; > >>> case HV_EXIT_REASON_VTIMER_ACTIVATED: > >>> - qemu_mutex_lock_iothread(); > >>> current_cpu = cpu; > >>> qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1); > >>> qemu_mutex_unlock_iothread(); > >>> continue; > >>> case HV_EXIT_REASON_CANCELED: > >>> /* we got kicked, no exit to process */ > >>> + qemu_mutex_unlock_iothread(); > >>> continue; > >>> default: > >>> assert(0); > >>> @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu) > >>> uint32_t srt = (syndrome >> 16) & 0x1f; > >>> uint64_t val = 0; > >>> > >>> - qemu_mutex_lock_iothread(); > >>> current_cpu = cpu; > >>> > >>> DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x " > >>> @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu) > >>> hvf_set_reg(cpu, srt, val); > >>> } > >>> > >>> - qemu_mutex_unlock_iothread(); > >>> - > >>> advance_pc = true; > >>> break; > >>> } > >>> @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu) > >>> case EC_WFX_TRAP: > >>> if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & > >>> (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > >>> - uint64_t cval, ctl, val, diff, now; > >>> + uint64_t cval; > >>> > >>> - /* Set up a local timer for vtimer if necessary ... 
*/ > >>> - r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl); > >>> - assert_hvf_ok(r); > >>> r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval); > >>> assert_hvf_ok(r); > >>> > >>> - asm volatile("mrs %0, cntvct_el0" : "=r"(val)); > >>> - diff = cval - val; > >>> - > >>> - now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / > >>> - gt_cntfrq_period_ns(arm_cpu); > >>> - > >>> - /* Timer disabled or masked, just wait for long */ > >>> - if (!(ctl & 1) || (ctl & 2)) { > >>> - diff = (120 * NANOSECONDS_PER_SECOND) / > >>> - gt_cntfrq_period_ns(arm_cpu); > >>> + int64_t ticks_to_sleep = cval - mach_absolute_time(); > >>> + if (ticks_to_sleep < 0) { > >>> + break; > >> > >> This will loop at 100% for Windows, which configures the vtimer as > >> cval=0 ctl=7, so with IRQ mask bit set. > > Okay, but the 120s is kind of arbitrary so we should just sleep until > > we get a signal. That can be done by passing null as the timespec > > argument to pselect(). > > > The reason I capped it at 120s was so that if I do hit a race, you don't > break everything forever. Only for 2 minutes :). I see. I think at this point we want to notice these types of bugs if they exist instead of hiding them, so I would mildly be in favor of not capping at 120s. > > > >> > >> Alex > >> > >> > >>> } > >>> > >>> - if (diff < INT64_MAX) { > >>> - uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); > >>> - struct timespec *ts = &cpu->hvf->ts; > >>> - > >>> - *ts = (struct timespec){ > >>> - .tv_sec = ns / NANOSECONDS_PER_SECOND, > >>> - .tv_nsec = ns % NANOSECONDS_PER_SECOND, > >>> - }; > >>> - > >>> - /* > >>> - * Waking up easily takes 1ms, don't go to sleep for smaller > >>> - * time periods than 2ms. > >>> - */ > >>> - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { > >> > >> I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to > >> return. Without logic like this, super short WFIs will hurt performance > >> quite badly. 
> > I don't think that's accurate. According to this benchmark it's a few > > hundred nanoseconds at most. > > > > pcc@pac-mini /tmp> cat pselect.c > > #include <signal.h> > > #include <sys/select.h> > > > > int main() { > > sigset_t mask, orig_mask; > > pthread_sigmask(SIG_SETMASK, 0, &mask); > > sigaddset(&mask, SIGUSR1); > > pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); > > > > for (int i = 0; i != 1000000; ++i) { > > struct timespec ts = { 0, 1 }; > > pselect(0, 0, 0, 0, &ts, &orig_mask); > > } > > } > > pcc@pac-mini /tmp> time ./pselect > > > > ________________________________________________________ > > Executed in 179.87 millis fish external > > usr time 77.68 millis 57.00 micros 77.62 millis > > sys time 101.37 millis 852.00 micros 100.52 millis > > > > Besides, all that you're really saving here is the single pselect > > call. There are no doubt more expensive syscalls involved in exiting > > and entering the VCPU that would dominate here. > > > I would expect that such a super low ts value has a short-circuit path > in the kernel as well. Where things start to fall apart is when you're > at a threshold where rescheduling might be ok, but then you need to take > all of the additional task switch overhead into account. Try to adapt > your test code a bit: > > #include <signal.h> > #include <sys/select.h> > > int main() { > sigset_t mask, orig_mask; > pthread_sigmask(SIG_SETMASK, 0, &mask); > sigaddset(&mask, SIGUSR1); > pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); > > for (int i = 0; i != 10000; ++i) { > #define SCALE_MS 1000000 > struct timespec ts = { 0, SCALE_MS / 10 }; > pselect(0, 0, 0, 0, &ts, &orig_mask); > } > } > > > % time ./pselect > ./pselect 0.00s user 0.01s system 1% cpu 1.282 total > > You're suddenly seeing 300µs overhead per pselect call then. 
When I > measured actual enter/exit times in QEMU, I saw much bigger differences > between "time I want to sleep for" and "time I did sleep" even when just > capturing the virtual time before and after the nanosleep/pselect call. Okay. So the alternative is that we spin on the CPU, either doing no-op VCPU entries/exits or something like: while (mach_absolute_time() < cval); My intuition is we shouldn't try to subvert the OS scheduler like this unless it's proven to help with some real world metric since otherwise we're not being fair to the other processes on the CPU. With CPU intensive workloads I wouldn't expect these kinds of sleeps to happen very often if at all so if it's only microbenchmarks and so on that are affected then my inclination is not to do this for now. Peter
On 02.12.20 02:19, Peter Collingbourne wrote: > On Tue, Dec 1, 2020 at 2:04 PM Alexander Graf <agraf@csgraf.de> wrote: >> >> On 01.12.20 19:59, Peter Collingbourne wrote: >>> On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote: >>>> Hi Peter, >>>> >>>> On 01.12.20 09:21, Peter Collingbourne wrote: >>>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken >>>>> up on IPI. >>>>> >>>>> Signed-off-by: Peter Collingbourne <pcc@google.com> >>>> Thanks a bunch! >>>> >>>> >>>>> --- >>>>> Alexander Graf wrote: >>>>>> I would love to take a patch from you here :). I'll still be stuck for a >>>>>> while with the sysreg sync rework that Peter asked for before I can look >>>>>> at WFI again. >>>>> Okay, here's a patch :) It's a relatively straightforward adaptation >>>>> of what we have in our fork, which can now boot Android to GUI while >>>>> remaining at around 4% CPU when idle. >>>>> >>>>> I'm not set up to boot a full Linux distribution at the moment so I >>>>> tested it on upstream QEMU by running a recent mainline Linux kernel >>>>> with a rootfs containing an init program that just does sleep(5) >>>>> and verified that the qemu process remains at low CPU usage during >>>>> the sleep. This was on top of your v2 plus the last patch of your v1 >>>>> since it doesn't look like you have a replacement for that logic yet. 
>>>>> >>>>> accel/hvf/hvf-cpus.c | 5 +-- >>>>> include/sysemu/hvf_int.h | 3 +- >>>>> target/arm/hvf/hvf.c | 94 +++++++++++----------------------------- >>>>> 3 files changed, 28 insertions(+), 74 deletions(-) >>>>> >>>>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c >>>>> index 4360f64671..b2c8fb57f6 100644 >>>>> --- a/accel/hvf/hvf-cpus.c >>>>> +++ b/accel/hvf/hvf-cpus.c >>>>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu) >>>>> sigact.sa_handler = dummy_signal; >>>>> sigaction(SIG_IPI, &sigact, NULL); >>>>> >>>>> - pthread_sigmask(SIG_BLOCK, NULL, &set); >>>>> - sigdelset(&set, SIG_IPI); >>>>> - pthread_sigmask(SIG_SETMASK, &set, NULL); >>>>> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask); >>>>> + sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI); >>>> What will this do to the x86 hvf implementation? We're now not >>>> unblocking SIG_IPI again for that, right? >>> Yes and that was the case before your patch series. >> >> The way I understand Roman, he wanted to unblock the IPI signal on x86: >> >> https://patchwork.kernel.org/project/qemu-devel/patch/20201126215017.41156-3-agraf@csgraf.de/#23807021 >> >> I agree that at this point it's not a problem though to break it again. >> I'm not quite sure how to merge your patches within my patch set though, >> given they basically revert half of my previously introduced code... 
>> >> >>>>> #ifdef __aarch64__ >>>>> r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); >>>>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h >>>>> index c56baa3ae8..13adf6ea77 100644 >>>>> --- a/include/sysemu/hvf_int.h >>>>> +++ b/include/sysemu/hvf_int.h >>>>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state; >>>>> struct hvf_vcpu_state { >>>>> uint64_t fd; >>>>> void *exit; >>>>> - struct timespec ts; >>>>> - bool sleeping; >>>>> + sigset_t unblock_ipi_mask; >>>>> }; >>>>> >>>>> void assert_hvf_ok(hv_return_t ret); >>>>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c >>>>> index 8fe10966d2..60a361ff38 100644 >>>>> --- a/target/arm/hvf/hvf.c >>>>> +++ b/target/arm/hvf/hvf.c >>>>> @@ -2,6 +2,7 @@ >>>>> * QEMU Hypervisor.framework support for Apple Silicon >>>>> >>>>> * Copyright 2020 Alexander Graf <agraf@csgraf.de> >>>>> + * Copyright 2020 Google LLC >>>>> * >>>>> * This work is licensed under the terms of the GNU GPL, version 2 or later. >>>>> * See the COPYING file in the top-level directory. >>>>> @@ -18,6 +19,7 @@ >>>>> #include "sysemu/hw_accel.h" >>>>> >>>>> #include <Hypervisor/Hypervisor.h> >>>>> +#include <mach/mach_time.h> >>>>> >>>>> #include "exec/address-spaces.h" >>>>> #include "hw/irq.h" >>>>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) >>>>> >>>>> void hvf_kick_vcpu_thread(CPUState *cpu) >>>>> { >>>>> - if (cpu->hvf->sleeping) { >>>>> - /* >>>>> - * When sleeping, make sure we always send signals. Also, clear the >>>>> - * timespec, so that an IPI that arrives between setting hvf->sleeping >>>>> - * and the nanosleep syscall still aborts the sleep. 
>>>>> - */ >>>>> - cpu->thread_kicked = false; >>>>> - cpu->hvf->ts = (struct timespec){ }; >>>>> - cpus_kick_thread(cpu); >>>>> - } else { >>>>> - hv_vcpus_exit(&cpu->hvf->fd, 1); >>>>> - } >>>>> + cpus_kick_thread(cpu); >>>>> + hv_vcpus_exit(&cpu->hvf->fd, 1); >>>> This means your first WFI will almost always return immediately due to a >>>> pending signal, because there probably was an IRQ pending before on the >>>> same CPU, no? >>> That's right. Any approach involving the "sleeping" field would need >>> to be implemented carefully to avoid races that may result in missed >>> wakeups so for simplicity I just decided to send both kinds of >>> wakeups. In particular the approach in the updated patch you sent is >>> racy and I'll elaborate more in the reply to that patch. >>> >>>>> } >>>>> >>>>> static int hvf_inject_interrupts(CPUState *cpu) >>>>> @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu) >>>>> uint64_t syndrome = hvf_exit->exception.syndrome; >>>>> uint32_t ec = syn_get_ec(syndrome); >>>>> >>>>> + qemu_mutex_lock_iothread(); >>>> Is there a particular reason you're moving the iothread lock out again >>>> from the individual bits? I would really like to keep a notion of fast >>>> path exits. >>> We still need to lock at least once no matter the exit reason to check >>> the interrupts so I don't think it's worth it to try and avoid locking >>> like this. It also makes the implementation easier to reason about and >>> therefore more likely to be correct. In our implementation we just >>> stay locked the whole time unless we're in hv_vcpu_run() or pselect(). >>> >>>>> switch (exit_reason) { >>>>> case HV_EXIT_REASON_EXCEPTION: >>>>> /* This is the main one, handle below. 
*/ >>>>> break; >>>>> case HV_EXIT_REASON_VTIMER_ACTIVATED: >>>>> - qemu_mutex_lock_iothread(); >>>>> current_cpu = cpu; >>>>> qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1); >>>>> qemu_mutex_unlock_iothread(); >>>>> continue; >>>>> case HV_EXIT_REASON_CANCELED: >>>>> /* we got kicked, no exit to process */ >>>>> + qemu_mutex_unlock_iothread(); >>>>> continue; >>>>> default: >>>>> assert(0); >>>>> @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu) >>>>> uint32_t srt = (syndrome >> 16) & 0x1f; >>>>> uint64_t val = 0; >>>>> >>>>> - qemu_mutex_lock_iothread(); >>>>> current_cpu = cpu; >>>>> >>>>> DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x " >>>>> @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu) >>>>> hvf_set_reg(cpu, srt, val); >>>>> } >>>>> >>>>> - qemu_mutex_unlock_iothread(); >>>>> - >>>>> advance_pc = true; >>>>> break; >>>>> } >>>>> @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu) >>>>> case EC_WFX_TRAP: >>>>> if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & >>>>> (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { >>>>> - uint64_t cval, ctl, val, diff, now; >>>>> + uint64_t cval; >>>>> >>>>> - /* Set up a local timer for vtimer if necessary ... 
*/ >>>>> - r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl); >>>>> - assert_hvf_ok(r); >>>>> r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval); >>>>> assert_hvf_ok(r); >>>>> >>>>> - asm volatile("mrs %0, cntvct_el0" : "=r"(val)); >>>>> - diff = cval - val; >>>>> - >>>>> - now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / >>>>> - gt_cntfrq_period_ns(arm_cpu); >>>>> - >>>>> - /* Timer disabled or masked, just wait for long */ >>>>> - if (!(ctl & 1) || (ctl & 2)) { >>>>> - diff = (120 * NANOSECONDS_PER_SECOND) / >>>>> - gt_cntfrq_period_ns(arm_cpu); >>>>> + int64_t ticks_to_sleep = cval - mach_absolute_time(); >>>>> + if (ticks_to_sleep < 0) { >>>>> + break; >>>> This will loop at 100% for Windows, which configures the vtimer as >>>> cval=0 ctl=7, so with IRQ mask bit set. >>> Okay, but the 120s is kind of arbitrary so we should just sleep until >>> we get a signal. That can be done by passing null as the timespec >>> argument to pselect(). >> >> The reason I capped it at 120s was so that if I do hit a race, you don't >> break everything forever. Only for 2 minutes :). > I see. I think at this point we want to notice these types of bugs if > they exist instead of hiding them, so I would mildly be in favor of > not capping at 120s. Crossing my fingers that we are at that point already :). > >>>> Alex >>>> >>>> >>>>> } >>>>> >>>>> - if (diff < INT64_MAX) { >>>>> - uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); >>>>> - struct timespec *ts = &cpu->hvf->ts; >>>>> - >>>>> - *ts = (struct timespec){ >>>>> - .tv_sec = ns / NANOSECONDS_PER_SECOND, >>>>> - .tv_nsec = ns % NANOSECONDS_PER_SECOND, >>>>> - }; >>>>> - >>>>> - /* >>>>> - * Waking up easily takes 1ms, don't go to sleep for smaller >>>>> - * time periods than 2ms. >>>>> - */ >>>>> - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { >>>> I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to >>>> return. 
Without logic like this, super short WFIs will hurt performance >>>> quite badly. >>> I don't think that's accurate. According to this benchmark it's a few >>> hundred nanoseconds at most. >>> >>> pcc@pac-mini /tmp> cat pselect.c >>> #include <signal.h> >>> #include <sys/select.h> >>> >>> int main() { >>> sigset_t mask, orig_mask; >>> pthread_sigmask(SIG_SETMASK, 0, &mask); >>> sigaddset(&mask, SIGUSR1); >>> pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); >>> >>> for (int i = 0; i != 1000000; ++i) { >>> struct timespec ts = { 0, 1 }; >>> pselect(0, 0, 0, 0, &ts, &orig_mask); >>> } >>> } >>> pcc@pac-mini /tmp> time ./pselect >>> >>> ________________________________________________________ >>> Executed in 179.87 millis fish external >>> usr time 77.68 millis 57.00 micros 77.62 millis >>> sys time 101.37 millis 852.00 micros 100.52 millis >>> >>> Besides, all that you're really saving here is the single pselect >>> call. There are no doubt more expensive syscalls involved in exiting >>> and entering the VCPU that would dominate here. >> >> I would expect that such a super low ts value has a short-circuit path >> in the kernel as well. Where things start to fall apart is when you're >> at a threshold where rescheduling might be ok, but then you need to take >> all of the additional task switch overhead into account. Try to adapt >> your test code a bit: >> >> #include <signal.h> >> #include <sys/select.h> >> >> int main() { >> sigset_t mask, orig_mask; >> pthread_sigmask(SIG_SETMASK, 0, &mask); >> sigaddset(&mask, SIGUSR1); >> pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); >> >> for (int i = 0; i != 10000; ++i) { >> #define SCALE_MS 1000000 >> struct timespec ts = { 0, SCALE_MS / 10 }; >> pselect(0, 0, 0, 0, &ts, &orig_mask); >> } >> } >> >> >> % time ./pselect >> ./pselect 0.00s user 0.01s system 1% cpu 1.282 total >> >> You're suddenly seeing 300µs overhead per pselect call then. 
When I >> measured actual enter/exit times in QEMU, I saw much bigger differences >> between "time I want to sleep for" and "time I did sleep" even when just >> capturing the virtual time before and after the nanosleep/pselect call. > Okay. So the alternative is that we spin on the CPU, either doing > no-op VCPU entries/exits or something like: > > while (mach_absolute_time() < cval); This won't catch events that arrive during that time, such as interrupts, right? I'd just declare the WFI as done and keep looping in and out of the guest for now. > My intuition is we shouldn't try to subvert the OS scheduler like this > unless it's proven to help with some real world metric since otherwise > we're not being fair to the other processes on the CPU. With CPU > intensive workloads I wouldn't expect these kinds of sleeps to happen > very often if at all so if it's only microbenchmarks and so on that > are affected then my inclination is not to do this for now. The problem is that the VM's OS is expecting bare metal timer behavior usually. And that gives you much better granularities than what we can achieve with a virtualization layer on top. So I do feel strongly about leaving this bit in. In the workloads you describe above, you won't ever hit that branch anyway. The workloads that benefit from logic like this are message passing ones. Check out this presentation from a KVM colleague of yours for details: https://www.linux-kvm.org/images/a/ac/02x03-Davit_Matalack-KVM_Message_passing_Performance.pdf https://www.youtube.com/watch?v=p85FFrloLFg Alex
On Tue, Dec 1, 2020 at 5:53 PM Alexander Graf <agraf@csgraf.de> wrote: > > > On 02.12.20 02:19, Peter Collingbourne wrote: > > On Tue, Dec 1, 2020 at 2:04 PM Alexander Graf <agraf@csgraf.de> wrote: > >> > >> On 01.12.20 19:59, Peter Collingbourne wrote: > >>> On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote: > >>>> Hi Peter, > >>>> > >>>> On 01.12.20 09:21, Peter Collingbourne wrote: > >>>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken > >>>>> up on IPI. > >>>>> > >>>>> Signed-off-by: Peter Collingbourne <pcc@google.com> > >>>> Thanks a bunch! > >>>> > >>>> > >>>>> --- > >>>>> Alexander Graf wrote: > >>>>>> I would love to take a patch from you here :). I'll still be stuck for a > >>>>>> while with the sysreg sync rework that Peter asked for before I can look > >>>>>> at WFI again. > >>>>> Okay, here's a patch :) It's a relatively straightforward adaptation > >>>>> of what we have in our fork, which can now boot Android to GUI while > >>>>> remaining at around 4% CPU when idle. > >>>>> > >>>>> I'm not set up to boot a full Linux distribution at the moment so I > >>>>> tested it on upstream QEMU by running a recent mainline Linux kernel > >>>>> with a rootfs containing an init program that just does sleep(5) > >>>>> and verified that the qemu process remains at low CPU usage during > >>>>> the sleep. This was on top of your v2 plus the last patch of your v1 > >>>>> since it doesn't look like you have a replacement for that logic yet. 
> >>>>> > >>>>> accel/hvf/hvf-cpus.c | 5 +-- > >>>>> include/sysemu/hvf_int.h | 3 +- > >>>>> target/arm/hvf/hvf.c | 94 +++++++++++----------------------------- > >>>>> 3 files changed, 28 insertions(+), 74 deletions(-) > >>>>> > >>>>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c > >>>>> index 4360f64671..b2c8fb57f6 100644 > >>>>> --- a/accel/hvf/hvf-cpus.c > >>>>> +++ b/accel/hvf/hvf-cpus.c > >>>>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu) > >>>>> sigact.sa_handler = dummy_signal; > >>>>> sigaction(SIG_IPI, &sigact, NULL); > >>>>> > >>>>> - pthread_sigmask(SIG_BLOCK, NULL, &set); > >>>>> - sigdelset(&set, SIG_IPI); > >>>>> - pthread_sigmask(SIG_SETMASK, &set, NULL); > >>>>> + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask); > >>>>> + sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI); > >>>> What will this do to the x86 hvf implementation? We're now not > >>>> unblocking SIG_IPI again for that, right? > >>> Yes and that was the case before your patch series. > >> > >> The way I understand Roman, he wanted to unblock the IPI signal on x86: > >> > >> https://patchwork.kernel.org/project/qemu-devel/patch/20201126215017.41156-3-agraf@csgraf.de/#23807021 > >> > >> I agree that at this point it's not a problem though to break it again. > >> I'm not quite sure how to merge your patches within my patch set though, > >> given they basically revert half of my previously introduced code... 
> >> > >> > >>>>> #ifdef __aarch64__ > >>>>> r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); > >>>>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h > >>>>> index c56baa3ae8..13adf6ea77 100644 > >>>>> --- a/include/sysemu/hvf_int.h > >>>>> +++ b/include/sysemu/hvf_int.h > >>>>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state; > >>>>> struct hvf_vcpu_state { > >>>>> uint64_t fd; > >>>>> void *exit; > >>>>> - struct timespec ts; > >>>>> - bool sleeping; > >>>>> + sigset_t unblock_ipi_mask; > >>>>> }; > >>>>> > >>>>> void assert_hvf_ok(hv_return_t ret); > >>>>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c > >>>>> index 8fe10966d2..60a361ff38 100644 > >>>>> --- a/target/arm/hvf/hvf.c > >>>>> +++ b/target/arm/hvf/hvf.c > >>>>> @@ -2,6 +2,7 @@ > >>>>> * QEMU Hypervisor.framework support for Apple Silicon > >>>>> > >>>>> * Copyright 2020 Alexander Graf <agraf@csgraf.de> > >>>>> + * Copyright 2020 Google LLC > >>>>> * > >>>>> * This work is licensed under the terms of the GNU GPL, version 2 or later. > >>>>> * See the COPYING file in the top-level directory. > >>>>> @@ -18,6 +19,7 @@ > >>>>> #include "sysemu/hw_accel.h" > >>>>> > >>>>> #include <Hypervisor/Hypervisor.h> > >>>>> +#include <mach/mach_time.h> > >>>>> > >>>>> #include "exec/address-spaces.h" > >>>>> #include "hw/irq.h" > >>>>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) > >>>>> > >>>>> void hvf_kick_vcpu_thread(CPUState *cpu) > >>>>> { > >>>>> - if (cpu->hvf->sleeping) { > >>>>> - /* > >>>>> - * When sleeping, make sure we always send signals. Also, clear the > >>>>> - * timespec, so that an IPI that arrives between setting hvf->sleeping > >>>>> - * and the nanosleep syscall still aborts the sleep. 
> >>>>> - */ > >>>>> - cpu->thread_kicked = false; > >>>>> - cpu->hvf->ts = (struct timespec){ }; > >>>>> - cpus_kick_thread(cpu); > >>>>> - } else { > >>>>> - hv_vcpus_exit(&cpu->hvf->fd, 1); > >>>>> - } > >>>>> + cpus_kick_thread(cpu); > >>>>> + hv_vcpus_exit(&cpu->hvf->fd, 1); > >>>> This means your first WFI will almost always return immediately due to a > >>>> pending signal, because there probably was an IRQ pending before on the > >>>> same CPU, no? > >>> That's right. Any approach involving the "sleeping" field would need > >>> to be implemented carefully to avoid races that may result in missed > >>> wakeups so for simplicity I just decided to send both kinds of > >>> wakeups. In particular the approach in the updated patch you sent is > >>> racy and I'll elaborate more in the reply to that patch. > >>> > >>>>> } > >>>>> > >>>>> static int hvf_inject_interrupts(CPUState *cpu) > >>>>> @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu) > >>>>> uint64_t syndrome = hvf_exit->exception.syndrome; > >>>>> uint32_t ec = syn_get_ec(syndrome); > >>>>> > >>>>> + qemu_mutex_lock_iothread(); > >>>> Is there a particular reason you're moving the iothread lock out again > >>>> from the individual bits? I would really like to keep a notion of fast > >>>> path exits. > >>> We still need to lock at least once no matter the exit reason to check > >>> the interrupts so I don't think it's worth it to try and avoid locking > >>> like this. It also makes the implementation easier to reason about and > >>> therefore more likely to be correct. In our implementation we just > >>> stay locked the whole time unless we're in hv_vcpu_run() or pselect(). > >>> > >>>>> switch (exit_reason) { > >>>>> case HV_EXIT_REASON_EXCEPTION: > >>>>> /* This is the main one, handle below. 
*/ > >>>>> break; > >>>>> case HV_EXIT_REASON_VTIMER_ACTIVATED: > >>>>> - qemu_mutex_lock_iothread(); > >>>>> current_cpu = cpu; > >>>>> qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1); > >>>>> qemu_mutex_unlock_iothread(); > >>>>> continue; > >>>>> case HV_EXIT_REASON_CANCELED: > >>>>> /* we got kicked, no exit to process */ > >>>>> + qemu_mutex_unlock_iothread(); > >>>>> continue; > >>>>> default: > >>>>> assert(0); > >>>>> @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu) > >>>>> uint32_t srt = (syndrome >> 16) & 0x1f; > >>>>> uint64_t val = 0; > >>>>> > >>>>> - qemu_mutex_lock_iothread(); > >>>>> current_cpu = cpu; > >>>>> > >>>>> DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x " > >>>>> @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu) > >>>>> hvf_set_reg(cpu, srt, val); > >>>>> } > >>>>> > >>>>> - qemu_mutex_unlock_iothread(); > >>>>> - > >>>>> advance_pc = true; > >>>>> break; > >>>>> } > >>>>> @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu) > >>>>> case EC_WFX_TRAP: > >>>>> if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & > >>>>> (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > >>>>> - uint64_t cval, ctl, val, diff, now; > >>>>> + uint64_t cval; > >>>>> > >>>>> - /* Set up a local timer for vtimer if necessary ... 
*/ > >>>>> - r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl); > >>>>> - assert_hvf_ok(r); > >>>>> r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval); > >>>>> assert_hvf_ok(r); > >>>>> > >>>>> - asm volatile("mrs %0, cntvct_el0" : "=r"(val)); > >>>>> - diff = cval - val; > >>>>> - > >>>>> - now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / > >>>>> - gt_cntfrq_period_ns(arm_cpu); > >>>>> - > >>>>> - /* Timer disabled or masked, just wait for long */ > >>>>> - if (!(ctl & 1) || (ctl & 2)) { > >>>>> - diff = (120 * NANOSECONDS_PER_SECOND) / > >>>>> - gt_cntfrq_period_ns(arm_cpu); > >>>>> + int64_t ticks_to_sleep = cval - mach_absolute_time(); > >>>>> + if (ticks_to_sleep < 0) { > >>>>> + break; > >>>> This will loop at 100% for Windows, which configures the vtimer as > >>>> cval=0 ctl=7, so with IRQ mask bit set. > >>> Okay, but the 120s is kind of arbitrary so we should just sleep until > >>> we get a signal. That can be done by passing null as the timespec > >>> argument to pselect(). > >> > >> The reason I capped it at 120s was so that if I do hit a race, you don't > >> break everything forever. Only for 2 minutes :). > > I see. I think at this point we want to notice these types of bugs if > > they exist instead of hiding them, so I would mildly be in favor of > > not capping at 120s. > > > Crossing my fingers that we are at that point already :). > > > > > >>>> Alex > >>>> > >>>> > >>>>> } > >>>>> > >>>>> - if (diff < INT64_MAX) { > >>>>> - uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); > >>>>> - struct timespec *ts = &cpu->hvf->ts; > >>>>> - > >>>>> - *ts = (struct timespec){ > >>>>> - .tv_sec = ns / NANOSECONDS_PER_SECOND, > >>>>> - .tv_nsec = ns % NANOSECONDS_PER_SECOND, > >>>>> - }; > >>>>> - > >>>>> - /* > >>>>> - * Waking up easily takes 1ms, don't go to sleep for smaller > >>>>> - * time periods than 2ms. 
> >>>>> - */ > >>>>> - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { > >>>> I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to > >>>> return. Without logic like this, super short WFIs will hurt performance > >>>> quite badly. > >>> I don't think that's accurate. According to this benchmark it's a few > >>> hundred nanoseconds at most. > >>> > >>> pcc@pac-mini /tmp> cat pselect.c > >>> #include <signal.h> > >>> #include <sys/select.h> > >>> > >>> int main() { > >>> sigset_t mask, orig_mask; > >>> pthread_sigmask(SIG_SETMASK, 0, &mask); > >>> sigaddset(&mask, SIGUSR1); > >>> pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); > >>> > >>> for (int i = 0; i != 1000000; ++i) { > >>> struct timespec ts = { 0, 1 }; > >>> pselect(0, 0, 0, 0, &ts, &orig_mask); > >>> } > >>> } > >>> pcc@pac-mini /tmp> time ./pselect > >>> > >>> ________________________________________________________ > >>> Executed in 179.87 millis fish external > >>> usr time 77.68 millis 57.00 micros 77.62 millis > >>> sys time 101.37 millis 852.00 micros 100.52 millis > >>> > >>> Besides, all that you're really saving here is the single pselect > >>> call. There are no doubt more expensive syscalls involved in exiting > >>> and entering the VCPU that would dominate here. > >> > >> I would expect that such a super low ts value has a short-circuit path > >> in the kernel as well. Where things start to fall apart is when you're > >> at a threshold where rescheduling might be ok, but then you need to take > >> all of the additional task switch overhead into account. 
Try to adapt > >> your test code a bit: > >> > >> #include <signal.h> > >> #include <sys/select.h> > >> > >> int main() { > >> sigset_t mask, orig_mask; > >> pthread_sigmask(SIG_SETMASK, 0, &mask); > >> sigaddset(&mask, SIGUSR1); > >> pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); > >> > >> for (int i = 0; i != 10000; ++i) { > >> #define SCALE_MS 1000000 > >> struct timespec ts = { 0, SCALE_MS / 10 }; > >> pselect(0, 0, 0, 0, &ts, &orig_mask); > >> } > >> } > >> > >> > >> % time ./pselect > >> ./pselect 0.00s user 0.01s system 1% cpu 1.282 total > >> > >> You're suddenly seeing 300µs overhead per pselect call then. When I > >> measured actual enter/exit times in QEMU, I saw much bigger differences > >> between "time I want to sleep for" and "time I did sleep" even when just > >> capturing the virtual time before and after the nanosleep/pselect call. > > Okay. So the alternative is that we spin on the CPU, either doing > > no-op VCPU entries/exits or something like: > > > > while (mach_absolute_time() < cval); > > > This won't catch events that arrive during that time, such as > interrupts, right? I'd just declare the WFI as done and keep looping in > and out of the guest for now. Oh, that's a good point. > > My intuition is we shouldn't try to subvert the OS scheduler like this > > unless it's proven to help with some real world metric since otherwise > > we're not being fair to the other processes on the CPU. With CPU > > intensive workloads I wouldn't expect these kinds of sleeps to happen > > very often if at all so if it's only microbenchmarks and so on that > > are affected then my inclination is not to do this for now. > > > The problem is that the VM's OS is expecting bare metal timer behavior > usually. And that gives you much better granularities than what we can > achieve with a virtualization layer on top. So I do feel strongly about > leaving this bit in. In the workloads you describe above, you won't ever > hit that branch anyway. 
> > The workloads that benefit from logic like this are message passing > ones. Check out this presentation from a KVM colleague of yours for details: > > https://www.linux-kvm.org/images/a/ac/02x03-Davit_Matalack-KVM_Message_passing_Performance.pdf > https://www.youtube.com/watch?v=p85FFrloLFg Mm, okay. I personally would not add anything like that at this point without real-world data but I don't feel too strongly and I suppose the implementation can always be adjusted later. Peter
On Tue, Dec 01, 2020 at 10:59:50AM -0800, Peter Collingbourne wrote: > On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote: > > > > Hi Peter, > > > > On 01.12.20 09:21, Peter Collingbourne wrote: > > > Sleep on WFx until the VTIMER is due but allow ourselves to be woken > > > up on IPI. > > > > > > Signed-off-by: Peter Collingbourne <pcc@google.com> > > > > > > Thanks a bunch! > > > > > > > --- > > > Alexander Graf wrote: > > >> I would love to take a patch from you here :). I'll still be stuck for a > > >> while with the sysreg sync rework that Peter asked for before I can look > > >> at WFI again. > > > Okay, here's a patch :) It's a relatively straightforward adaptation > > > of what we have in our fork, which can now boot Android to GUI while > > > remaining at around 4% CPU when idle. > > > > > > I'm not set up to boot a full Linux distribution at the moment so I > > > tested it on upstream QEMU by running a recent mainline Linux kernel > > > with a rootfs containing an init program that just does sleep(5) > > > and verified that the qemu process remains at low CPU usage during > > > the sleep. This was on top of your v2 plus the last patch of your v1 > > > since it doesn't look like you have a replacement for that logic yet. 
> > > > > > accel/hvf/hvf-cpus.c | 5 +-- > > > include/sysemu/hvf_int.h | 3 +- > > > target/arm/hvf/hvf.c | 94 +++++++++++----------------------------- > > > 3 files changed, 28 insertions(+), 74 deletions(-) > > > > > > diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c > > > index 4360f64671..b2c8fb57f6 100644 > > > --- a/accel/hvf/hvf-cpus.c > > > +++ b/accel/hvf/hvf-cpus.c > > > @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu) > > > sigact.sa_handler = dummy_signal; > > > sigaction(SIG_IPI, &sigact, NULL); > > > > > > - pthread_sigmask(SIG_BLOCK, NULL, &set); > > > - sigdelset(&set, SIG_IPI); > > > - pthread_sigmask(SIG_SETMASK, &set, NULL); > > > + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask); > > > + sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI); > > > > > > What will this do to the x86 hvf implementation? We're now not > > unblocking SIG_IPI again for that, right? > > Yes and that was the case before your patch series. > > > > > > > #ifdef __aarch64__ > > > r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); > > > diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h > > > index c56baa3ae8..13adf6ea77 100644 > > > --- a/include/sysemu/hvf_int.h > > > +++ b/include/sysemu/hvf_int.h > > > @@ -62,8 +62,7 @@ extern HVFState *hvf_state; > > > struct hvf_vcpu_state { > > > uint64_t fd; > > > void *exit; > > > - struct timespec ts; > > > - bool sleeping; > > > + sigset_t unblock_ipi_mask; > > > }; > > > > > > void assert_hvf_ok(hv_return_t ret); > > > diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c > > > index 8fe10966d2..60a361ff38 100644 > > > --- a/target/arm/hvf/hvf.c > > > +++ b/target/arm/hvf/hvf.c > > > @@ -2,6 +2,7 @@ > > > * QEMU Hypervisor.framework support for Apple Silicon > > > > > > * Copyright 2020 Alexander Graf <agraf@csgraf.de> > > > + * Copyright 2020 Google LLC > > > * > > > * This work is licensed under the terms of the GNU GPL, version 2 or later. 
> > > * See the COPYING file in the top-level directory. > > > @@ -18,6 +19,7 @@ > > > #include "sysemu/hw_accel.h" > > > > > > #include <Hypervisor/Hypervisor.h> > > > +#include <mach/mach_time.h> > > > > > > #include "exec/address-spaces.h" > > > #include "hw/irq.h" > > > @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) > > > > > > void hvf_kick_vcpu_thread(CPUState *cpu) > > > { > > > - if (cpu->hvf->sleeping) { > > > - /* > > > - * When sleeping, make sure we always send signals. Also, clear the > > > - * timespec, so that an IPI that arrives between setting hvf->sleeping > > > - * and the nanosleep syscall still aborts the sleep. > > > - */ > > > - cpu->thread_kicked = false; > > > - cpu->hvf->ts = (struct timespec){ }; > > > - cpus_kick_thread(cpu); > > > - } else { > > > - hv_vcpus_exit(&cpu->hvf->fd, 1); > > > - } > > > + cpus_kick_thread(cpu); > > > + hv_vcpus_exit(&cpu->hvf->fd, 1); > > > > > > This means your first WFI will almost always return immediately due to a > > pending signal, because there probably was an IRQ pending before on the > > same CPU, no? > > That's right. Any approach involving the "sleeping" field would need > to be implemented carefully to avoid races that may result in missed > wakeups so for simplicity I just decided to send both kinds of > wakeups. In particular the approach in the updated patch you sent is > racy and I'll elaborate more in the reply to that patch. > > > > } > > > > > > static int hvf_inject_interrupts(CPUState *cpu) > > > @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu) > > > uint64_t syndrome = hvf_exit->exception.syndrome; > > > uint32_t ec = syn_get_ec(syndrome); > > > > > > + qemu_mutex_lock_iothread(); > > > > > > Is there a particular reason you're moving the iothread lock out again > > from the individual bits? I would really like to keep a notion of fast > > path exits. 
> > We still need to lock at least once no matter the exit reason to check > the interrupts so I don't think it's worth it to try and avoid locking > like this. It also makes the implementation easier to reason about and > therefore more likely to be correct. In our implementation we just > stay locked the whole time unless we're in hv_vcpu_run() or pselect(). > But does it leave a small window for a kick loss between qemu_mutex_unlock_iothread() and hv_vcpu_run()/pselect()? For x86 it could lose a kick between them. That was a reason for the sophisticated approach to catch the kick [1] (and related discussions in v1/v2/v3). Unfortunately I can't read ARM assembly yet so I don't know if hv_vcpus_exit() suffers from the same issue as x86 hv_vcpu_interrupt(). 1. https://patchwork.kernel.org/project/qemu-devel/patch/20200729124832.79375-1-r.bolshakov@yadro.com/ Thanks, Roman > > > switch (exit_reason) { > > > case HV_EXIT_REASON_EXCEPTION: > > > /* This is the main one, handle below. */ > > > break; > > > case HV_EXIT_REASON_VTIMER_ACTIVATED: > > > - qemu_mutex_lock_iothread(); > > > current_cpu = cpu; > > > qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1); > > > qemu_mutex_unlock_iothread(); > > > continue; > > > case HV_EXIT_REASON_CANCELED: > > > /* we got kicked, no exit to process */ > > > + qemu_mutex_unlock_iothread(); > > > continue; > > > default: > > > assert(0); > > > @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu) > > > uint32_t srt = (syndrome >> 16) & 0x1f; > > > uint64_t val = 0; > > > > > > - qemu_mutex_lock_iothread(); > > > current_cpu = cpu; > > > > > > DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x " > > > @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu) > > > hvf_set_reg(cpu, srt, val); > > > } > > > > > > - qemu_mutex_unlock_iothread(); > > > - > > > advance_pc = true; > > > break; > > > } > > > @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu) > > > case EC_WFX_TRAP: > > > if (!(syndrome & WFX_IS_WFE) && 
!(cpu->interrupt_request & > > > (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > > > - uint64_t cval, ctl, val, diff, now; > > > + uint64_t cval; > > > > > > - /* Set up a local timer for vtimer if necessary ... */ > > > - r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl); > > > - assert_hvf_ok(r); > > > r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval); > > > assert_hvf_ok(r); > > > > > > - asm volatile("mrs %0, cntvct_el0" : "=r"(val)); > > > - diff = cval - val; > > > - > > > - now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / > > > - gt_cntfrq_period_ns(arm_cpu); > > > - > > > - /* Timer disabled or masked, just wait for long */ > > > - if (!(ctl & 1) || (ctl & 2)) { > > > - diff = (120 * NANOSECONDS_PER_SECOND) / > > > - gt_cntfrq_period_ns(arm_cpu); > > > + int64_t ticks_to_sleep = cval - mach_absolute_time(); > > > + if (ticks_to_sleep < 0) { > > > + break; > > > > > > This will loop at 100% for Windows, which configures the vtimer as > > cval=0 ctl=7, so with IRQ mask bit set. > > Okay, but the 120s is kind of arbitrary so we should just sleep until > we get a signal. That can be done by passing null as the timespec > argument to pselect(). > > > > > > > Alex > > > > > > > } > > > > > > - if (diff < INT64_MAX) { > > > - uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); > > > - struct timespec *ts = &cpu->hvf->ts; > > > - > > > - *ts = (struct timespec){ > > > - .tv_sec = ns / NANOSECONDS_PER_SECOND, > > > - .tv_nsec = ns % NANOSECONDS_PER_SECOND, > > > - }; > > > - > > > - /* > > > - * Waking up easily takes 1ms, don't go to sleep for smaller > > > - * time periods than 2ms. > > > - */ > > > - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { > > > > > > I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to > > return. Without logic like this, super short WFIs will hurt performance > > quite badly. > > I don't think that's accurate. According to this benchmark it's a few > hundred nanoseconds at most. 
> > pcc@pac-mini /tmp> cat pselect.c > #include <signal.h> > #include <sys/select.h> > > int main() { > sigset_t mask, orig_mask; > pthread_sigmask(SIG_SETMASK, 0, &mask); > sigaddset(&mask, SIGUSR1); > pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); > > for (int i = 0; i != 1000000; ++i) { > struct timespec ts = { 0, 1 }; > pselect(0, 0, 0, 0, &ts, &orig_mask); > } > } > pcc@pac-mini /tmp> time ./pselect > > ________________________________________________________ > Executed in 179.87 millis fish external > usr time 77.68 millis 57.00 micros 77.62 millis > sys time 101.37 millis 852.00 micros 100.52 millis > > Besides, all that you're really saving here is the single pselect > call. There are no doubt more expensive syscalls involved in exiting > and entering the VCPU that would dominate here. > > Peter > > > > > > > Alex > > > > > - advance_pc = true; > > > - break; > > > - } > > > - > > > - /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */ > > > - cpu->hvf->sleeping = true; > > > - smp_mb(); > > > - > > > - /* Bail out if we received an IRQ meanwhile */ > > > - if (cpu->thread_kicked || (cpu->interrupt_request & > > > - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > > > - cpu->hvf->sleeping = false; > > > - break; > > > - } > > > - > > > - /* nanosleep returns on signal, so we wake up on kick. */ > > > - nanosleep(ts, NULL); > > > - > > > - /* Out of sleep - either naturally or because of a kick */ > > > - cpu->hvf->sleeping = false; > > > - } > > > + uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz; > > > + uint64_t nanos = > > > + (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) * > > > + 1000000000 / arm_cpu->gt_cntfrq_hz; > > > + struct timespec ts = { seconds, nanos }; > > > + > > > + /* > > > + * Use pselect to sleep so that other threads can IPI us while > > > + * we're sleeping. 
> > > + */ > > > + qatomic_mb_set(&cpu->thread_kicked, false); > > > + qemu_mutex_unlock_iothread(); > > > + pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask); > > > + qemu_mutex_lock_iothread(); > > > > > > advance_pc = true; > > > } > > > break; > > > case EC_AA64_HVC: > > > cpu_synchronize_state(cpu); > > > - qemu_mutex_lock_iothread(); > > > current_cpu = cpu; > > > if (arm_is_psci_call(arm_cpu, EXCP_HVC)) { > > > arm_handle_psci_call(arm_cpu); > > > @@ -562,11 +520,9 @@ int hvf_vcpu_exec(CPUState *cpu) > > > DPRINTF("unknown HVC! %016llx", env->xregs[0]); > > > env->xregs[0] = -1; > > > } > > > - qemu_mutex_unlock_iothread(); > > > break; > > > case EC_AA64_SMC: > > > cpu_synchronize_state(cpu); > > > - qemu_mutex_lock_iothread(); > > > current_cpu = cpu; > > > if (arm_is_psci_call(arm_cpu, EXCP_SMC)) { > > > arm_handle_psci_call(arm_cpu); > > > @@ -575,7 +531,6 @@ int hvf_vcpu_exec(CPUState *cpu) > > > env->xregs[0] = -1; > > > env->pc += 4; > > > } > > > - qemu_mutex_unlock_iothread(); > > > break; > > > default: > > > cpu_synchronize_state(cpu); > > > @@ -594,6 +549,7 @@ int hvf_vcpu_exec(CPUState *cpu) > > > r = hv_vcpu_set_reg(cpu->hvf->fd, HV_REG_PC, pc); > > > assert_hvf_ok(r); > > > } > > > + qemu_mutex_unlock_iothread(); > > > } while (ret == 0); > > > > > > qemu_mutex_lock_iothread();
On Thu, Dec 3, 2020 at 2:12 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote: > > On Tue, Dec 01, 2020 at 10:59:50AM -0800, Peter Collingbourne wrote: > > On Tue, Dec 1, 2020 at 3:16 AM Alexander Graf <agraf@csgraf.de> wrote: > > > > > > Hi Peter, > > > > > > On 01.12.20 09:21, Peter Collingbourne wrote: > > > > Sleep on WFx until the VTIMER is due but allow ourselves to be woken > > > > up on IPI. > > > > > > > > Signed-off-by: Peter Collingbourne <pcc@google.com> > > > > > > > > > Thanks a bunch! > > > > > > > > > > --- > > > > Alexander Graf wrote: > > > >> I would love to take a patch from you here :). I'll still be stuck for a > > > >> while with the sysreg sync rework that Peter asked for before I can look > > > >> at WFI again. > > > > Okay, here's a patch :) It's a relatively straightforward adaptation > > > > of what we have in our fork, which can now boot Android to GUI while > > > > remaining at around 4% CPU when idle. > > > > > > > > I'm not set up to boot a full Linux distribution at the moment so I > > > > tested it on upstream QEMU by running a recent mainline Linux kernel > > > > with a rootfs containing an init program that just does sleep(5) > > > > and verified that the qemu process remains at low CPU usage during > > > > the sleep. This was on top of your v2 plus the last patch of your v1 > > > > since it doesn't look like you have a replacement for that logic yet. 
> > > > > > > > accel/hvf/hvf-cpus.c | 5 +-- > > > > include/sysemu/hvf_int.h | 3 +- > > > > target/arm/hvf/hvf.c | 94 +++++++++++----------------------------- > > > > 3 files changed, 28 insertions(+), 74 deletions(-) > > > > > > > > diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c > > > > index 4360f64671..b2c8fb57f6 100644 > > > > --- a/accel/hvf/hvf-cpus.c > > > > +++ b/accel/hvf/hvf-cpus.c > > > > @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu) > > > > sigact.sa_handler = dummy_signal; > > > > sigaction(SIG_IPI, &sigact, NULL); > > > > > > > > - pthread_sigmask(SIG_BLOCK, NULL, &set); > > > > - sigdelset(&set, SIG_IPI); > > > > - pthread_sigmask(SIG_SETMASK, &set, NULL); > > > > + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask); > > > > + sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI); > > > > > > > > > What will this do to the x86 hvf implementation? We're now not > > > unblocking SIG_IPI again for that, right? > > > > Yes and that was the case before your patch series. 
> > > > > > > > > > #ifdef __aarch64__ > > > > r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); > > > > diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h > > > > index c56baa3ae8..13adf6ea77 100644 > > > > --- a/include/sysemu/hvf_int.h > > > > +++ b/include/sysemu/hvf_int.h > > > > @@ -62,8 +62,7 @@ extern HVFState *hvf_state; > > > > struct hvf_vcpu_state { > > > > uint64_t fd; > > > > void *exit; > > > > - struct timespec ts; > > > > - bool sleeping; > > > > + sigset_t unblock_ipi_mask; > > > > }; > > > > > > > > void assert_hvf_ok(hv_return_t ret); > > > > diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c > > > > index 8fe10966d2..60a361ff38 100644 > > > > --- a/target/arm/hvf/hvf.c > > > > +++ b/target/arm/hvf/hvf.c > > > > @@ -2,6 +2,7 @@ > > > > * QEMU Hypervisor.framework support for Apple Silicon > > > > > > > > * Copyright 2020 Alexander Graf <agraf@csgraf.de> > > > > + * Copyright 2020 Google LLC > > > > * > > > > * This work is licensed under the terms of the GNU GPL, version 2 or later. > > > > * See the COPYING file in the top-level directory. > > > > @@ -18,6 +19,7 @@ > > > > #include "sysemu/hw_accel.h" > > > > > > > > #include <Hypervisor/Hypervisor.h> > > > > +#include <mach/mach_time.h> > > > > > > > > #include "exec/address-spaces.h" > > > > #include "hw/irq.h" > > > > @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) > > > > > > > > void hvf_kick_vcpu_thread(CPUState *cpu) > > > > { > > > > - if (cpu->hvf->sleeping) { > > > > - /* > > > > - * When sleeping, make sure we always send signals. Also, clear the > > > > - * timespec, so that an IPI that arrives between setting hvf->sleeping > > > > - * and the nanosleep syscall still aborts the sleep. 
> > > > - */ > > > > - cpu->thread_kicked = false; > > > > - cpu->hvf->ts = (struct timespec){ }; > > > > - cpus_kick_thread(cpu); > > > > - } else { > > > > - hv_vcpus_exit(&cpu->hvf->fd, 1); > > > > - } > > > > + cpus_kick_thread(cpu); > > > > + hv_vcpus_exit(&cpu->hvf->fd, 1); > > > > > > > > > This means your first WFI will almost always return immediately due to a > > > pending signal, because there probably was an IRQ pending before on the > > > same CPU, no? > > > > That's right. Any approach involving the "sleeping" field would need > > to be implemented carefully to avoid races that may result in missed > > wakeups so for simplicity I just decided to send both kinds of > > wakeups. In particular the approach in the updated patch you sent is > > racy and I'll elaborate more in the reply to that patch. > > > > > > } > > > > > > > > static int hvf_inject_interrupts(CPUState *cpu) > > > > @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu) > > > > uint64_t syndrome = hvf_exit->exception.syndrome; > > > > uint32_t ec = syn_get_ec(syndrome); > > > > > > > > + qemu_mutex_lock_iothread(); > > > > > > > > > Is there a particular reason you're moving the iothread lock out again > > > from the individual bits? I would really like to keep a notion of fast > > > path exits. > > > > We still need to lock at least once no matter the exit reason to check > > the interrupts so I don't think it's worth it to try and avoid locking > > like this. It also makes the implementation easier to reason about and > > therefore more likely to be correct. In our implementation we just > > stay locked the whole time unless we're in hv_vcpu_run() or pselect(). > > > > But does it leaves a small window for a kick loss between > qemu_mutex_unlock_iothread() and hv_vcpu_run()/pselect()? > > For x86 it could lose a kick between them. That was a reason for the > sophisticated approach to catch the kick [1] (and related discussions in > v1/v2/v3). 
Unfortunately I can't read ARM assembly yet so I don't if > hv_vcpus_exit() suffers from the same issue as x86 hv_vcpu_interrupt(). > > 1. https://patchwork.kernel.org/project/qemu-devel/patch/20200729124832.79375-1-r.bolshakov@yadro.com/ I addressed pselect() in my other reply. It isn't on the website but the hv_vcpu.h header says this about hv_vcpus_exit(): * @discussion * If a vcpu is not running, the next time hv_vcpu_run is called for the corresponding * vcpu, it will return immediately without entering the guest. So at least as documented I think we are okay. Peter > > Thanks, > Roman > > > > > switch (exit_reason) { > > > > case HV_EXIT_REASON_EXCEPTION: > > > > /* This is the main one, handle below. */ > > > > break; > > > > case HV_EXIT_REASON_VTIMER_ACTIVATED: > > > > - qemu_mutex_lock_iothread(); > > > > current_cpu = cpu; > > > > qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1); > > > > qemu_mutex_unlock_iothread(); > > > > continue; > > > > case HV_EXIT_REASON_CANCELED: > > > > /* we got kicked, no exit to process */ > > > > + qemu_mutex_unlock_iothread(); > > > > continue; > > > > default: > > > > assert(0); > > > > @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu) > > > > uint32_t srt = (syndrome >> 16) & 0x1f; > > > > uint64_t val = 0; > > > > > > > > - qemu_mutex_lock_iothread(); > > > > current_cpu = cpu; > > > > > > > > DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x " > > > > @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu) > > > > hvf_set_reg(cpu, srt, val); > > > > } > > > > > > > > - qemu_mutex_unlock_iothread(); > > > > - > > > > advance_pc = true; > > > > break; > > > > } > > > > @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu) > > > > case EC_WFX_TRAP: > > > > if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & > > > > (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > > > > - uint64_t cval, ctl, val, diff, now; > > > > + uint64_t cval; > > > > > > > > - /* Set up a local timer for vtimer if 
necessary ... */ > > > > - r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl); > > > > - assert_hvf_ok(r); > > > > r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval); > > > > assert_hvf_ok(r); > > > > > > > > - asm volatile("mrs %0, cntvct_el0" : "=r"(val)); > > > > - diff = cval - val; > > > > - > > > > - now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / > > > > - gt_cntfrq_period_ns(arm_cpu); > > > > - > > > > - /* Timer disabled or masked, just wait for long */ > > > > - if (!(ctl & 1) || (ctl & 2)) { > > > > - diff = (120 * NANOSECONDS_PER_SECOND) / > > > > - gt_cntfrq_period_ns(arm_cpu); > > > > + int64_t ticks_to_sleep = cval - mach_absolute_time(); > > > > + if (ticks_to_sleep < 0) { > > > > + break; > > > > > > > > > This will loop at 100% for Windows, which configures the vtimer as > > > cval=0 ctl=7, so with IRQ mask bit set. > > > > Okay, but the 120s is kind of arbitrary so we should just sleep until > > we get a signal. That can be done by passing null as the timespec > > argument to pselect(). > > > > > > > > > > > Alex > > > > > > > > > > } > > > > > > > > - if (diff < INT64_MAX) { > > > > - uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); > > > > - struct timespec *ts = &cpu->hvf->ts; > > > > - > > > > - *ts = (struct timespec){ > > > > - .tv_sec = ns / NANOSECONDS_PER_SECOND, > > > > - .tv_nsec = ns % NANOSECONDS_PER_SECOND, > > > > - }; > > > > - > > > > - /* > > > > - * Waking up easily takes 1ms, don't go to sleep for smaller > > > > - * time periods than 2ms. > > > > - */ > > > > - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { > > > > > > > > > I put this logic here on purpose. A pselect(1 ns) easily takes 1-2ms to > > > return. Without logic like this, super short WFIs will hurt performance > > > quite badly. > > > > I don't think that's accurate. According to this benchmark it's a few > > hundred nanoseconds at most. 
> > > > pcc@pac-mini /tmp> cat pselect.c > > #include <signal.h> > > #include <sys/select.h> > > > > int main() { > > sigset_t mask, orig_mask; > > pthread_sigmask(SIG_SETMASK, 0, &mask); > > sigaddset(&mask, SIGUSR1); > > pthread_sigmask(SIG_SETMASK, &mask, &orig_mask); > > > > for (int i = 0; i != 1000000; ++i) { > > struct timespec ts = { 0, 1 }; > > pselect(0, 0, 0, 0, &ts, &orig_mask); > > } > > } > > pcc@pac-mini /tmp> time ./pselect > > > > ________________________________________________________ > > Executed in 179.87 millis fish external > > usr time 77.68 millis 57.00 micros 77.62 millis > > sys time 101.37 millis 852.00 micros 100.52 millis > > > > Besides, all that you're really saving here is the single pselect > > call. There are no doubt more expensive syscalls involved in exiting > > and entering the VCPU that would dominate here. > > > > Peter > > > > > > > > > > > Alex > > > > > > > - advance_pc = true; > > > > - break; > > > > - } > > > > - > > > > - /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */ > > > > - cpu->hvf->sleeping = true; > > > > - smp_mb(); > > > > - > > > > - /* Bail out if we received an IRQ meanwhile */ > > > > - if (cpu->thread_kicked || (cpu->interrupt_request & > > > > - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { > > > > - cpu->hvf->sleeping = false; > > > > - break; > > > > - } > > > > - > > > > - /* nanosleep returns on signal, so we wake up on kick. */ > > > > - nanosleep(ts, NULL); > > > > - > > > > - /* Out of sleep - either naturally or because of a kick */ > > > > - cpu->hvf->sleeping = false; > > > > - } > > > > + uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz; > > > > + uint64_t nanos = > > > > + (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) * > > > > + 1000000000 / arm_cpu->gt_cntfrq_hz; > > > > + struct timespec ts = { seconds, nanos }; > > > > + > > > > + /* > > > > + * Use pselect to sleep so that other threads can IPI us while > > > > + * we're sleeping. 
> > > > + */ > > > > + qatomic_mb_set(&cpu->thread_kicked, false); > > > > + qemu_mutex_unlock_iothread(); > > > > + pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask); > > > > + qemu_mutex_lock_iothread(); > > > > > > > > advance_pc = true; > > > > } > > > > break; > > > > case EC_AA64_HVC: > > > > cpu_synchronize_state(cpu); > > > > - qemu_mutex_lock_iothread(); > > > > current_cpu = cpu; > > > > if (arm_is_psci_call(arm_cpu, EXCP_HVC)) { > > > > arm_handle_psci_call(arm_cpu); > > > > @@ -562,11 +520,9 @@ int hvf_vcpu_exec(CPUState *cpu) > > > > DPRINTF("unknown HVC! %016llx", env->xregs[0]); > > > > env->xregs[0] = -1; > > > > } > > > > - qemu_mutex_unlock_iothread(); > > > > break; > > > > case EC_AA64_SMC: > > > > cpu_synchronize_state(cpu); > > > > - qemu_mutex_lock_iothread(); > > > > current_cpu = cpu; > > > > if (arm_is_psci_call(arm_cpu, EXCP_SMC)) { > > > > arm_handle_psci_call(arm_cpu); > > > > @@ -575,7 +531,6 @@ int hvf_vcpu_exec(CPUState *cpu) > > > > env->xregs[0] = -1; > > > > env->pc += 4; > > > > } > > > > - qemu_mutex_unlock_iothread(); > > > > break; > > > > default: > > > > cpu_synchronize_state(cpu); > > > > @@ -594,6 +549,7 @@ int hvf_vcpu_exec(CPUState *cpu) > > > > r = hv_vcpu_set_reg(cpu->hvf->fd, HV_REG_PC, pc); > > > > assert_hvf_ok(r); > > > > } > > > > + qemu_mutex_unlock_iothread(); > > > > } while (ret == 0); > > > > > > > > qemu_mutex_lock_iothread();
diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c index 4360f64671..b2c8fb57f6 100644 --- a/accel/hvf/hvf-cpus.c +++ b/accel/hvf/hvf-cpus.c @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu) sigact.sa_handler = dummy_signal; sigaction(SIG_IPI, &sigact, NULL); - pthread_sigmask(SIG_BLOCK, NULL, &set); - sigdelset(&set, SIG_IPI); - pthread_sigmask(SIG_SETMASK, &set, NULL); + pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask); + sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI); #ifdef __aarch64__ r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL); diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h index c56baa3ae8..13adf6ea77 100644 --- a/include/sysemu/hvf_int.h +++ b/include/sysemu/hvf_int.h @@ -62,8 +62,7 @@ extern HVFState *hvf_state; struct hvf_vcpu_state { uint64_t fd; void *exit; - struct timespec ts; - bool sleeping; + sigset_t unblock_ipi_mask; }; void assert_hvf_ok(hv_return_t ret); diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c index 8fe10966d2..60a361ff38 100644 --- a/target/arm/hvf/hvf.c +++ b/target/arm/hvf/hvf.c @@ -2,6 +2,7 @@ * QEMU Hypervisor.framework support for Apple Silicon * Copyright 2020 Alexander Graf <agraf@csgraf.de> + * Copyright 2020 Google LLC * * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. @@ -18,6 +19,7 @@ #include "sysemu/hw_accel.h" #include <Hypervisor/Hypervisor.h> +#include <mach/mach_time.h> #include "exec/address-spaces.h" #include "hw/irq.h" @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu) void hvf_kick_vcpu_thread(CPUState *cpu) { - if (cpu->hvf->sleeping) { - /* - * When sleeping, make sure we always send signals. Also, clear the - * timespec, so that an IPI that arrives between setting hvf->sleeping - * and the nanosleep syscall still aborts the sleep. 
- */ - cpu->thread_kicked = false; - cpu->hvf->ts = (struct timespec){ }; - cpus_kick_thread(cpu); - } else { - hv_vcpus_exit(&cpu->hvf->fd, 1); - } + cpus_kick_thread(cpu); + hv_vcpus_exit(&cpu->hvf->fd, 1); } static int hvf_inject_interrupts(CPUState *cpu) @@ -385,18 +377,19 @@ int hvf_vcpu_exec(CPUState *cpu) uint64_t syndrome = hvf_exit->exception.syndrome; uint32_t ec = syn_get_ec(syndrome); + qemu_mutex_lock_iothread(); switch (exit_reason) { case HV_EXIT_REASON_EXCEPTION: /* This is the main one, handle below. */ break; case HV_EXIT_REASON_VTIMER_ACTIVATED: - qemu_mutex_lock_iothread(); current_cpu = cpu; qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1); qemu_mutex_unlock_iothread(); continue; case HV_EXIT_REASON_CANCELED: /* we got kicked, no exit to process */ + qemu_mutex_unlock_iothread(); continue; default: assert(0); @@ -413,7 +406,6 @@ int hvf_vcpu_exec(CPUState *cpu) uint32_t srt = (syndrome >> 16) & 0x1f; uint64_t val = 0; - qemu_mutex_lock_iothread(); current_cpu = cpu; DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x " @@ -446,8 +438,6 @@ int hvf_vcpu_exec(CPUState *cpu) hvf_set_reg(cpu, srt, val); } - qemu_mutex_unlock_iothread(); - advance_pc = true; break; } @@ -493,68 +483,36 @@ int hvf_vcpu_exec(CPUState *cpu) case EC_WFX_TRAP: if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { - uint64_t cval, ctl, val, diff, now; + uint64_t cval; - /* Set up a local timer for vtimer if necessary ... 
*/ - r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl); - assert_hvf_ok(r); r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval); assert_hvf_ok(r); - asm volatile("mrs %0, cntvct_el0" : "=r"(val)); - diff = cval - val; - - now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / - gt_cntfrq_period_ns(arm_cpu); - - /* Timer disabled or masked, just wait for long */ - if (!(ctl & 1) || (ctl & 2)) { - diff = (120 * NANOSECONDS_PER_SECOND) / - gt_cntfrq_period_ns(arm_cpu); + int64_t ticks_to_sleep = cval - mach_absolute_time(); + if (ticks_to_sleep < 0) { + break; } - if (diff < INT64_MAX) { - uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu); - struct timespec *ts = &cpu->hvf->ts; - - *ts = (struct timespec){ - .tv_sec = ns / NANOSECONDS_PER_SECOND, - .tv_nsec = ns % NANOSECONDS_PER_SECOND, - }; - - /* - * Waking up easily takes 1ms, don't go to sleep for smaller - * time periods than 2ms. - */ - if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) { - advance_pc = true; - break; - } - - /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */ - cpu->hvf->sleeping = true; - smp_mb(); - - /* Bail out if we received an IRQ meanwhile */ - if (cpu->thread_kicked || (cpu->interrupt_request & - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) { - cpu->hvf->sleeping = false; - break; - } - - /* nanosleep returns on signal, so we wake up on kick. */ - nanosleep(ts, NULL); - - /* Out of sleep - either naturally or because of a kick */ - cpu->hvf->sleeping = false; - } + uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz; + uint64_t nanos = + (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) * + 1000000000 / arm_cpu->gt_cntfrq_hz; + struct timespec ts = { seconds, nanos }; + + /* + * Use pselect to sleep so that other threads can IPI us while + * we're sleeping. 
+ */ + qatomic_mb_set(&cpu->thread_kicked, false); + qemu_mutex_unlock_iothread(); + pselect(0, 0, 0, 0, &ts, &cpu->hvf->unblock_ipi_mask); + qemu_mutex_lock_iothread(); advance_pc = true; } break; case EC_AA64_HVC: cpu_synchronize_state(cpu); - qemu_mutex_lock_iothread(); current_cpu = cpu; if (arm_is_psci_call(arm_cpu, EXCP_HVC)) { arm_handle_psci_call(arm_cpu); @@ -562,11 +520,9 @@ int hvf_vcpu_exec(CPUState *cpu) DPRINTF("unknown HVC! %016llx", env->xregs[0]); env->xregs[0] = -1; } - qemu_mutex_unlock_iothread(); break; case EC_AA64_SMC: cpu_synchronize_state(cpu); - qemu_mutex_lock_iothread(); current_cpu = cpu; if (arm_is_psci_call(arm_cpu, EXCP_SMC)) { arm_handle_psci_call(arm_cpu); @@ -575,7 +531,6 @@ int hvf_vcpu_exec(CPUState *cpu) env->xregs[0] = -1; env->pc += 4; } - qemu_mutex_unlock_iothread(); break; default: cpu_synchronize_state(cpu); @@ -594,6 +549,7 @@ int hvf_vcpu_exec(CPUState *cpu) r = hv_vcpu_set_reg(cpu->hvf->fd, HV_REG_PC, pc); assert_hvf_ok(r); } + qemu_mutex_unlock_iothread(); } while (ret == 0); qemu_mutex_lock_iothread();