diff mbox

[RFC] arm64: KVM: remove fpsimd save/restore from the world switch

Message ID 1428598439-5217-1-git-send-email-marc.zyngier@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Marc Zyngier April 9, 2015, 4:53 p.m. UTC
The world switch spends quite some time dealing with the FP/SIMD
registers, as the state is quite sizeable (32 128bit registers,
plus some crumbs on the side). We save/restore them on each
entry/exit, so that both the host and the guest always see
the state they expect.

But let's face it: the host kernel doesn't care. It is the host
userspace that actually cares about FP. An obvious improvement is
to remove the save/restore from the world switch, and only perform
it when we're about to enter/exit the guest (by plugging it into
vcpu_load/vcpu_put). The effect is pretty spectacular when running
hackbench (which is the only benchmark worth looking at):

Without this patch:

	Running with 50*40 (== 2000) tasks.
	Time: 36.756
	Running with 50*40 (== 2000) tasks.
	Time: 36.679
	Running with 50*40 (== 2000) tasks.
	Time: 36.699

With this patch:

	Running with 50*40 (== 2000) tasks.
	Time: 30.947
	Running with 50*40 (== 2000) tasks.
	Time: 30.868
	Running with 50*40 (== 2000) tasks.
	Time: 30.961

This is on a HiKey board (8*A53), with a 4 vcpu guest.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm/include/asm/kvm_host.h   |  3 +++
 arch/arm/kvm/arm.c                |  2 ++
 arch/arm64/include/asm/kvm_asm.h  |  4 ++++
 arch/arm64/include/asm/kvm_host.h |  3 +++
 arch/arm64/kvm/Makefile           |  1 +
 arch/arm64/kvm/fpsimd.S           | 39 ++++++++++++++++++++++++++++++++++++
 arch/arm64/kvm/handle_fpsimd.c    | 42 +++++++++++++++++++++++++++++++++++++++
 arch/arm64/kvm/hyp.S              | 27 -------------------------
 8 files changed, 94 insertions(+), 27 deletions(-)
 create mode 100644 arch/arm64/kvm/fpsimd.S
 create mode 100644 arch/arm64/kvm/handle_fpsimd.c

Comments

Paolo Bonzini April 10, 2015, 8:41 a.m. UTC | #1
On 09/04/2015 18:53, Marc Zyngier wrote:
> The world switch spends quite some time dealing with the FP/SIMD
> registers, as the state is quite sizeable (32 128bit registers,
> plus some crumbs on the side). We save/restore them on each
> entry/exit, so that both the host and the guest always see
> the state they expect.
> 
> But let's face it: the host kernel doesn't care. It is the host
> userspace that actually cares about FP. An obvious improvement is
> to remove the save/restore from the world switch, and only perform
> it when we're about to enter/exit the guest (by plugging it into
> vcpu_load/vcpu_put).

Good idea!  Does ARM have a way to do "lazy" save/restore?  On x86 we
know if the FP/SIMD state was in use on the host, and we can force an
exit the first time the guest uses the FP/SIMD.

On ARM, it wouldn't be a problem to handle this kind of exit straight in
EL2 (similar to the hack Ard used when playing with memory attributes).

Paolo

 The effect is pretty spectacular when running
> hackbench (which is the only benchmark worth looking at):
> 
> Without this patch:
> 
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 36.756
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 36.679
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 36.699
> 
> With this patch:
> 
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 30.947
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 30.868
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 30.961
> 
> This is on a HiKey board (8*A53), with a 4 vcpu guest.
> 
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  arch/arm/include/asm/kvm_host.h   |  3 +++
>  arch/arm/kvm/arm.c                |  2 ++
>  arch/arm64/include/asm/kvm_asm.h  |  4 ++++
>  arch/arm64/include/asm/kvm_host.h |  3 +++
>  arch/arm64/kvm/Makefile           |  1 +
>  arch/arm64/kvm/fpsimd.S           | 39 ++++++++++++++++++++++++++++++++++++
>  arch/arm64/kvm/handle_fpsimd.c    | 42 +++++++++++++++++++++++++++++++++++++++
>  arch/arm64/kvm/hyp.S              | 27 -------------------------
>  8 files changed, 94 insertions(+), 27 deletions(-)
>  create mode 100644 arch/arm64/kvm/fpsimd.S
>  create mode 100644 arch/arm64/kvm/handle_fpsimd.c
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index d71607c..65cf1d1 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -226,6 +226,9 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic)
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +static inline void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu) {}
> +static inline void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu) {}
> +
>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>  
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 6f53645..ff1213c 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -287,6 +287,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  	vcpu->cpu = cpu;
>  	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
>  
> +	kvm_fpsimd_flush_hwstate(vcpu);
>  	kvm_arm_set_running_vcpu(vcpu);
>  }
>  
> @@ -299,6 +300,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  	 */
>  	vcpu->cpu = -1;
>  
> +	kvm_fpsimd_sync_hwstate(vcpu);
>  	kvm_arm_set_running_vcpu(NULL);
>  }
>  
> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> index 4f7310f..eafb0c3 100644
> --- a/arch/arm64/include/asm/kvm_asm.h
> +++ b/arch/arm64/include/asm/kvm_asm.h
> @@ -137,6 +137,10 @@ extern char __restore_vgic_v2_state[];
>  extern char __save_vgic_v3_state[];
>  extern char __restore_vgic_v3_state[];
>  
> +struct kvm_cpu_context;
> +extern void __kvm_save_fpsimd(struct kvm_cpu_context *);
> +extern void __kvm_restore_fpsimd(struct kvm_cpu_context *);
> +
>  #endif
>  
>  #endif /* __ARM_KVM_ASM_H__ */
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index f0f58c9..2b968e5 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -201,6 +201,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu);
> +void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu);
> +
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
>  
>  static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index d5904f8..6d9c2b7 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -18,6 +18,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
> +kvm-$(CONFIG_KVM_ARM_HOST) += fpsimd.o handle_fpsimd.o
>  
>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
> diff --git a/arch/arm64/kvm/fpsimd.S b/arch/arm64/kvm/fpsimd.S
> new file mode 100644
> index 0000000..458a1a7
> --- /dev/null
> +++ b/arch/arm64/kvm/fpsimd.S
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (C) 2012,2013 - ARM Ltd
> + * Author: Marc Zyngier <marc.zyngier@arm.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/linkage.h>
> +
> +#include <asm/asm-offsets.h>
> +#include <asm/fpsimdmacros.h>
> +
> +#define CPU_GP_REG_OFFSET(x)	(CPU_GP_REGS + x)
> +
> +ENTRY(__kvm_save_fpsimd)
> +	// x0: cpu context address
> +	// x1, x2: tmp regs
> +	add	x1, x0, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
> +	fpsimd_save x1, 2
> +	ret
> +END(__kvm_save_fpsimd)
> +
> +ENTRY(__kvm_restore_fpsimd)
> +	// x0: cpu context address
> +	// x1, x2: tmp regs
> +	add	x1, x0, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
> +	fpsimd_restore x1, 2
> +	ret
> +END(__kvm_restore_fpsimd)
> diff --git a/arch/arm64/kvm/handle_fpsimd.c b/arch/arm64/kvm/handle_fpsimd.c
> new file mode 100644
> index 0000000..3d34cc9
> --- /dev/null
> +++ b/arch/arm64/kvm/handle_fpsimd.c
> @@ -0,0 +1,42 @@
> +/*
> + * Copyright (C) 2015 - ARM Ltd
> + * Author: Marc Zyngier <marc.zyngier@arm.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/kvm_host.h>
> +
> +void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +
> +	__kvm_save_fpsimd(vcpu->arch.host_cpu_context);
> +	__kvm_restore_fpsimd(&vcpu->arch.ctxt);
> +
> +	local_irq_restore(flags);
> +}
> +
> +void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +
> +	__kvm_save_fpsimd(&vcpu->arch.ctxt);
> +	__kvm_restore_fpsimd(vcpu->arch.host_cpu_context);
> +
> +	local_irq_restore(flags);
> +}
> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
> index 5befd01..425c1ad 100644
> --- a/arch/arm64/kvm/hyp.S
> +++ b/arch/arm64/kvm/hyp.S
> @@ -21,7 +21,6 @@
>  #include <asm/assembler.h>
>  #include <asm/debug-monitors.h>
>  #include <asm/esr.h>
> -#include <asm/fpsimdmacros.h>
>  #include <asm/kvm.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_asm.h>
> @@ -102,20 +101,6 @@
>  	restore_common_regs
>  .endm
>  
> -.macro save_fpsimd
> -	// x2: cpu context address
> -	// x3, x4: tmp regs
> -	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
> -	fpsimd_save x3, 4
> -.endm
> -
> -.macro restore_fpsimd
> -	// x2: cpu context address
> -	// x3, x4: tmp regs
> -	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
> -	fpsimd_restore x3, 4
> -.endm
> -
>  .macro save_guest_regs
>  	// x0 is the vcpu address
>  	// x1 is the return code, do not corrupt!
> @@ -904,14 +889,6 @@ __restore_debug:
>  	restore_debug
>  	ret
>  
> -__save_fpsimd:
> -	save_fpsimd
> -	ret
> -
> -__restore_fpsimd:
> -	restore_fpsimd
> -	ret
> -
>  /*
>   * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
>   *
> @@ -932,7 +909,6 @@ ENTRY(__kvm_vcpu_run)
>  	kern_hyp_va x2
>  
>  	save_host_regs
> -	bl __save_fpsimd
>  	bl __save_sysregs
>  
>  	compute_debug_state 1f
> @@ -948,7 +924,6 @@ ENTRY(__kvm_vcpu_run)
>  	add	x2, x0, #VCPU_CONTEXT
>  
>  	bl __restore_sysregs
> -	bl __restore_fpsimd
>  
>  	skip_debug_state x3, 1f
>  	bl	__restore_debug
> @@ -967,7 +942,6 @@ __kvm_vcpu_return:
>  	add	x2, x0, #VCPU_CONTEXT
>  
>  	save_guest_regs
> -	bl __save_fpsimd
>  	bl __save_sysregs
>  
>  	skip_debug_state x3, 1f
> @@ -986,7 +960,6 @@ __kvm_vcpu_return:
>  	kern_hyp_va x2
>  
>  	bl __restore_sysregs
> -	bl __restore_fpsimd
>  
>  	skip_debug_state x3, 1f
>  	// Clear the dirty flag for the next run, as all the state has
>
Marc Zyngier April 10, 2015, 9:11 a.m. UTC | #2
On 10/04/15 09:41, Paolo Bonzini wrote:

Hi Paolo,

> On 09/04/2015 18:53, Marc Zyngier wrote:
>> The world switch spends quite some time dealing with the FP/SIMD
>> registers, as the state is quite sizeable (32 128bit registers,
>> plus some crumbs on the side). We save/restore them on each
>> entry/exit, so that both the host and the guest always see
>> the state they expect.
>>
>> But let's face it: the host kernel doesn't care. It is the host
>> userspace that actually cares about FP. An obvious improvement is
>> to remove the save/restore from the world switch, and only perform
>> it when we're about to enter/exit the guest (by plugging it into
>> vcpu_load/vcpu_put).
> 
> Good idea!  Does ARM have a way to do "lazy" save/restore?  On x86 we
> know if the FP/SIMD state was in use on the host, and we can force an
> exit the first time the guest uses the FP/SIMD.
> 
> On ARM, it wouldn't be a problem to handle this kind of exit straight in
> EL2 (similar to the hack Ard used when playing with memory attributes).

We already do this on the 32bit port, and it works fine.

I did some experimentations on arm64 a long while ago (see the
kvm-arm64/lazy-fp branch in my tree), and it wasn't that great,
apparently because aarch64 userspace tends to be much more FP happy than
aarch32, but I'm not completely sure about it. Also this is from a time
when I didn't have much HW to play with...

Maybe I should resurrect it and compare it to what this patch does, just
as a comparison point.

Thanks,

	M.
Christoffer Dall April 10, 2015, 9:12 a.m. UTC | #3
On Thu, Apr 09, 2015 at 05:53:59PM +0100, Marc Zyngier wrote:
> The world switch spends quite some time dealing with the FP/SIMD
> registers, as the state is quite sizeable (32 128bit registers,
> plus some crumbs on the side). We save/restore them on each
> entry/exit, so that both the host and the guest always see
> the state they expect.
> 
> But let's face it: the host kernel doesn't care. It is the host
> userspace that actually cares about FP. An obvious improvement is
> to remove the save/restore from the world switch, and only perform
> it when we're about to enter/exit the guest (by plugging it into
> vcpu_load/vcpu_put). The effect is pretty spectacular when running
> hackbench (which is the only benchmark worth looking at):
> 
so the kernel never uses fp/simd registers for stuff like memcopies
etc.?

Can we also make a similar change for ARM on the 32-bit side?

-Christoffer
Paolo Bonzini April 10, 2015, 9:13 a.m. UTC | #4
On 10/04/2015 11:11, Marc Zyngier wrote:
>> Good idea!  Does ARM have a way to do "lazy" save/restore?  On x86 we
>> know if the FP/SIMD state was in use on the host, and we can force an
>> exit the first time the guest uses the FP/SIMD.
>>
>> On ARM, it wouldn't be a problem to handle this kind of exit straight in
>> EL2 (similar to the hack Ard used when playing with memory attributes).
> 
> We already do this on the 32bit port, and it works fine.
> 
> I did some experimentations on arm64 a long while ago (see the
> kvm-arm64/lazy-fp branch in my tree), and it wasn't that great,
> apparently because aarch64 userspace tends to be much more FP happy than
> aarch32, but I'm not completely sure about it. Also this is from a time
> when I didn't have much HW to play with...
> 
> Maybe I should resurrect it and compare it to what this patch does, just
> as a comparison point.

This patch looks like very nice low-hanging fruit anyway.  Would you
like to have it in 4.1?

Paolo
Marc Zyngier April 10, 2015, 9:36 a.m. UTC | #5
On 10/04/15 10:12, Christoffer Dall wrote:
> On Thu, Apr 09, 2015 at 05:53:59PM +0100, Marc Zyngier wrote:
>> The world switch spends quite some time dealing with the FP/SIMD
>> registers, as the state is quite sizeable (32 128bit registers,
>> plus some crumbs on the side). We save/restore them on each
>> entry/exit, so that both the host and the guest always see
>> the state they expect.
>>
>> But let's face it: the host kernel doesn't care. It is the host
>> userspace that actually cares about FP. An obvious improvement is
>> to remove the save/restore from the world switch, and only perform
>> it when we're about to enter/exit the guest (by plugging it into
>> vcpu_load/vcpu_put). The effect is pretty spectacular when running
>> hackbench (which is the only benchmark worth looking at):
>>
> so the kernel never uses fp/simd registers for stuff like memcopies
> etc.?

It can, but wraps it with kernel_neon_begin/kernel_neon_end, which does
its own save-restore. And when we perform our own save/restore, we do it
with interrupts disabled so that the kernel cannot change the state
during that time.

> Can we also make a similar change for ARM on the 32-bit side?

We could, but it is worth evaluating how much better this is compared to
what we have already.

Thanks,

	M.
Marc Zyngier April 10, 2015, 9:40 a.m. UTC | #6
On 10/04/15 10:13, Paolo Bonzini wrote:
> 
> 
> On 10/04/2015 11:11, Marc Zyngier wrote:
>>> Good idea!  Does ARM have a way to do "lazy" save/restore?  On x86 we
>>> know if the FP/SIMD state was in use on the host, and we can force an
>>> exit the first time the guest uses the FP/SIMD.
>>>
>>> On ARM, it wouldn't be a problem to handle this kind of exit straight in
>>> EL2 (similar to the hack Ard used when playing with memory attributes).
>>
>> We already do this on the 32bit port, and it works fine.
>>
>> I did some experimentations on arm64 a long while ago (see the
>> kvm-arm64/lazy-fp branch in my tree), and it wasn't that great,
>> apparently because aarch64 userspace tends to be much more FP happy than
>> aarch32, but I'm not completely sure about it. Also this is from a time
>> when I didn't have much HW to play with...
>>
>> Maybe I should resurrect it and compare it to what this patch does, just
>> as a comparison point.
> 
> This patch looks like very nice low-hanging fruit anyway.  Would you
> like to have it in 4.1?

It is a bit "fresh off the compiler", hence the RFC tag. I'm going to
run some tests on it during the day, using slightly less braindead
hardware (the HiKey board is nice because of its form factor, but that's
the only nice thing about it...).

Once I'm confident it doesn't introduce any regression, it will be worth
considering.

Thanks,

	M.
Peter Maydell April 10, 2015, 10:01 a.m. UTC | #7
On 10 April 2015 at 10:11, Marc Zyngier <marc.zyngier@arm.com> wrote:
> I did some experimentations on arm64 a long while ago (see the
> kvm-arm64/lazy-fp branch in my tree), and it wasn't that great,
> apparently because aarch64 userspace tends to be much more FP happy than
> aarch32, but I'm not completely sure about it.

Was that with the buggy version of the toolchain that had
incorrect costs and so was much more inclined to spill
integer values into the FPU than it should have been?
It might be worth retesting with a freshly compiled
userspace some day...

-- PMM
Marc Zyngier April 10, 2015, 10:08 a.m. UTC | #8
On 10/04/15 11:01, Peter Maydell wrote:
> On 10 April 2015 at 10:11, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> I did some experimentations on arm64 a long while ago (see the
>> kvm-arm64/lazy-fp branch in my tree), and it wasn't that great,
>> apparently because aarch64 userspace tends to be much more FP happy than
>> aarch32, but I'm not completely sure about it.
> 
> Was that with the buggy version of the toolchain that had
> incorrect costs and so was much more inclined to spill
> integer values into the FPU than it should have been?
> It might be worth retesting with a freshly compiled
> userspace some day...

It definitely was with a very old toolchain and an even older userspace.
I'll respin the branch when I get some thime.

Thanks,

	M.
Catalin Marinas April 10, 2015, 10:20 a.m. UTC | #9
On Fri, Apr 10, 2015 at 11:01:36AM +0100, Peter Maydell wrote:
> On 10 April 2015 at 10:11, Marc Zyngier <marc.zyngier@arm.com> wrote:
> > I did some experimentations on arm64 a long while ago (see the
> > kvm-arm64/lazy-fp branch in my tree), and it wasn't that great,
> > apparently because aarch64 userspace tends to be much more FP happy than
> > aarch32, but I'm not completely sure about it.
> 
> Was that with the buggy version of the toolchain that had
> incorrect costs and so was much more inclined to spill
> integer values into the FPU than it should have been?
> It might be worth retesting with a freshly compiled
> userspace some day...

It really depends on what you mean by userspace (which benchmarks you
use). If for example you only care about Javascript heavy web
applications, all numbers are 64-bit floats. But I think it's worth
investigating.

We could also make it smarter and mark some tasks as FP heavy for a
limited time to avoid faults.
Christoffer Dall April 13, 2015, 12:57 p.m. UTC | #10
On Thu, Apr 09, 2015 at 05:53:59PM +0100, Marc Zyngier wrote:
> The world switch spends quite some time dealing with the FP/SIMD
> registers, as the state is quite sizeable (32 128bit registers,
> plus some crumbs on the side). We save/restore them on each
> entry/exit, so that both the host and the guest always see
> the state they expect.
> 
> But let's face it: the host kernel doesn't care. It is the host
> userspace that actually cares about FP. An obvious improvement is
> to remove the save/restore from the world switch, and only perform
> it when we're about to enter/exit the guest (by plugging it into
> vcpu_load/vcpu_put). The effect is pretty spectacular when running
> hackbench (which is the only benchmark worth looking at):
> 
> Without this patch:
> 
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 36.756
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 36.679
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 36.699
> 
> With this patch:
> 
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 30.947
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 30.868
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 30.961
> 
> This is on a HiKey board (8*A53), with a 4 vcpu guest.

cool.  Based on stats from kvm-unit-tests on A57 we should reduce the
overall world-switch cost (in the best cases, caches, etc.) with ~8.5%,
but this is even better and we are doing slightly more work than
context-switching here, so I'm guessing factoring in potential extra
cache misses, it can be this good.

However, on XGene with Ubuntu 14.04 Trusty, I get the following (do not
compare to Marc's results, I may be using different kernel settings and
different payload size):

Without the patch:

	Running with 50*40 (== 2000) tasks.
	Time: 15.970
	Running with 50*40 (== 2000) tasks.
	Time: 15.963
	Running with 50*40 (== 2000) tasks.
	Time: 15.875


With the patch:

	Running with 50*40 (== 2000) tasks.
	Time: 16.768:
	Running with 50*40 (== 2000) tasks.
	Time: 14.865
	Running with 50*40 (== 2000) tasks.
	Time: 14.880

On an HP Moonshot server I ran a number of other benchmarks and got
similarly boring results.

Comments on the patch itself below:

> 
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  arch/arm/include/asm/kvm_host.h   |  3 +++
>  arch/arm/kvm/arm.c                |  2 ++
>  arch/arm64/include/asm/kvm_asm.h  |  4 ++++
>  arch/arm64/include/asm/kvm_host.h |  3 +++
>  arch/arm64/kvm/Makefile           |  1 +
>  arch/arm64/kvm/fpsimd.S           | 39 ++++++++++++++++++++++++++++++++++++
>  arch/arm64/kvm/handle_fpsimd.c    | 42 +++++++++++++++++++++++++++++++++++++++
>  arch/arm64/kvm/hyp.S              | 27 -------------------------
>  8 files changed, 94 insertions(+), 27 deletions(-)
>  create mode 100644 arch/arm64/kvm/fpsimd.S
>  create mode 100644 arch/arm64/kvm/handle_fpsimd.c
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index d71607c..65cf1d1 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -226,6 +226,9 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic)
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +static inline void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu) {}
> +static inline void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu) {}
> +
>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>  
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 6f53645..ff1213c 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -287,6 +287,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  	vcpu->cpu = cpu;
>  	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
>  
> +	kvm_fpsimd_flush_hwstate(vcpu);

not sure about the flus/sync terminology here, because we're not really
flushing a software model to hardware state - we're doing both in every
step.

How about:

kvm_fpsimd_load_vcpu_state()
kvm_fpsimd_put_vcpu_state()

>  	kvm_arm_set_running_vcpu(vcpu);
>  }
>  
> @@ -299,6 +300,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  	 */
>  	vcpu->cpu = -1;
>  
> +	kvm_fpsimd_sync_hwstate(vcpu);
>  	kvm_arm_set_running_vcpu(NULL);
>  }
>  
> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> index 4f7310f..eafb0c3 100644
> --- a/arch/arm64/include/asm/kvm_asm.h
> +++ b/arch/arm64/include/asm/kvm_asm.h
> @@ -137,6 +137,10 @@ extern char __restore_vgic_v2_state[];
>  extern char __save_vgic_v3_state[];
>  extern char __restore_vgic_v3_state[];
>  
> +struct kvm_cpu_context;
> +extern void __kvm_save_fpsimd(struct kvm_cpu_context *);
> +extern void __kvm_restore_fpsimd(struct kvm_cpu_context *);
> +
>  #endif
>  
>  #endif /* __ARM_KVM_ASM_H__ */
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index f0f58c9..2b968e5 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -201,6 +201,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu);
> +void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu);
> +
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
>  
>  static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index d5904f8..6d9c2b7 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -18,6 +18,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
> +kvm-$(CONFIG_KVM_ARM_HOST) += fpsimd.o handle_fpsimd.o
>  
>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
> diff --git a/arch/arm64/kvm/fpsimd.S b/arch/arm64/kvm/fpsimd.S
> new file mode 100644
> index 0000000..458a1a7
> --- /dev/null
> +++ b/arch/arm64/kvm/fpsimd.S
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (C) 2012,2013 - ARM Ltd

I don't know if you care, but shouldn't this be 2015?

Otherwise looks good!
-Christoffer
Marc Zyngier April 13, 2015, 2:12 p.m. UTC | #11
On 13/04/15 13:57, Christoffer Dall wrote:
> On Thu, Apr 09, 2015 at 05:53:59PM +0100, Marc Zyngier wrote:
>> The world switch spends quite some time dealing with the FP/SIMD
>> registers, as the state is quite sizeable (32 128bit registers,
>> plus some crumbs on the side). We save/restore them on each
>> entry/exit, so that both the host and the guest always see
>> the state they expect.
>>
>> But let's face it: the host kernel doesn't care. It is the host
>> userspace that actually cares about FP. An obvious improvement is
>> to remove the save/restore from the world switch, and only perform
>> it when we're about to enter/exit the guest (by plugging it into
>> vcpu_load/vcpu_put). The effect is pretty spectacular when running
>> hackbench (which is the only benchmark worth looking at):
>>
>> Without this patch:
>>
>> 	Running with 50*40 (== 2000) tasks.
>> 	Time: 36.756
>> 	Running with 50*40 (== 2000) tasks.
>> 	Time: 36.679
>> 	Running with 50*40 (== 2000) tasks.
>> 	Time: 36.699
>>
>> With this patch:
>>
>> 	Running with 50*40 (== 2000) tasks.
>> 	Time: 30.947
>> 	Running with 50*40 (== 2000) tasks.
>> 	Time: 30.868
>> 	Running with 50*40 (== 2000) tasks.
>> 	Time: 30.961
>>
>> This is on a HiKey board (8*A53), with a 4 vcpu guest.
> 
> cool.  Based on stats from kvm-unit-tests on A57 we should reduce the
> overall world-switch cost (in the best cases, caches, etc.) with ~8.5%,
> but this is even better and we are doing slightly more work than
> context-switching here, so I'm guessing factoring in potential extra
> cache misses, it can be this good.
> 
> However, on XGene with Ubuntu 14.04 Trusty, I get the following (do not
> compare to Marc's results, I may be using different kernel settings and
> different payload size):
> 
> Without the patch:
> 
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 15.970
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 15.963
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 15.875
> 
> 
> With the patch:
> 
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 16.768:
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 14.865
> 	Running with 50*40 (== 2000) tasks.
> 	Time: 14.880
> 
> On an HP Moonshot server I ran a number of other benchmarks and got
> similarly boring results.

I did another set of tests, this time involving Seattle, XGene and the
HiKey board. The result is that you cannot trust HiKey, this is the most
unpredictable platform I've ever seen (I had some chosen words for it,
that I don't want to write here....).

So while this patch seems to provide a small improvement in some cases,
it is definitely not as interesting as my first testing suggested. Too
good to be true! ;-)

I'm going to try and revive my lazy-fp patches, and see if there's
anything to improve here.

> Comments on the patch itself below:
> 
>>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>  arch/arm/include/asm/kvm_host.h   |  3 +++
>>  arch/arm/kvm/arm.c                |  2 ++
>>  arch/arm64/include/asm/kvm_asm.h  |  4 ++++
>>  arch/arm64/include/asm/kvm_host.h |  3 +++
>>  arch/arm64/kvm/Makefile           |  1 +
>>  arch/arm64/kvm/fpsimd.S           | 39 ++++++++++++++++++++++++++++++++++++
>>  arch/arm64/kvm/handle_fpsimd.c    | 42 +++++++++++++++++++++++++++++++++++++++
>>  arch/arm64/kvm/hyp.S              | 27 -------------------------
>>  8 files changed, 94 insertions(+), 27 deletions(-)
>>  create mode 100644 arch/arm64/kvm/fpsimd.S
>>  create mode 100644 arch/arm64/kvm/handle_fpsimd.c
>>
>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>> index d71607c..65cf1d1 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -226,6 +226,9 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic)
>>  int kvm_perf_init(void);
>>  int kvm_perf_teardown(void);
>>  
>> +static inline void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu) {}
>> +static inline void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu) {}
>> +
>>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>>  
>>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 6f53645..ff1213c 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -287,6 +287,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>  	vcpu->cpu = cpu;
>>  	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
>>  
>> +	kvm_fpsimd_flush_hwstate(vcpu);
> 
> not sure about the flus/sync terminology here, because we're not really
> flushing a software model to hardware state - we're doing both in every
> step.
> 
> How about:
> 
> kvm_fpsimd_load_vcpu_state()
> kvm_fpsimd_put_vcpu_state()

The rest of the kernel does use that flush/sync terminology, specially
for FP/SIMD. So it is a matter of either being consistent with the arm64
convention, or consistent with the KVM convention. I don't mind either way.

>>  	kvm_arm_set_running_vcpu(vcpu);
>>  }
>>  
>> @@ -299,6 +300,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>  	 */
>>  	vcpu->cpu = -1;
>>  
>> +	kvm_fpsimd_sync_hwstate(vcpu);
>>  	kvm_arm_set_running_vcpu(NULL);
>>  }
>>  
>> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
>> index 4f7310f..eafb0c3 100644
>> --- a/arch/arm64/include/asm/kvm_asm.h
>> +++ b/arch/arm64/include/asm/kvm_asm.h
>> @@ -137,6 +137,10 @@ extern char __restore_vgic_v2_state[];
>>  extern char __save_vgic_v3_state[];
>>  extern char __restore_vgic_v3_state[];
>>  
>> +struct kvm_cpu_context;
>> +extern void __kvm_save_fpsimd(struct kvm_cpu_context *);
>> +extern void __kvm_restore_fpsimd(struct kvm_cpu_context *);
>> +
>>  #endif
>>  
>>  #endif /* __ARM_KVM_ASM_H__ */
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index f0f58c9..2b968e5 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -201,6 +201,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
>>  int kvm_perf_init(void);
>>  int kvm_perf_teardown(void);
>>  
>> +void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu);
>> +void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu);
>> +
>>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
>>  
>>  static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
>> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
>> index d5904f8..6d9c2b7 100644
>> --- a/arch/arm64/kvm/Makefile
>> +++ b/arch/arm64/kvm/Makefile
>> @@ -18,6 +18,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
>>  kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
>>  kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
>>  kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
>> +kvm-$(CONFIG_KVM_ARM_HOST) += fpsimd.o handle_fpsimd.o
>>  
>>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
>>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
>> diff --git a/arch/arm64/kvm/fpsimd.S b/arch/arm64/kvm/fpsimd.S
>> new file mode 100644
>> index 0000000..458a1a7
>> --- /dev/null
>> +++ b/arch/arm64/kvm/fpsimd.S
>> @@ -0,0 +1,39 @@
>> +/*
>> + * Copyright (C) 2012,2013 - ARM Ltd
> 
> I don't know if you care, but shouldn't this be 2015?

I wondered. I've just moved existing code around, so I decided to keep
the original date.

Thanks,

	M.
Christoffer Dall April 13, 2015, 2:26 p.m. UTC | #12
On Mon, Apr 13, 2015 at 03:12:10PM +0100, Marc Zyngier wrote:
> On 13/04/15 13:57, Christoffer Dall wrote:
> > On Thu, Apr 09, 2015 at 05:53:59PM +0100, Marc Zyngier wrote:
> >> The world switch spends quite some time dealing with the FP/SIMD
> >> registers, as the state is quite sizeable (32 128bit registers,
> >> plus some crumbs on the side). We save/restore them on each
> >> entry/exit, so that both the host and the guest always see
> >> the state they expect.
> >>
> >> But let's face it: the host kernel doesn't care. It is the host
> >> userspace that actually cares about FP. An obvious improvement is
> >> to remove the save/restore from the world switch, and only perform
> >> it when we're about to enter/exit the guest (by plugging it into
> >> vcpu_load/vcpu_put). The effect is pretty spectacular when running
> >> hackbench (which is the only benchmark worth looking at):
> >>
> >> Without this patch:
> >>
> >> 	Running with 50*40 (== 2000) tasks.
> >> 	Time: 36.756
> >> 	Running with 50*40 (== 2000) tasks.
> >> 	Time: 36.679
> >> 	Running with 50*40 (== 2000) tasks.
> >> 	Time: 36.699
> >>
> >> With this patch:
> >>
> >> 	Running with 50*40 (== 2000) tasks.
> >> 	Time: 30.947
> >> 	Running with 50*40 (== 2000) tasks.
> >> 	Time: 30.868
> >> 	Running with 50*40 (== 2000) tasks.
> >> 	Time: 30.961
> >>
> >> This is on a HiKey board (8*A53), with a 4 vcpu guest.
> > 
> > cool.  Based on stats from kvm-unit-tests on A57 we should reduce the
> > overall world-switch cost (in the best cases, caches, etc.) with ~8.5%,
> > but this is even better and we are doing slightly more work than
> > context-switching here, so I'm guessing factoring in potential extra
> > cache misses, it can be this good.
> > 
> > However, on XGene with Ubuntu 14.04 Trusty, I get the following (do not
> > compare to Marc's results, I may be using different kernel settings and
> > different payload size):
> > 
> > Without the patch:
> > 
> > 	Running with 50*40 (== 2000) tasks.
> > 	Time: 15.970
> > 	Running with 50*40 (== 2000) tasks.
> > 	Time: 15.963
> > 	Running with 50*40 (== 2000) tasks.
> > 	Time: 15.875
> > 
> > 
> > With the patch:
> > 
> > 	Running with 50*40 (== 2000) tasks.
> > 	Time: 16.768:
> > 	Running with 50*40 (== 2000) tasks.
> > 	Time: 14.865
> > 	Running with 50*40 (== 2000) tasks.
> > 	Time: 14.880
> > 
> > On an HP Moonshot server I ran a number of other benchmarks and got
> > similarly boring results.
> 
> I did another set of tests, this time involving Seattle, XGene and the
> HiKey board. The result is that you cannot trust HiKey, this is the most
> unpredictable platform I've ever seen (I had some chosen words for it,
> that I don't want to write here....).
> 
> So while this patch seems to provide a small improvement in some cases,
> it is definitely not as interesting as my first testing suggested. Too
> good to be true! ;-)
> 
> I'm going to try and revive my lazy-fp patches, and see if there's
> anything to improve here.
> 
> > Comments on the patch itself below:
> > 
> >>
> >> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> >> ---
> >>  arch/arm/include/asm/kvm_host.h   |  3 +++
> >>  arch/arm/kvm/arm.c                |  2 ++
> >>  arch/arm64/include/asm/kvm_asm.h  |  4 ++++
> >>  arch/arm64/include/asm/kvm_host.h |  3 +++
> >>  arch/arm64/kvm/Makefile           |  1 +
> >>  arch/arm64/kvm/fpsimd.S           | 39 ++++++++++++++++++++++++++++++++++++
> >>  arch/arm64/kvm/handle_fpsimd.c    | 42 +++++++++++++++++++++++++++++++++++++++
> >>  arch/arm64/kvm/hyp.S              | 27 -------------------------
> >>  8 files changed, 94 insertions(+), 27 deletions(-)
> >>  create mode 100644 arch/arm64/kvm/fpsimd.S
> >>  create mode 100644 arch/arm64/kvm/handle_fpsimd.c
> >>
> >> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> >> index d71607c..65cf1d1 100644
> >> --- a/arch/arm/include/asm/kvm_host.h
> >> +++ b/arch/arm/include/asm/kvm_host.h
> >> @@ -226,6 +226,9 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic)
> >>  int kvm_perf_init(void);
> >>  int kvm_perf_teardown(void);
> >>  
> >> +static inline void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu) {}
> >> +static inline void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu) {}
> >> +
> >>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
> >>  
> >>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> >> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> >> index 6f53645..ff1213c 100644
> >> --- a/arch/arm/kvm/arm.c
> >> +++ b/arch/arm/kvm/arm.c
> >> @@ -287,6 +287,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >>  	vcpu->cpu = cpu;
> >>  	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
> >>  
> >> +	kvm_fpsimd_flush_hwstate(vcpu);
> > 
> > not sure about the flus/sync terminology here, because we're not really
> > flushing a software model to hardware state - we're doing both in every
> > step.
> > 
> > How about:
> > 
> > kvm_fpsimd_load_vcpu_state()
> > kvm_fpsimd_put_vcpu_state()
> 
> The rest of the kernel does use that flush/sync terminology, specially
> for FP/SIMD. So it is a matter of either being consistent with the arm64
> convention, or consistent with the KVM convention. I don't mind either way.
> 

ok, I didn't realize this was used for FP/SIMD.  The terminology always
confused me a bit, but let's just do the least churn.

> >>  	kvm_arm_set_running_vcpu(vcpu);
> >>  }
> >>  
> >> @@ -299,6 +300,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>  	 */
> >>  	vcpu->cpu = -1;
> >>  
> >> +	kvm_fpsimd_sync_hwstate(vcpu);
> >>  	kvm_arm_set_running_vcpu(NULL);
> >>  }
> >>  
> >> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> >> index 4f7310f..eafb0c3 100644
> >> --- a/arch/arm64/include/asm/kvm_asm.h
> >> +++ b/arch/arm64/include/asm/kvm_asm.h
> >> @@ -137,6 +137,10 @@ extern char __restore_vgic_v2_state[];
> >>  extern char __save_vgic_v3_state[];
> >>  extern char __restore_vgic_v3_state[];
> >>  
> >> +struct kvm_cpu_context;
> >> +extern void __kvm_save_fpsimd(struct kvm_cpu_context *);
> >> +extern void __kvm_restore_fpsimd(struct kvm_cpu_context *);
> >> +
> >>  #endif
> >>  
> >>  #endif /* __ARM_KVM_ASM_H__ */
> >> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> >> index f0f58c9..2b968e5 100644
> >> --- a/arch/arm64/include/asm/kvm_host.h
> >> +++ b/arch/arm64/include/asm/kvm_host.h
> >> @@ -201,6 +201,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
> >>  int kvm_perf_init(void);
> >>  int kvm_perf_teardown(void);
> >>  
> >> +void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu);
> >> +void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu);
> >> +
> >>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> >>  
> >>  static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
> >> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> >> index d5904f8..6d9c2b7 100644
> >> --- a/arch/arm64/kvm/Makefile
> >> +++ b/arch/arm64/kvm/Makefile
> >> @@ -18,6 +18,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
> >>  kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
> >>  kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
> >>  kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
> >> +kvm-$(CONFIG_KVM_ARM_HOST) += fpsimd.o handle_fpsimd.o
> >>  
> >>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
> >>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
> >> diff --git a/arch/arm64/kvm/fpsimd.S b/arch/arm64/kvm/fpsimd.S
> >> new file mode 100644
> >> index 0000000..458a1a7
> >> --- /dev/null
> >> +++ b/arch/arm64/kvm/fpsimd.S
> >> @@ -0,0 +1,39 @@
> >> +/*
> >> + * Copyright (C) 2012,2013 - ARM Ltd
> > 
> > I don't know if you care, but shouldn't this be 2015?
> 
> I wondered. I've just moved existing code around, so I decided to keep
> the original date.

ah, ignore me then.

-Christoffer
diff mbox

Patch

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index d71607c..65cf1d1 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -226,6 +226,9 @@  static inline void vgic_arch_setup(const struct vgic_params *vgic)
 int kvm_perf_init(void);
 int kvm_perf_teardown(void);
 
+static inline void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu) {}
+static inline void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu) {}
+
 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 6f53645..ff1213c 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -287,6 +287,7 @@  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	vcpu->cpu = cpu;
 	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
 
+	kvm_fpsimd_flush_hwstate(vcpu);
 	kvm_arm_set_running_vcpu(vcpu);
 }
 
@@ -299,6 +300,7 @@  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	 */
 	vcpu->cpu = -1;
 
+	kvm_fpsimd_sync_hwstate(vcpu);
 	kvm_arm_set_running_vcpu(NULL);
 }
 
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 4f7310f..eafb0c3 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -137,6 +137,10 @@  extern char __restore_vgic_v2_state[];
 extern char __save_vgic_v3_state[];
 extern char __restore_vgic_v3_state[];
 
+struct kvm_cpu_context;
+extern void __kvm_save_fpsimd(struct kvm_cpu_context *);
+extern void __kvm_restore_fpsimd(struct kvm_cpu_context *);
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f0f58c9..2b968e5 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -201,6 +201,9 @@  int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 int kvm_perf_init(void);
 int kvm_perf_teardown(void);
 
+void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu);
+void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu);
+
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
 
 static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index d5904f8..6d9c2b7 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -18,6 +18,7 @@  kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
 kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM_ARM_HOST) += fpsimd.o handle_fpsimd.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
diff --git a/arch/arm64/kvm/fpsimd.S b/arch/arm64/kvm/fpsimd.S
new file mode 100644
index 0000000..458a1a7
--- /dev/null
+++ b/arch/arm64/kvm/fpsimd.S
@@ -0,0 +1,39 @@ 
+/*
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/fpsimdmacros.h>
+
+#define CPU_GP_REG_OFFSET(x)	(CPU_GP_REGS + x)
+
+ENTRY(__kvm_save_fpsimd)
+	// x0: cpu context address
+	// x1, x2: tmp regs
+	add	x1, x0, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+	fpsimd_save x1, 2
+	ret
+END(__kvm_save_fpsimd)
+
+ENTRY(__kvm_restore_fpsimd)
+	// x0: cpu context address
+	// x1, x2: tmp regs
+	add	x1, x0, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+	fpsimd_restore x1, 2
+	ret
+END(__kvm_restore_fpsimd)
diff --git a/arch/arm64/kvm/handle_fpsimd.c b/arch/arm64/kvm/handle_fpsimd.c
new file mode 100644
index 0000000..3d34cc9
--- /dev/null
+++ b/arch/arm64/kvm/handle_fpsimd.c
@@ -0,0 +1,42 @@ 
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm_host.h>
+
+void kvm_fpsimd_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	__kvm_save_fpsimd(vcpu->arch.host_cpu_context);
+	__kvm_restore_fpsimd(&vcpu->arch.ctxt);
+
+	local_irq_restore(flags);
+}
+
+void kvm_fpsimd_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	__kvm_save_fpsimd(&vcpu->arch.ctxt);
+	__kvm_restore_fpsimd(vcpu->arch.host_cpu_context);
+
+	local_irq_restore(flags);
+}
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 5befd01..425c1ad 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -21,7 +21,6 @@ 
 #include <asm/assembler.h>
 #include <asm/debug-monitors.h>
 #include <asm/esr.h>
-#include <asm/fpsimdmacros.h>
 #include <asm/kvm.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
@@ -102,20 +101,6 @@ 
 	restore_common_regs
 .endm
 
-.macro save_fpsimd
-	// x2: cpu context address
-	// x3, x4: tmp regs
-	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
-	fpsimd_save x3, 4
-.endm
-
-.macro restore_fpsimd
-	// x2: cpu context address
-	// x3, x4: tmp regs
-	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
-	fpsimd_restore x3, 4
-.endm
-
 .macro save_guest_regs
 	// x0 is the vcpu address
 	// x1 is the return code, do not corrupt!
@@ -904,14 +889,6 @@  __restore_debug:
 	restore_debug
 	ret
 
-__save_fpsimd:
-	save_fpsimd
-	ret
-
-__restore_fpsimd:
-	restore_fpsimd
-	ret
-
 /*
  * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
  *
@@ -932,7 +909,6 @@  ENTRY(__kvm_vcpu_run)
 	kern_hyp_va x2
 
 	save_host_regs
-	bl __save_fpsimd
 	bl __save_sysregs
 
 	compute_debug_state 1f
@@ -948,7 +924,6 @@  ENTRY(__kvm_vcpu_run)
 	add	x2, x0, #VCPU_CONTEXT
 
 	bl __restore_sysregs
-	bl __restore_fpsimd
 
 	skip_debug_state x3, 1f
 	bl	__restore_debug
@@ -967,7 +942,6 @@  __kvm_vcpu_return:
 	add	x2, x0, #VCPU_CONTEXT
 
 	save_guest_regs
-	bl __save_fpsimd
 	bl __save_sysregs
 
 	skip_debug_state x3, 1f
@@ -986,7 +960,6 @@  __kvm_vcpu_return:
 	kern_hyp_va x2
 
 	bl __restore_sysregs
-	bl __restore_fpsimd
 
 	skip_debug_state x3, 1f
 	// Clear the dirty flag for the next run, as all the state has