diff mbox

[v3,1/3] arm/arm64: KVM: Use set/way op trapping to track the state of the caches

Message ID 1421865588-19761-2-git-send-email-marc.zyngier@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Marc Zyngier Jan. 21, 2015, 6:39 p.m. UTC
Trying to emulate the behaviour of set/way cache ops is fairly
pointless, as there are too many ways we can end-up missing stuff.
Also, there are some system caches out there that simply ignore
set/way operations.

So instead of trying to implement them, let's convert it to VA ops,
and use them as a way to re-enable the trapping of VM ops. That way,
we can detect the point when the MMU/caches are turned off, and do
a full VM flush (which is what the guest was trying to do anyway).

This allows a 32bit zImage to boot on the APM thingy, and will
probably help bootloaders in general.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm/include/asm/kvm_emulate.h   | 10 +++++
 arch/arm/include/asm/kvm_host.h      |  3 --
 arch/arm/include/asm/kvm_mmu.h       |  3 +-
 arch/arm/kvm/arm.c                   | 10 -----
 arch/arm/kvm/coproc.c                | 64 ++++++------------------------
 arch/arm/kvm/coproc_a15.c            |  2 +-
 arch/arm/kvm/coproc_a7.c             |  2 +-
 arch/arm/kvm/mmu.c                   | 70 ++++++++++++++++++++++++++++++++-
 arch/arm/kvm/trace.h                 | 39 +++++++++++++++++++
 arch/arm64/include/asm/kvm_emulate.h | 10 +++++
 arch/arm64/include/asm/kvm_host.h    |  3 --
 arch/arm64/include/asm/kvm_mmu.h     |  3 +-
 arch/arm64/kvm/sys_regs.c            | 75 +++++-------------------------------
 13 files changed, 155 insertions(+), 139 deletions(-)

Comments

Christoffer Dall Jan. 26, 2015, 10:58 p.m. UTC | #1
On Wed, Jan 21, 2015 at 06:39:46PM +0000, Marc Zyngier wrote:
> Trying to emulate the behaviour of set/way cache ops is fairly
> pointless, as there are too many ways we can end-up missing stuff.
> Also, there is some system caches out there that simply ignore
> set/way operations.
> 
> So instead of trying to implement them, let's convert it to VA ops,
> and use them as a way to re-enable the trapping of VM ops. That way,
> we can detect the point when the MMU/caches are turned off, and do
> a full VM flush (which is what the guest was trying to do anyway).
> 
> This allows a 32bit zImage to boot on the APM thingy, and will
> probably help bootloaders in general.
> 
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>

This had some conflicts with dirty page logging.  I fixed it up here,
and also removed some trailing white space and mixed spaces/tabs that
patch complained about:

http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git mm-fixes

> ---
>  arch/arm/include/asm/kvm_emulate.h   | 10 +++++
>  arch/arm/include/asm/kvm_host.h      |  3 --
>  arch/arm/include/asm/kvm_mmu.h       |  3 +-
>  arch/arm/kvm/arm.c                   | 10 -----
>  arch/arm/kvm/coproc.c                | 64 ++++++------------------------
>  arch/arm/kvm/coproc_a15.c            |  2 +-
>  arch/arm/kvm/coproc_a7.c             |  2 +-
>  arch/arm/kvm/mmu.c                   | 70 ++++++++++++++++++++++++++++++++-
>  arch/arm/kvm/trace.h                 | 39 +++++++++++++++++++
>  arch/arm64/include/asm/kvm_emulate.h | 10 +++++
>  arch/arm64/include/asm/kvm_host.h    |  3 --
>  arch/arm64/include/asm/kvm_mmu.h     |  3 +-
>  arch/arm64/kvm/sys_regs.c            | 75 +++++-------------------------------
>  13 files changed, 155 insertions(+), 139 deletions(-)
> 
> diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
> index 66ce176..7b01523 100644
> --- a/arch/arm/include/asm/kvm_emulate.h
> +++ b/arch/arm/include/asm/kvm_emulate.h
> @@ -38,6 +38,16 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
>  	vcpu->arch.hcr = HCR_GUEST_MASK;
>  }
>  
> +static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.hcr;
> +}
> +
> +static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
> +{
> +	vcpu->arch.hcr = hcr;
> +}
> +
>  static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
>  {
>  	return 1;
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 254e065..04b4ea0 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -125,9 +125,6 @@ struct kvm_vcpu_arch {
>  	 * Anything that is not used directly from assembly code goes
>  	 * here.
>  	 */
> -	/* dcache set/way operation pending */
> -	int last_pcpu;
> -	cpumask_t require_dcache_flush;
>  
>  	/* Don't run the guest on this vcpu */
>  	bool pause;
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 63e0ecc..286644c 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -190,7 +190,8 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>  
>  #define kvm_virt_to_phys(x)		virt_to_idmap((unsigned long)(x))
>  
> -void stage2_flush_vm(struct kvm *kvm);
> +void kvm_set_way_flush(struct kvm_vcpu *vcpu);
> +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
>  
>  #endif	/* !__ASSEMBLY__ */
>  
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 2d6d910..0b0d58a 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -281,15 +281,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  	vcpu->cpu = cpu;
>  	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
>  
> -	/*
> -	 * Check whether this vcpu requires the cache to be flushed on
> -	 * this physical CPU. This is a consequence of doing dcache
> -	 * operations by set/way on this vcpu. We do it here to be in
> -	 * a non-preemptible section.
> -	 */
> -	if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
> -		flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
> -
>  	kvm_arm_set_running_vcpu(vcpu);
>  }
>  
> @@ -541,7 +532,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>  		ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
>  
>  		vcpu->mode = OUTSIDE_GUEST_MODE;
> -		vcpu->arch.last_pcpu = smp_processor_id();
>  		kvm_guest_exit();
>  		trace_kvm_exit(*vcpu_pc(vcpu));
>  		/*
> diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
> index 7928dbd..0afcc00 100644
> --- a/arch/arm/kvm/coproc.c
> +++ b/arch/arm/kvm/coproc.c
> @@ -189,82 +189,40 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu,
>  	return true;
>  }
>  
> -/* See note at ARM ARM B1.14.4 */
> +/*
> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
> + */
>  static bool access_dcsw(struct kvm_vcpu *vcpu,
>  			const struct coproc_params *p,
>  			const struct coproc_reg *r)
>  {
> -	unsigned long val;
> -	int cpu;
> -
>  	if (!p->is_write)
>  		return read_from_write_only(vcpu, p);
>  
> -	cpu = get_cpu();
> -
> -	cpumask_setall(&vcpu->arch.require_dcache_flush);
> -	cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
> -
> -	/* If we were already preempted, take the long way around */
> -	if (cpu != vcpu->arch.last_pcpu) {
> -		flush_cache_all();
> -		goto done;
> -	}
> -
> -	val = *vcpu_reg(vcpu, p->Rt1);
> -
> -	switch (p->CRm) {
> -	case 6:			/* Upgrade DCISW to DCCISW, as per HCR.SWIO */
> -	case 14:		/* DCCISW */
> -		asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val));
> -		break;
> -
> -	case 10:		/* DCCSW */
> -		asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val));
> -		break;
> -	}
> -
> -done:
> -	put_cpu();
> -
> +	kvm_set_way_flush(vcpu);
>  	return true;
>  }
>  
>  /*
>   * Generic accessor for VM registers. Only called as long as HCR_TVM
> - * is set.
> + * is set.  If the guest enables the MMU, we stop trapping the VM
> + * sys_regs and leave it in complete control of the caches.
> + *
> + * Used by the cpu-specific code.
>   */
>  static bool access_vm_reg(struct kvm_vcpu *vcpu,
>  			  const struct coproc_params *p,
>  			  const struct coproc_reg *r)
>  {
> +	bool was_enabled = vcpu_has_cache_enabled(vcpu);
> +
>  	BUG_ON(!p->is_write);
>  
>  	vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
>  	if (p->is_64bit)
>  		vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
>  
> -	return true;
> -}
> -
> -/*
> - * SCTLR accessor. Only called as long as HCR_TVM is set.  If the
> - * guest enables the MMU, we stop trapping the VM sys_regs and leave
> - * it in complete control of the caches.
> - *
> - * Used by the cpu-specific code.
> - */
> -bool access_sctlr(struct kvm_vcpu *vcpu,
> -		  const struct coproc_params *p,
> -		  const struct coproc_reg *r)
> -{

I think you left in a prototype in arch/arm/kvm/coproc.h

> -	access_vm_reg(vcpu, p, r);
> -
> -	if (vcpu_has_cache_enabled(vcpu)) {	/* MMU+Caches enabled? */
> -		vcpu->arch.hcr &= ~HCR_TVM;
> -		stage2_flush_vm(vcpu->kvm);
> -	}
> -
> +	kvm_toggle_cache(vcpu, was_enabled);
>  	return true;
>  }
>  
> diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c
> index e6f4ae4..a713675 100644
> --- a/arch/arm/kvm/coproc_a15.c
> +++ b/arch/arm/kvm/coproc_a15.c
> @@ -34,7 +34,7 @@
>  static const struct coproc_reg a15_regs[] = {
>  	/* SCTLR: swapped by interrupt.S. */
>  	{ CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
> -			access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
> +			access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 },
>  };
>  
>  static struct kvm_coproc_target_table a15_target_table = {
> diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c
> index 17fc7cd..b19e46d 100644
> --- a/arch/arm/kvm/coproc_a7.c
> +++ b/arch/arm/kvm/coproc_a7.c
> @@ -37,7 +37,7 @@
>  static const struct coproc_reg a7_regs[] = {
>  	/* SCTLR: swapped by interrupt.S. */
>  	{ CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
> -			access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
> +			access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 },

I'm confused how you tested this, this doesn't seem to compile for me
(access_vm_reg is a static function).

>  };
>  
>  static struct kvm_coproc_target_table a7_target_table = {
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 1dc9778..106737e 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -278,7 +278,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
>   * Go through the stage 2 page tables and invalidate any cache lines
>   * backing memory already mapped to the VM.
>   */
> -void stage2_flush_vm(struct kvm *kvm)
> +static void stage2_flush_vm(struct kvm *kvm)
>  {
>  	struct kvm_memslots *slots;
>  	struct kvm_memory_slot *memslot;
> @@ -1411,3 +1411,71 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>  	unmap_stage2_range(kvm, gpa, size);
>  	spin_unlock(&kvm->mmu_lock);
>  }
> +
> +/*
> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
> + *
> + * Main problems:
> + * - S/W ops are local to a CPU (not broadcast)
> + * - We have line migration behind our back (speculation)
> + * - System caches don't support S/W at all (damn!)
> + *
> + * In the face of the above, the best we can do is to try and convert
> + * S/W ops to VA ops. Because the guest is not allowed to infer the
> + * S/W to PA mapping, it can only use S/W to nuke the whole cache,
> + * which is a rather good thing for us.
> + *
> + * Also, it is only used when turning caches on/off ("The expected
> + * usage of the cache maintenance instructions that operate by set/way
> + * is associated with the cache maintenance instructions associated
> + * with the powerdown and powerup of caches, if this is required by
> + * the implementation.").
> + *
> + * We use the following policy:
> + *
> + * - If we trap a S/W operation, we enable VM trapping to detect
> + *   caches being turned on/off, and do a full clean.
> + *
> + * - We flush the caches on both caches being turned on and off.
> + *
> + * - Once the caches are enabled, we stop trapping VM ops.
> + */
> +void kvm_set_way_flush(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long hcr = vcpu_get_hcr(vcpu);
> +
> +	/*
> +	 * If this is the first time we do a S/W operation
> +	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
> +	 * VM trapping.

what about when the guest boots, wouldn't it be using S/W operations to
flush the whole cache before turning it on?  But we ignore this case now
because we're already trapping VM regs so this is deferred until caches
are turned on.

However, when we are turning off caches, we both flush the cache when
doing the actual S/W operation and when the caches are then turned off.
Why?

> +	 *
> +	 * Otherwise, rely on the VM trapping to wait for the MMU +
> +	 * Caches to be turned off. At that point, we'll be able to
> +	 * clean the caches again.
> +	 */
> +	if (!(hcr & HCR_TVM)) {
> +		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
> +					vcpu_has_cache_enabled(vcpu));
> +		stage2_flush_vm(vcpu->kvm);
> +		vcpu_set_hcr(vcpu, hcr | HCR_TVM);
> +	}
> +}
> +
> +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
> +{
> +	bool now_enabled = vcpu_has_cache_enabled(vcpu);
> +
> +	/*
> +	 * If switching the MMU+caches on, need to invalidate the caches.
> +	 * If switching it off, need to clean the caches.
> +	 * Clean + invalidate does the trick always.
> +	 */
> +	if (now_enabled != was_enabled)
> +		stage2_flush_vm(vcpu->kvm);
> +
> +	/* Caches are now on, stop trapping VM ops (until a S/W op) */
> +	if (now_enabled)
> +		vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);
> +
> +	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
> +}
> diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
> index b1d640f..b6a6e71 100644
> --- a/arch/arm/kvm/trace.h
> +++ b/arch/arm/kvm/trace.h
> @@ -223,6 +223,45 @@ TRACE_EVENT(kvm_hvc,
>  		  __entry->vcpu_pc, __entry->r0, __entry->imm)
>  );
>  
> +TRACE_EVENT(kvm_set_way_flush,
> +	    TP_PROTO(unsigned long vcpu_pc, bool cache),
> +	    TP_ARGS(vcpu_pc, cache),
> +
> +	    TP_STRUCT__entry(
> +		    __field(	unsigned long,	vcpu_pc		)
> +		    __field(	bool,		cache		)
> +	    ),
> +
> +	    TP_fast_assign(
> +		    __entry->vcpu_pc		= vcpu_pc;
> +		    __entry->cache		= cache;
> +	    ),
> +
> +	    TP_printk("S/W flush at 0x%016lx (cache %s)",
> +		      __entry->vcpu_pc, __entry->cache ? "on" : "off")
> +);
> +
> +TRACE_EVENT(kvm_toggle_cache,
> +	    TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
> +	    TP_ARGS(vcpu_pc, was, now),
> +
> +	    TP_STRUCT__entry(
> +		    __field(	unsigned long,	vcpu_pc		)
> +		    __field(	bool,		was		)
> +		    __field(	bool,		now		)
> +	    ),
> +
> +	    TP_fast_assign(
> +		    __entry->vcpu_pc		= vcpu_pc;
> +		    __entry->was		= was;
> +		    __entry->now		= now;
> +	    ),
> +
> +	    TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
> +		      __entry->vcpu_pc, __entry->was ? "on" : "off",
> +		      __entry->now ? "on" : "off")
> +);
> +
>  #endif /* _TRACE_KVM_H */
>  
>  #undef TRACE_INCLUDE_PATH
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 865a7e2..3cb4c85 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -45,6 +45,16 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
>  		vcpu->arch.hcr_el2 &= ~HCR_RW;
>  }
>  
> +static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.hcr_el2;
> +}
> +
> +static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
> +{
> +	vcpu->arch.hcr_el2 = hcr;
> +}
> +
>  static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
>  {
>  	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 0b7dfdb..acd101a 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -116,9 +116,6 @@ struct kvm_vcpu_arch {
>  	 * Anything that is not used directly from assembly code goes
>  	 * here.
>  	 */
> -	/* dcache set/way operation pending */
> -	int last_pcpu;
> -	cpumask_t require_dcache_flush;
>  
>  	/* Don't run the guest */
>  	bool pause;
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 14a74f1..92d22e9 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -260,7 +260,8 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>  
>  #define kvm_virt_to_phys(x)		__virt_to_phys((unsigned long)(x))
>  
> -void stage2_flush_vm(struct kvm *kvm);
> +void kvm_set_way_flush(struct kvm_vcpu *vcpu);
> +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
>  
>  #endif /* __ASSEMBLY__ */
>  #endif /* __ARM64_KVM_MMU_H__ */
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 3d7c2df..95daa73 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -69,68 +69,31 @@ static u32 get_ccsidr(u32 csselr)
>  	return ccsidr;
>  }
>  
> -static void do_dc_cisw(u32 val)
> -{
> -	asm volatile("dc cisw, %x0" : : "r" (val));
> -	dsb(ish);
> -}
> -
> -static void do_dc_csw(u32 val)
> -{
> -	asm volatile("dc csw, %x0" : : "r" (val));
> -	dsb(ish);
> -}
> -
> -/* See note at ARM ARM B1.14.4 */
> +/*
> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
> + */
>  static bool access_dcsw(struct kvm_vcpu *vcpu,
>  			const struct sys_reg_params *p,
>  			const struct sys_reg_desc *r)
>  {
> -	unsigned long val;
> -	int cpu;
> -
>  	if (!p->is_write)
>  		return read_from_write_only(vcpu, p);
>  
> -	cpu = get_cpu();
> -
> -	cpumask_setall(&vcpu->arch.require_dcache_flush);
> -	cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
> -
> -	/* If we were already preempted, take the long way around */
> -	if (cpu != vcpu->arch.last_pcpu) {
> -		flush_cache_all();
> -		goto done;
> -	}
> -
> -	val = *vcpu_reg(vcpu, p->Rt);
> -
> -	switch (p->CRm) {
> -	case 6:			/* Upgrade DCISW to DCCISW, as per HCR.SWIO */
> -	case 14:		/* DCCISW */
> -		do_dc_cisw(val);
> -		break;
> -
> -	case 10:		/* DCCSW */
> -		do_dc_csw(val);
> -		break;
> -	}
> -
> -done:
> -	put_cpu();
> -
> +	kvm_set_way_flush(vcpu);	
>  	return true;
>  }
>  
>  /*
>   * Generic accessor for VM registers. Only called as long as HCR_TVM
> - * is set.
> + * is set. If the guest enables the MMU, we stop trapping the VM
> + * sys_regs and leave it in complete control of the caches.
>   */
>  static bool access_vm_reg(struct kvm_vcpu *vcpu,
>  			  const struct sys_reg_params *p,
>  			  const struct sys_reg_desc *r)
>  {
>  	unsigned long val;
> +	bool was_enabled = vcpu_has_cache_enabled(vcpu);
>  
>  	BUG_ON(!p->is_write);
>  
> @@ -143,25 +106,7 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu,
>  		vcpu_cp15_64_low(vcpu, r->reg) = val & 0xffffffffUL;
>  	}
>  
> -	return true;
> -}
> -
> -/*
> - * SCTLR_EL1 accessor. Only called as long as HCR_TVM is set.  If the
> - * guest enables the MMU, we stop trapping the VM sys_regs and leave
> - * it in complete control of the caches.
> - */
> -static bool access_sctlr(struct kvm_vcpu *vcpu,
> -			 const struct sys_reg_params *p,
> -			 const struct sys_reg_desc *r)
> -{
> -	access_vm_reg(vcpu, p, r);
> -
> -	if (vcpu_has_cache_enabled(vcpu)) {	/* MMU+Caches enabled? */
> -		vcpu->arch.hcr_el2 &= ~HCR_TVM;
> -		stage2_flush_vm(vcpu->kvm);
> -	}
> -
> +	kvm_toggle_cache(vcpu, was_enabled);
>  	return true;
>  }
>  
> @@ -377,7 +322,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>  	  NULL, reset_mpidr, MPIDR_EL1 },
>  	/* SCTLR_EL1 */
>  	{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000),
> -	  access_sctlr, reset_val, SCTLR_EL1, 0x00C50078 },
> +	  access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
>  	/* CPACR_EL1 */
>  	{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010),
>  	  NULL, reset_val, CPACR_EL1, 0 },
> @@ -657,7 +602,7 @@ static const struct sys_reg_desc cp14_64_regs[] = {
>   * register).
>   */
>  static const struct sys_reg_desc cp15_regs[] = {
> -	{ Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_sctlr, NULL, c1_SCTLR },
> +	{ Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR },
>  	{ Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
>  	{ Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
>  	{ Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR },
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marc Zyngier Jan. 27, 2015, 11:21 a.m. UTC | #2
On 26/01/15 22:58, Christoffer Dall wrote:
> On Wed, Jan 21, 2015 at 06:39:46PM +0000, Marc Zyngier wrote:
>> Trying to emulate the behaviour of set/way cache ops is fairly
>> pointless, as there are too many ways we can end-up missing stuff.
>> Also, there is some system caches out there that simply ignore
>> set/way operations.
>>
>> So instead of trying to implement them, let's convert it to VA ops,
>> and use them as a way to re-enable the trapping of VM ops. That way,
>> we can detect the point when the MMU/caches are turned off, and do
>> a full VM flush (which is what the guest was trying to do anyway).
>>
>> This allows a 32bit zImage to boot on the APM thingy, and will
>> probably help bootloaders in general.
>>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> 
> This had some conflicts with dirty page logging.  I fixed it up here,
> and also removed some trailing white space and mixed spaces/tabs that
> patch complained about:
> 
> http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git mm-fixes

Thanks for doing so.

>> ---
>>  arch/arm/include/asm/kvm_emulate.h   | 10 +++++
>>  arch/arm/include/asm/kvm_host.h      |  3 --
>>  arch/arm/include/asm/kvm_mmu.h       |  3 +-
>>  arch/arm/kvm/arm.c                   | 10 -----
>>  arch/arm/kvm/coproc.c                | 64 ++++++------------------------
>>  arch/arm/kvm/coproc_a15.c            |  2 +-
>>  arch/arm/kvm/coproc_a7.c             |  2 +-
>>  arch/arm/kvm/mmu.c                   | 70 ++++++++++++++++++++++++++++++++-
>>  arch/arm/kvm/trace.h                 | 39 +++++++++++++++++++
>>  arch/arm64/include/asm/kvm_emulate.h | 10 +++++
>>  arch/arm64/include/asm/kvm_host.h    |  3 --
>>  arch/arm64/include/asm/kvm_mmu.h     |  3 +-
>>  arch/arm64/kvm/sys_regs.c            | 75 +++++-------------------------------
>>  13 files changed, 155 insertions(+), 139 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
>> index 66ce176..7b01523 100644
>> --- a/arch/arm/include/asm/kvm_emulate.h
>> +++ b/arch/arm/include/asm/kvm_emulate.h
>> @@ -38,6 +38,16 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
>>       vcpu->arch.hcr = HCR_GUEST_MASK;
>>  }
>>
>> +static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
>> +{
>> +     return vcpu->arch.hcr;
>> +}
>> +
>> +static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
>> +{
>> +     vcpu->arch.hcr = hcr;
>> +}
>> +
>>  static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
>>  {
>>       return 1;
>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>> index 254e065..04b4ea0 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -125,9 +125,6 @@ struct kvm_vcpu_arch {
>>        * Anything that is not used directly from assembly code goes
>>        * here.
>>        */
>> -     /* dcache set/way operation pending */
>> -     int last_pcpu;
>> -     cpumask_t require_dcache_flush;
>>
>>       /* Don't run the guest on this vcpu */
>>       bool pause;
>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>> index 63e0ecc..286644c 100644
>> --- a/arch/arm/include/asm/kvm_mmu.h
>> +++ b/arch/arm/include/asm/kvm_mmu.h
>> @@ -190,7 +190,8 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>>
>>  #define kvm_virt_to_phys(x)          virt_to_idmap((unsigned long)(x))
>>
>> -void stage2_flush_vm(struct kvm *kvm);
>> +void kvm_set_way_flush(struct kvm_vcpu *vcpu);
>> +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
>>
>>  #endif       /* !__ASSEMBLY__ */
>>
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 2d6d910..0b0d58a 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -281,15 +281,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>       vcpu->cpu = cpu;
>>       vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
>>
>> -     /*
>> -      * Check whether this vcpu requires the cache to be flushed on
>> -      * this physical CPU. This is a consequence of doing dcache
>> -      * operations by set/way on this vcpu. We do it here to be in
>> -      * a non-preemptible section.
>> -      */
>> -     if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
>> -             flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
>> -
>>       kvm_arm_set_running_vcpu(vcpu);
>>  }
>>
>> @@ -541,7 +532,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>               ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
>>
>>               vcpu->mode = OUTSIDE_GUEST_MODE;
>> -             vcpu->arch.last_pcpu = smp_processor_id();
>>               kvm_guest_exit();
>>               trace_kvm_exit(*vcpu_pc(vcpu));
>>               /*
>> diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
>> index 7928dbd..0afcc00 100644
>> --- a/arch/arm/kvm/coproc.c
>> +++ b/arch/arm/kvm/coproc.c
>> @@ -189,82 +189,40 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu,
>>       return true;
>>  }
>>
>> -/* See note at ARM ARM B1.14.4 */
>> +/*
>> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
>> + */
>>  static bool access_dcsw(struct kvm_vcpu *vcpu,
>>                       const struct coproc_params *p,
>>                       const struct coproc_reg *r)
>>  {
>> -     unsigned long val;
>> -     int cpu;
>> -
>>       if (!p->is_write)
>>               return read_from_write_only(vcpu, p);
>>
>> -     cpu = get_cpu();
>> -
>> -     cpumask_setall(&vcpu->arch.require_dcache_flush);
>> -     cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
>> -
>> -     /* If we were already preempted, take the long way around */
>> -     if (cpu != vcpu->arch.last_pcpu) {
>> -             flush_cache_all();
>> -             goto done;
>> -     }
>> -
>> -     val = *vcpu_reg(vcpu, p->Rt1);
>> -
>> -     switch (p->CRm) {
>> -     case 6:                 /* Upgrade DCISW to DCCISW, as per HCR.SWIO */
>> -     case 14:                /* DCCISW */
>> -             asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val));
>> -             break;
>> -
>> -     case 10:                /* DCCSW */
>> -             asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val));
>> -             break;
>> -     }
>> -
>> -done:
>> -     put_cpu();
>> -
>> +     kvm_set_way_flush(vcpu);
>>       return true;
>>  }
>>
>>  /*
>>   * Generic accessor for VM registers. Only called as long as HCR_TVM
>> - * is set.
>> + * is set.  If the guest enables the MMU, we stop trapping the VM
>> + * sys_regs and leave it in complete control of the caches.
>> + *
>> + * Used by the cpu-specific code.
>>   */
>>  static bool access_vm_reg(struct kvm_vcpu *vcpu,
>>                         const struct coproc_params *p,
>>                         const struct coproc_reg *r)
>>  {
>> +     bool was_enabled = vcpu_has_cache_enabled(vcpu);
>> +
>>       BUG_ON(!p->is_write);
>>
>>       vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
>>       if (p->is_64bit)
>>               vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
>>
>> -     return true;
>> -}
>> -
>> -/*
>> - * SCTLR accessor. Only called as long as HCR_TVM is set.  If the
>> - * guest enables the MMU, we stop trapping the VM sys_regs and leave
>> - * it in complete control of the caches.
>> - *
>> - * Used by the cpu-specific code.
>> - */
>> -bool access_sctlr(struct kvm_vcpu *vcpu,
>> -               const struct coproc_params *p,
>> -               const struct coproc_reg *r)
>> -{
> 
> I think you left in a prototype in arch/arm/kvm/coproc.h
> 
>> -     access_vm_reg(vcpu, p, r);
>> -
>> -     if (vcpu_has_cache_enabled(vcpu)) {     /* MMU+Caches enabled? */
>> -             vcpu->arch.hcr &= ~HCR_TVM;
>> -             stage2_flush_vm(vcpu->kvm);
>> -     }
>> -
>> +     kvm_toggle_cache(vcpu, was_enabled);
>>       return true;
>>  }
>>
>> diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c
>> index e6f4ae4..a713675 100644
>> --- a/arch/arm/kvm/coproc_a15.c
>> +++ b/arch/arm/kvm/coproc_a15.c
>> @@ -34,7 +34,7 @@
>>  static const struct coproc_reg a15_regs[] = {
>>       /* SCTLR: swapped by interrupt.S. */
>>       { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
>> -                     access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
>> +                     access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 },
>>  };
>>
>>  static struct kvm_coproc_target_table a15_target_table = {
>> diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c
>> index 17fc7cd..b19e46d 100644
>> --- a/arch/arm/kvm/coproc_a7.c
>> +++ b/arch/arm/kvm/coproc_a7.c
>> @@ -37,7 +37,7 @@
>>  static const struct coproc_reg a7_regs[] = {
>>       /* SCTLR: swapped by interrupt.S. */
>>       { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
>> -                     access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
>> +                     access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 },
> 
> I'm confused how you tested this, this doesn't seem to compile for me
> (access_vm_reg is a static function).

That's worrying. It looks like I dropped a fixup while rebasing the
series. Nothing major, but still... I'll fix that.

>>  };
>>
>>  static struct kvm_coproc_target_table a7_target_table = {
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 1dc9778..106737e 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -278,7 +278,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
>>   * Go through the stage 2 page tables and invalidate any cache lines
>>   * backing memory already mapped to the VM.
>>   */
>> -void stage2_flush_vm(struct kvm *kvm)
>> +static void stage2_flush_vm(struct kvm *kvm)
>>  {
>>       struct kvm_memslots *slots;
>>       struct kvm_memory_slot *memslot;
>> @@ -1411,3 +1411,71 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>>       unmap_stage2_range(kvm, gpa, size);
>>       spin_unlock(&kvm->mmu_lock);
>>  }
>> +
>> +/*
>> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
>> + *
>> + * Main problems:
>> + * - S/W ops are local to a CPU (not broadcast)
>> + * - We have line migration behind our back (speculation)
>> + * - System caches don't support S/W at all (damn!)
>> + *
>> + * In the face of the above, the best we can do is to try and convert
>> + * S/W ops to VA ops. Because the guest is not allowed to infer the
>> + * S/W to PA mapping, it can only use S/W to nuke the whole cache,
>> + * which is a rather good thing for us.
>> + *
>> + * Also, it is only used when turning caches on/off ("The expected
>> + * usage of the cache maintenance instructions that operate by set/way
>> + * is associated with the cache maintenance instructions associated
>> + * with the powerdown and powerup of caches, if this is required by
>> + * the implementation.").
>> + *
>> + * We use the following policy:
>> + *
>> + * - If we trap a S/W operation, we enable VM trapping to detect
>> + *   caches being turned on/off, and do a full clean.
>> + *
>> + * - We flush the caches on both caches being turned on and off.
>> + *
>> + * - Once the caches are enabled, we stop trapping VM ops.
>> + */
>> +void kvm_set_way_flush(struct kvm_vcpu *vcpu)
>> +{
>> +     unsigned long hcr = vcpu_get_hcr(vcpu);
>> +
>> +     /*
>> +      * If this is the first time we do a S/W operation
>> +      * (i.e. HCR_TVM not set) flush the whole memory, and set the
>> +      * VM trapping.
> 
> what about when the guest boots, wouldn't it be using S/W operations to
> flush the whole cache before turning it on?  But we ignore this case now
> because we're already trapping VM regs so this is deferred until caches
> are turned on.

Exactly. Doing S/W ops with caches off can always be postponed until we
turn it on (this is the point where it actually matters).

> However, when we are turning off caches, we both flush the cache when
> doing the actual S/W operation and when the caches are then turned off.
> Why?

Think of it as a "belt and braces" measure. The guest tries to do
something *at that point*, and we should respect that. It doesn't mean
it makes sense from our point of view though. This is what the 32bit
kernel does, BTW (have a look at early_cachepolicy -- Mark Rutland is
trying to get rid of this).

So you can think of these as two different, rather unrelated events:
- Guest does S/W ops with caches on: we flush, knowing that this is a
best effort thing, probably doomed, but hey...
- Guest turns its caches off: we flush, knowing that this is the right
time to do it.

Hope this helps,

	M.
Christoffer Dall Jan. 27, 2015, 1:17 p.m. UTC | #3
On Tue, Jan 27, 2015 at 11:21:38AM +0000, Marc Zyngier wrote:
> On 26/01/15 22:58, Christoffer Dall wrote:
> > On Wed, Jan 21, 2015 at 06:39:46PM +0000, Marc Zyngier wrote:
> >> Trying to emulate the behaviour of set/way cache ops is fairly
> >> pointless, as there are too many ways we can end-up missing stuff.
> >> Also, there is some system caches out there that simply ignore
> >> set/way operations.
> >>
> >> So instead of trying to implement them, let's convert it to VA ops,
> >> and use them as a way to re-enable the trapping of VM ops. That way,
> >> we can detect the point when the MMU/caches are turned off, and do
> >> a full VM flush (which is what the guest was trying to do anyway).
> >>
> >> This allows a 32bit zImage to boot on the APM thingy, and will
> >> probably help bootloaders in general.
> >>
> >> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> > 
> > This had some conflicts with dirty page logging.  I fixed it up here,
> > and also removed some trailing white space and mixed spaces/tabs that
> > patch complained about:
> > 
> > http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git mm-fixes
> 
> Thanks for doing so.
> 
> >> ---
> >>  arch/arm/include/asm/kvm_emulate.h   | 10 +++++
> >>  arch/arm/include/asm/kvm_host.h      |  3 --
> >>  arch/arm/include/asm/kvm_mmu.h       |  3 +-
> >>  arch/arm/kvm/arm.c                   | 10 -----
> >>  arch/arm/kvm/coproc.c                | 64 ++++++------------------------
> >>  arch/arm/kvm/coproc_a15.c            |  2 +-
> >>  arch/arm/kvm/coproc_a7.c             |  2 +-
> >>  arch/arm/kvm/mmu.c                   | 70 ++++++++++++++++++++++++++++++++-
> >>  arch/arm/kvm/trace.h                 | 39 +++++++++++++++++++
> >>  arch/arm64/include/asm/kvm_emulate.h | 10 +++++
> >>  arch/arm64/include/asm/kvm_host.h    |  3 --
> >>  arch/arm64/include/asm/kvm_mmu.h     |  3 +-
> >>  arch/arm64/kvm/sys_regs.c            | 75 +++++-------------------------------
> >>  13 files changed, 155 insertions(+), 139 deletions(-)
> >>
> >> diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
> >> index 66ce176..7b01523 100644
> >> --- a/arch/arm/include/asm/kvm_emulate.h
> >> +++ b/arch/arm/include/asm/kvm_emulate.h
> >> @@ -38,6 +38,16 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
> >>       vcpu->arch.hcr = HCR_GUEST_MASK;
> >>  }
> >>
> >> +static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
> >> +{
> >> +     return vcpu->arch.hcr;
> >> +}
> >> +
> >> +static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
> >> +{
> >> +     vcpu->arch.hcr = hcr;
> >> +}
> >> +
> >>  static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
> >>  {
> >>       return 1;
> >> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> >> index 254e065..04b4ea0 100644
> >> --- a/arch/arm/include/asm/kvm_host.h
> >> +++ b/arch/arm/include/asm/kvm_host.h
> >> @@ -125,9 +125,6 @@ struct kvm_vcpu_arch {
> >>        * Anything that is not used directly from assembly code goes
> >>        * here.
> >>        */
> >> -     /* dcache set/way operation pending */
> >> -     int last_pcpu;
> >> -     cpumask_t require_dcache_flush;
> >>
> >>       /* Don't run the guest on this vcpu */
> >>       bool pause;
> >> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> >> index 63e0ecc..286644c 100644
> >> --- a/arch/arm/include/asm/kvm_mmu.h
> >> +++ b/arch/arm/include/asm/kvm_mmu.h
> >> @@ -190,7 +190,8 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
> >>
> >>  #define kvm_virt_to_phys(x)          virt_to_idmap((unsigned long)(x))
> >>
> >> -void stage2_flush_vm(struct kvm *kvm);
> >> +void kvm_set_way_flush(struct kvm_vcpu *vcpu);
> >> +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
> >>
> >>  #endif       /* !__ASSEMBLY__ */
> >>
> >> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> >> index 2d6d910..0b0d58a 100644
> >> --- a/arch/arm/kvm/arm.c
> >> +++ b/arch/arm/kvm/arm.c
> >> @@ -281,15 +281,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >>       vcpu->cpu = cpu;
> >>       vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
> >>
> >> -     /*
> >> -      * Check whether this vcpu requires the cache to be flushed on
> >> -      * this physical CPU. This is a consequence of doing dcache
> >> -      * operations by set/way on this vcpu. We do it here to be in
> >> -      * a non-preemptible section.
> >> -      */
> >> -     if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
> >> -             flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
> >> -
> >>       kvm_arm_set_running_vcpu(vcpu);
> >>  }
> >>
> >> @@ -541,7 +532,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
> >>               ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
> >>
> >>               vcpu->mode = OUTSIDE_GUEST_MODE;
> >> -             vcpu->arch.last_pcpu = smp_processor_id();
> >>               kvm_guest_exit();
> >>               trace_kvm_exit(*vcpu_pc(vcpu));
> >>               /*
> >> diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
> >> index 7928dbd..0afcc00 100644
> >> --- a/arch/arm/kvm/coproc.c
> >> +++ b/arch/arm/kvm/coproc.c
> >> @@ -189,82 +189,40 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu,
> >>       return true;
> >>  }
> >>
> >> -/* See note at ARM ARM B1.14.4 */
> >> +/*
> >> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
> >> + */
> >>  static bool access_dcsw(struct kvm_vcpu *vcpu,
> >>                       const struct coproc_params *p,
> >>                       const struct coproc_reg *r)
> >>  {
> >> -     unsigned long val;
> >> -     int cpu;
> >> -
> >>       if (!p->is_write)
> >>               return read_from_write_only(vcpu, p);
> >>
> >> -     cpu = get_cpu();
> >> -
> >> -     cpumask_setall(&vcpu->arch.require_dcache_flush);
> >> -     cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
> >> -
> >> -     /* If we were already preempted, take the long way around */
> >> -     if (cpu != vcpu->arch.last_pcpu) {
> >> -             flush_cache_all();
> >> -             goto done;
> >> -     }
> >> -
> >> -     val = *vcpu_reg(vcpu, p->Rt1);
> >> -
> >> -     switch (p->CRm) {
> >> -     case 6:                 /* Upgrade DCISW to DCCISW, as per HCR.SWIO */
> >> -     case 14:                /* DCCISW */
> >> -             asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val));
> >> -             break;
> >> -
> >> -     case 10:                /* DCCSW */
> >> -             asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val));
> >> -             break;
> >> -     }
> >> -
> >> -done:
> >> -     put_cpu();
> >> -
> >> +     kvm_set_way_flush(vcpu);
> >>       return true;
> >>  }
> >>
> >>  /*
> >>   * Generic accessor for VM registers. Only called as long as HCR_TVM
> >> - * is set.
> >> + * is set.  If the guest enables the MMU, we stop trapping the VM
> >> + * sys_regs and leave it in complete control of the caches.
> >> + *
> >> + * Used by the cpu-specific code.
> >>   */
> >>  static bool access_vm_reg(struct kvm_vcpu *vcpu,
> >>                         const struct coproc_params *p,
> >>                         const struct coproc_reg *r)
> >>  {
> >> +     bool was_enabled = vcpu_has_cache_enabled(vcpu);
> >> +
> >>       BUG_ON(!p->is_write);
> >>
> >>       vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
> >>       if (p->is_64bit)
> >>               vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
> >>
> >> -     return true;
> >> -}
> >> -
> >> -/*
> >> - * SCTLR accessor. Only called as long as HCR_TVM is set.  If the
> >> - * guest enables the MMU, we stop trapping the VM sys_regs and leave
> >> - * it in complete control of the caches.
> >> - *
> >> - * Used by the cpu-specific code.
> >> - */
> >> -bool access_sctlr(struct kvm_vcpu *vcpu,
> >> -               const struct coproc_params *p,
> >> -               const struct coproc_reg *r)
> >> -{
> > 
> > I think you left in a prototype in arch/arm/kvm/coproc.h
> > 
> >> -     access_vm_reg(vcpu, p, r);
> >> -
> >> -     if (vcpu_has_cache_enabled(vcpu)) {     /* MMU+Caches enabled? */
> >> -             vcpu->arch.hcr &= ~HCR_TVM;
> >> -             stage2_flush_vm(vcpu->kvm);
> >> -     }
> >> -
> >> +     kvm_toggle_cache(vcpu, was_enabled);
> >>       return true;
> >>  }
> >>
> >> diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c
> >> index e6f4ae4..a713675 100644
> >> --- a/arch/arm/kvm/coproc_a15.c
> >> +++ b/arch/arm/kvm/coproc_a15.c
> >> @@ -34,7 +34,7 @@
> >>  static const struct coproc_reg a15_regs[] = {
> >>       /* SCTLR: swapped by interrupt.S. */
> >>       { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
> >> -                     access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
> >> +                     access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 },
> >>  };
> >>
> >>  static struct kvm_coproc_target_table a15_target_table = {
> >> diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c
> >> index 17fc7cd..b19e46d 100644
> >> --- a/arch/arm/kvm/coproc_a7.c
> >> +++ b/arch/arm/kvm/coproc_a7.c
> >> @@ -37,7 +37,7 @@
> >>  static const struct coproc_reg a7_regs[] = {
> >>       /* SCTLR: swapped by interrupt.S. */
> >>       { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
> >> -                     access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
> >> +                     access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 },
> > 
> > I'm confused how you tested this, this doesn't seem to compile for me
> > (access_vm_reg is a static function).
> 
> That's worrying. It looks like I dropped a fixup while rebasing the
> series. Nothing major, but still... I'll fix that.
> 

no worries, just wanted to make sure we have tested the right thing.

> >>  };
> >>
> >>  static struct kvm_coproc_target_table a7_target_table = {
> >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >> index 1dc9778..106737e 100644
> >> --- a/arch/arm/kvm/mmu.c
> >> +++ b/arch/arm/kvm/mmu.c
> >> @@ -278,7 +278,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
> >>   * Go through the stage 2 page tables and invalidate any cache lines
> >>   * backing memory already mapped to the VM.
> >>   */
> >> -void stage2_flush_vm(struct kvm *kvm)
> >> +static void stage2_flush_vm(struct kvm *kvm)
> >>  {
> >>       struct kvm_memslots *slots;
> >>       struct kvm_memory_slot *memslot;
> >> @@ -1411,3 +1411,71 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> >>       unmap_stage2_range(kvm, gpa, size);
> >>       spin_unlock(&kvm->mmu_lock);
> >>  }
> >> +
> >> +/*
> >> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
> >> + *
> >> + * Main problems:
> >> + * - S/W ops are local to a CPU (not broadcast)
> >> + * - We have line migration behind our back (speculation)
> >> + * - System caches don't support S/W at all (damn!)
> >> + *
> >> + * In the face of the above, the best we can do is to try and convert
> >> + * S/W ops to VA ops. Because the guest is not allowed to infer the
> >> + * S/W to PA mapping, it can only use S/W to nuke the whole cache,
> >> + * which is a rather good thing for us.
> >> + *
> >> + * Also, it is only used when turning caches on/off ("The expected
> >> + * usage of the cache maintenance instructions that operate by set/way
> >> + * is associated with the cache maintenance instructions associated
> >> + * with the powerdown and powerup of caches, if this is required by
> >> + * the implementation.").
> >> + *
> >> + * We use the following policy:
> >> + *
> >> + * - If we trap a S/W operation, we enable VM trapping to detect
> >> + *   caches being turned on/off, and do a full clean.
> >> + *
> >> + * - We flush the caches on both caches being turned on and off.
> >> + *
> >> + * - Once the caches are enabled, we stop trapping VM ops.
> >> + */
> >> +void kvm_set_way_flush(struct kvm_vcpu *vcpu)
> >> +{
> >> +     unsigned long hcr = vcpu_get_hcr(vcpu);
> >> +
> >> +     /*
> >> +      * If this is the first time we do a S/W operation
> >> +      * (i.e. HCR_TVM not set) flush the whole memory, and set the
> >> +      * VM trapping.
> > 
> > what about when the guest boots, wouldn't it be using S/W operations to
> > flush the whole cache before turning it on?  But we ignore this case now
> > because we're already trapping VM regs so this is deferred until caches
> > are turned on.
> 
> Exactly. Doing S/W ops with caches off can always be postponed until we
> turn it on (this is the point where it actually matters).
> 
> > However, when we are turning off caches, we both flush the cache when
> > doing the actual S/W operation and when the caches are then turned off.
> > Why?
> 
> Think of it as a "belt and braces" measure. The guest tries to do
> something *at that point*, and we should respect that. It doesn't mean
> it makes sense from our point of view though. This is what the 32bit
> kernel does, BTW (have a look at early_cachepolicy -- Mark Rutland is
> trying to get rid of this).

ok, and we don't need the "belt and braces" when the MMU is off because
the guest is not accessing the caches anyhow?

So shouldn't we be detecting the guest MMU state instead? But we found
this to be unstable before (for a still unknown reason), so we instead
rely on always having TVM trapping set when the MMU is off?

So is there any conceivable guest (UEFI or something else) that could
correctly be turning off the MMU without calling invalidate by set/way
and creating a situation that we don't catch now?  Or is that not a
problem because in the worst case we'll just be flushing too often?

What about the opposite case, if we have trapping set, but we actually
do need to flush the caches, how are we sure this never happens?

> 
> So you can think of these as two different, rather unrelated events:
> - Guest does S/W ops with caches on: we flush, knowing that this is a
> best effort thing, probably doomed, but hey...

ok, so Linux is basically doing something architecturally invalid?

> - Guest turns its caches off: we flush, knowing that this is the right
> time to do it.
> 
> Hope this helps,
> 

Yes a bit, but this does feel incredibly brittle, but I know it may be
the best option we have....

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marc Zyngier Jan. 27, 2015, 1:44 p.m. UTC | #4
On 27/01/15 13:17, Christoffer Dall wrote:
> On Tue, Jan 27, 2015 at 11:21:38AM +0000, Marc Zyngier wrote:
>> On 26/01/15 22:58, Christoffer Dall wrote:
>>> On Wed, Jan 21, 2015 at 06:39:46PM +0000, Marc Zyngier wrote:
>>>> Trying to emulate the behaviour of set/way cache ops is fairly
>>>> pointless, as there are too many ways we can end-up missing stuff.
>>>> Also, there is some system caches out there that simply ignore
>>>> set/way operations.
>>>>
>>>> So instead of trying to implement them, let's convert it to VA ops,
>>>> and use them as a way to re-enable the trapping of VM ops. That way,
>>>> we can detect the point when the MMU/caches are turned off, and do
>>>> a full VM flush (which is what the guest was trying to do anyway).
>>>>
>>>> This allows a 32bit zImage to boot on the APM thingy, and will
>>>> probably help bootloaders in general.
>>>>
>>>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>>>
>>> This had some conflicts with dirty page logging.  I fixed it up here,
>>> and also removed some trailing white space and mixed spaces/tabs that
>>> patch complained about:
>>>
>>> http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git mm-fixes
>>
>> Thanks for doing so.
>>
>>>> ---
>>>>  arch/arm/include/asm/kvm_emulate.h   | 10 +++++
>>>>  arch/arm/include/asm/kvm_host.h      |  3 --
>>>>  arch/arm/include/asm/kvm_mmu.h       |  3 +-
>>>>  arch/arm/kvm/arm.c                   | 10 -----
>>>>  arch/arm/kvm/coproc.c                | 64 ++++++------------------------
>>>>  arch/arm/kvm/coproc_a15.c            |  2 +-
>>>>  arch/arm/kvm/coproc_a7.c             |  2 +-
>>>>  arch/arm/kvm/mmu.c                   | 70 ++++++++++++++++++++++++++++++++-
>>>>  arch/arm/kvm/trace.h                 | 39 +++++++++++++++++++
>>>>  arch/arm64/include/asm/kvm_emulate.h | 10 +++++
>>>>  arch/arm64/include/asm/kvm_host.h    |  3 --
>>>>  arch/arm64/include/asm/kvm_mmu.h     |  3 +-
>>>>  arch/arm64/kvm/sys_regs.c            | 75 +++++-------------------------------
>>>>  13 files changed, 155 insertions(+), 139 deletions(-)
>>>>
>>>> diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
>>>> index 66ce176..7b01523 100644
>>>> --- a/arch/arm/include/asm/kvm_emulate.h
>>>> +++ b/arch/arm/include/asm/kvm_emulate.h
>>>> @@ -38,6 +38,16 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
>>>>       vcpu->arch.hcr = HCR_GUEST_MASK;
>>>>  }
>>>>
>>>> +static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +     return vcpu->arch.hcr;
>>>> +}
>>>> +
>>>> +static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
>>>> +{
>>>> +     vcpu->arch.hcr = hcr;
>>>> +}
>>>> +
>>>>  static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
>>>>  {
>>>>       return 1;
>>>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>>>> index 254e065..04b4ea0 100644
>>>> --- a/arch/arm/include/asm/kvm_host.h
>>>> +++ b/arch/arm/include/asm/kvm_host.h
>>>> @@ -125,9 +125,6 @@ struct kvm_vcpu_arch {
>>>>        * Anything that is not used directly from assembly code goes
>>>>        * here.
>>>>        */
>>>> -     /* dcache set/way operation pending */
>>>> -     int last_pcpu;
>>>> -     cpumask_t require_dcache_flush;
>>>>
>>>>       /* Don't run the guest on this vcpu */
>>>>       bool pause;
>>>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>>>> index 63e0ecc..286644c 100644
>>>> --- a/arch/arm/include/asm/kvm_mmu.h
>>>> +++ b/arch/arm/include/asm/kvm_mmu.h
>>>> @@ -190,7 +190,8 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>>>>
>>>>  #define kvm_virt_to_phys(x)          virt_to_idmap((unsigned long)(x))
>>>>
>>>> -void stage2_flush_vm(struct kvm *kvm);
>>>> +void kvm_set_way_flush(struct kvm_vcpu *vcpu);
>>>> +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
>>>>
>>>>  #endif       /* !__ASSEMBLY__ */
>>>>
>>>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>>>> index 2d6d910..0b0d58a 100644
>>>> --- a/arch/arm/kvm/arm.c
>>>> +++ b/arch/arm/kvm/arm.c
>>>> @@ -281,15 +281,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>>       vcpu->cpu = cpu;
>>>>       vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
>>>>
>>>> -     /*
>>>> -      * Check whether this vcpu requires the cache to be flushed on
>>>> -      * this physical CPU. This is a consequence of doing dcache
>>>> -      * operations by set/way on this vcpu. We do it here to be in
>>>> -      * a non-preemptible section.
>>>> -      */
>>>> -     if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
>>>> -             flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
>>>> -
>>>>       kvm_arm_set_running_vcpu(vcpu);
>>>>  }
>>>>
>>>> @@ -541,7 +532,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>>>               ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
>>>>
>>>>               vcpu->mode = OUTSIDE_GUEST_MODE;
>>>> -             vcpu->arch.last_pcpu = smp_processor_id();
>>>>               kvm_guest_exit();
>>>>               trace_kvm_exit(*vcpu_pc(vcpu));
>>>>               /*
>>>> diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
>>>> index 7928dbd..0afcc00 100644
>>>> --- a/arch/arm/kvm/coproc.c
>>>> +++ b/arch/arm/kvm/coproc.c
>>>> @@ -189,82 +189,40 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu,
>>>>       return true;
>>>>  }
>>>>
>>>> -/* See note at ARM ARM B1.14.4 */
>>>> +/*
>>>> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
>>>> + */
>>>>  static bool access_dcsw(struct kvm_vcpu *vcpu,
>>>>                       const struct coproc_params *p,
>>>>                       const struct coproc_reg *r)
>>>>  {
>>>> -     unsigned long val;
>>>> -     int cpu;
>>>> -
>>>>       if (!p->is_write)
>>>>               return read_from_write_only(vcpu, p);
>>>>
>>>> -     cpu = get_cpu();
>>>> -
>>>> -     cpumask_setall(&vcpu->arch.require_dcache_flush);
>>>> -     cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
>>>> -
>>>> -     /* If we were already preempted, take the long way around */
>>>> -     if (cpu != vcpu->arch.last_pcpu) {
>>>> -             flush_cache_all();
>>>> -             goto done;
>>>> -     }
>>>> -
>>>> -     val = *vcpu_reg(vcpu, p->Rt1);
>>>> -
>>>> -     switch (p->CRm) {
>>>> -     case 6:                 /* Upgrade DCISW to DCCISW, as per HCR.SWIO */
>>>> -     case 14:                /* DCCISW */
>>>> -             asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val));
>>>> -             break;
>>>> -
>>>> -     case 10:                /* DCCSW */
>>>> -             asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val));
>>>> -             break;
>>>> -     }
>>>> -
>>>> -done:
>>>> -     put_cpu();
>>>> -
>>>> +     kvm_set_way_flush(vcpu);
>>>>       return true;
>>>>  }
>>>>
>>>>  /*
>>>>   * Generic accessor for VM registers. Only called as long as HCR_TVM
>>>> - * is set.
>>>> + * is set.  If the guest enables the MMU, we stop trapping the VM
>>>> + * sys_regs and leave it in complete control of the caches.
>>>> + *
>>>> + * Used by the cpu-specific code.
>>>>   */
>>>>  static bool access_vm_reg(struct kvm_vcpu *vcpu,
>>>>                         const struct coproc_params *p,
>>>>                         const struct coproc_reg *r)
>>>>  {
>>>> +     bool was_enabled = vcpu_has_cache_enabled(vcpu);
>>>> +
>>>>       BUG_ON(!p->is_write);
>>>>
>>>>       vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
>>>>       if (p->is_64bit)
>>>>               vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
>>>>
>>>> -     return true;
>>>> -}
>>>> -
>>>> -/*
>>>> - * SCTLR accessor. Only called as long as HCR_TVM is set.  If the
>>>> - * guest enables the MMU, we stop trapping the VM sys_regs and leave
>>>> - * it in complete control of the caches.
>>>> - *
>>>> - * Used by the cpu-specific code.
>>>> - */
>>>> -bool access_sctlr(struct kvm_vcpu *vcpu,
>>>> -               const struct coproc_params *p,
>>>> -               const struct coproc_reg *r)
>>>> -{
>>>
>>> I think you left in a prototype in arch/arm/kvm/coproc.h
>>>
>>>> -     access_vm_reg(vcpu, p, r);
>>>> -
>>>> -     if (vcpu_has_cache_enabled(vcpu)) {     /* MMU+Caches enabled? */
>>>> -             vcpu->arch.hcr &= ~HCR_TVM;
>>>> -             stage2_flush_vm(vcpu->kvm);
>>>> -     }
>>>> -
>>>> +     kvm_toggle_cache(vcpu, was_enabled);
>>>>       return true;
>>>>  }
>>>>
>>>> diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c
>>>> index e6f4ae4..a713675 100644
>>>> --- a/arch/arm/kvm/coproc_a15.c
>>>> +++ b/arch/arm/kvm/coproc_a15.c
>>>> @@ -34,7 +34,7 @@
>>>>  static const struct coproc_reg a15_regs[] = {
>>>>       /* SCTLR: swapped by interrupt.S. */
>>>>       { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
>>>> -                     access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
>>>> +                     access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 },
>>>>  };
>>>>
>>>>  static struct kvm_coproc_target_table a15_target_table = {
>>>> diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c
>>>> index 17fc7cd..b19e46d 100644
>>>> --- a/arch/arm/kvm/coproc_a7.c
>>>> +++ b/arch/arm/kvm/coproc_a7.c
>>>> @@ -37,7 +37,7 @@
>>>>  static const struct coproc_reg a7_regs[] = {
>>>>       /* SCTLR: swapped by interrupt.S. */
>>>>       { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
>>>> -                     access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
>>>> +                     access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 },
>>>
>>> I'm confused how you tested this, this doesn't seem to compile for me
>>> (access_vm_reg is a static function).
>>
>> That's worrying. It looks like I dropped a fixup while rebasing the
>> series. Nothing major, but still... I'll fix that.
>>
> 
> no worries, just wanted to make sure we have tested the right thing.
> 
>>>>  };
>>>>
>>>>  static struct kvm_coproc_target_table a7_target_table = {
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index 1dc9778..106737e 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -278,7 +278,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
>>>>   * Go through the stage 2 page tables and invalidate any cache lines
>>>>   * backing memory already mapped to the VM.
>>>>   */
>>>> -void stage2_flush_vm(struct kvm *kvm)
>>>> +static void stage2_flush_vm(struct kvm *kvm)
>>>>  {
>>>>       struct kvm_memslots *slots;
>>>>       struct kvm_memory_slot *memslot;
>>>> @@ -1411,3 +1411,71 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>>>>       unmap_stage2_range(kvm, gpa, size);
>>>>       spin_unlock(&kvm->mmu_lock);
>>>>  }
>>>> +
>>>> +/*
>>>> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
>>>> + *
>>>> + * Main problems:
>>>> + * - S/W ops are local to a CPU (not broadcast)
>>>> + * - We have line migration behind our back (speculation)
>>>> + * - System caches don't support S/W at all (damn!)
>>>> + *
>>>> + * In the face of the above, the best we can do is to try and convert
>>>> + * S/W ops to VA ops. Because the guest is not allowed to infer the
>>>> + * S/W to PA mapping, it can only use S/W to nuke the whole cache,
>>>> + * which is a rather good thing for us.
>>>> + *
>>>> + * Also, it is only used when turning caches on/off ("The expected
>>>> + * usage of the cache maintenance instructions that operate by set/way
>>>> + * is associated with the cache maintenance instructions associated
>>>> + * with the powerdown and powerup of caches, if this is required by
>>>> + * the implementation.").
>>>> + *
>>>> + * We use the following policy:
>>>> + *
>>>> + * - If we trap a S/W operation, we enable VM trapping to detect
>>>> + *   caches being turned on/off, and do a full clean.
>>>> + *
>>>> + * - We flush the caches on both caches being turned on and off.
>>>> + *
>>>> + * - Once the caches are enabled, we stop trapping VM ops.
>>>> + */
>>>> +void kvm_set_way_flush(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +     unsigned long hcr = vcpu_get_hcr(vcpu);
>>>> +
>>>> +     /*
>>>> +      * If this is the first time we do a S/W operation
>>>> +      * (i.e. HCR_TVM not set) flush the whole memory, and set the
>>>> +      * VM trapping.
>>>
>>> what about when the guest boots, wouldn't it be using S/W operations to
>>> flush the whole cache before turning it on?  But we ignore this case now
>>> because we're already trapping VM regs so this is deferred until caches
>>> are turned on.
>>
>> Exactly. Doing S/W ops with caches off can always be postponed until we
>> turn it on (this is the point where it actually matters).
>>
>>> However, when we are turning off caches, we both flush the cache when
>>> doing the actual S/W operation and when the caches are then turned off.
>>> Why?
>>
>> Think of it as a "belt and braces" measure. The guest tries to do
>> something *at that point*, and we should respect that. It doesn't mean
>> it makes sense from our point of view though. This is what the 32bit
>> kernel does, BTW (have a look at early_cachepolicy -- Mark Rutland is
>> trying to get rid of this).
> 
> ok, and we don't need the "belt and braces" when the MMU is off because
> the guest is not accessing the caches anyhow?

Yes. We don't hit in the caches (since they are off), so we can safely
defer this until caches are turned on.

> So shouldn't we be detecting the guest MMU state instead? But we found
> this to be unstable before (for a still unknown reason), so we instead
> rely on always having trapping TVM set when the MMU is off?

Tracking the MMU state would be the ideal thing to do. It is not so much
that it is unreliable, but that detecting the MMU being turned off is
extremely expensive (TVM traps all accesses to the VM regs).

Also, as you mentioned, there is still a code path I don't really
understand in the 32bit decompressor, where we somehow failed to notice
that the MMU had been turned off (it was with another version of the
patch, and things have changed quite a bit since).

> So is there any conceivable guest (UEFI or something else) that could
> correctly be turning off the MMU without calling invalidate by set/way
> and creating a situation that we don't catch now?  Or is that not a
> problem because in the worst case we'll just be flushing too often?

If the guest flushes by VA, then we're safe. If it doesn't flush at all,
it is a blatant bug. In both cases, trapping the MMU access doesn't buy
us much.

> What about the opposite case, if we have trapping set, but we actually
> do need to flush the caches, how are we sure this never happens?

You lost me here. What is the exact case?

>>
>> So you can think of these as two different, rather unrelated events:
>> - Guest does S/W ops with caches on: we flush, knowing that this is a
>> best effort thing, probably doomed, but hey...
> 
> ok, so Linux is basically doing something architecturally invalid?

Indeed. That area of the code is fairly old and covers all versions of
the architecture, without much distinction as to what is valid where...

>> - Guest turns its caches off: we flush, knowing that this is the right
>> time to do it.
>>
>> Hope this helps,
>>
> 
> Yes a bit, but this does feel incredibly brittle, but I know it may be
> the best option we have....

Yup, this is probably the worst part of the architecture. We have to
work our way through both architectural issues (inherited from previous
revisions), and legacy code that has hardly evolved since ARMv4 (and
basically works by luck, and the lack of a system cache).

Thanks,

	M.
Christoffer Dall Jan. 29, 2015, 1:40 p.m. UTC | #5
On Tue, Jan 27, 2015 at 01:44:05PM +0000, Marc Zyngier wrote:
> On 27/01/15 13:17, Christoffer Dall wrote:
> > On Tue, Jan 27, 2015 at 11:21:38AM +0000, Marc Zyngier wrote:
> >> On 26/01/15 22:58, Christoffer Dall wrote:
> >>> On Wed, Jan 21, 2015 at 06:39:46PM +0000, Marc Zyngier wrote:
> >>>> Trying to emulate the behaviour of set/way cache ops is fairly
> >>>> pointless, as there are too many ways we can end-up missing stuff.
> >>>> Also, there is some system caches out there that simply ignore
> >>>> set/way operations.
> >>>>
> >>>> So instead of trying to implement them, let's convert it to VA ops,
> >>>> and use them as a way to re-enable the trapping of VM ops. That way,
> >>>> we can detect the point when the MMU/caches are turned off, and do
> >>>> a full VM flush (which is what the guest was trying to do anyway).
> >>>>
> >>>> This allows a 32bit zImage to boot on the APM thingy, and will
> >>>> probably help bootloaders in general.
> >>>>
> >>>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> >>>
> >>> This had some conflicts with dirty page logging.  I fixed it up here,
> >>> and also removed some trailing white space and mixed spaces/tabs that
> >>> patch complained about:
> >>>
> >>> http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git mm-fixes
> >>
> >> Thanks for doing so.
> >>
> >>>> ---
> >>>>  arch/arm/include/asm/kvm_emulate.h   | 10 +++++
> >>>>  arch/arm/include/asm/kvm_host.h      |  3 --
> >>>>  arch/arm/include/asm/kvm_mmu.h       |  3 +-
> >>>>  arch/arm/kvm/arm.c                   | 10 -----
> >>>>  arch/arm/kvm/coproc.c                | 64 ++++++------------------------
> >>>>  arch/arm/kvm/coproc_a15.c            |  2 +-
> >>>>  arch/arm/kvm/coproc_a7.c             |  2 +-
> >>>>  arch/arm/kvm/mmu.c                   | 70 ++++++++++++++++++++++++++++++++-
> >>>>  arch/arm/kvm/trace.h                 | 39 +++++++++++++++++++
> >>>>  arch/arm64/include/asm/kvm_emulate.h | 10 +++++
> >>>>  arch/arm64/include/asm/kvm_host.h    |  3 --
> >>>>  arch/arm64/include/asm/kvm_mmu.h     |  3 +-
> >>>>  arch/arm64/kvm/sys_regs.c            | 75 +++++-------------------------------
> >>>>  13 files changed, 155 insertions(+), 139 deletions(-)
> >>>>
> >>>> diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
> >>>> index 66ce176..7b01523 100644
> >>>> --- a/arch/arm/include/asm/kvm_emulate.h
> >>>> +++ b/arch/arm/include/asm/kvm_emulate.h
> >>>> @@ -38,6 +38,16 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
> >>>>       vcpu->arch.hcr = HCR_GUEST_MASK;
> >>>>  }
> >>>>
> >>>> +static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
> >>>> +{
> >>>> +     return vcpu->arch.hcr;
> >>>> +}
> >>>> +
> >>>> +static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
> >>>> +{
> >>>> +     vcpu->arch.hcr = hcr;
> >>>> +}
> >>>> +
> >>>>  static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
> >>>>  {
> >>>>       return 1;
> >>>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> >>>> index 254e065..04b4ea0 100644
> >>>> --- a/arch/arm/include/asm/kvm_host.h
> >>>> +++ b/arch/arm/include/asm/kvm_host.h
> >>>> @@ -125,9 +125,6 @@ struct kvm_vcpu_arch {
> >>>>        * Anything that is not used directly from assembly code goes
> >>>>        * here.
> >>>>        */
> >>>> -     /* dcache set/way operation pending */
> >>>> -     int last_pcpu;
> >>>> -     cpumask_t require_dcache_flush;
> >>>>
> >>>>       /* Don't run the guest on this vcpu */
> >>>>       bool pause;
> >>>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> >>>> index 63e0ecc..286644c 100644
> >>>> --- a/arch/arm/include/asm/kvm_mmu.h
> >>>> +++ b/arch/arm/include/asm/kvm_mmu.h
> >>>> @@ -190,7 +190,8 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
> >>>>
> >>>>  #define kvm_virt_to_phys(x)          virt_to_idmap((unsigned long)(x))
> >>>>
> >>>> -void stage2_flush_vm(struct kvm *kvm);
> >>>> +void kvm_set_way_flush(struct kvm_vcpu *vcpu);
> >>>> +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
> >>>>
> >>>>  #endif       /* !__ASSEMBLY__ */
> >>>>
> >>>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> >>>> index 2d6d910..0b0d58a 100644
> >>>> --- a/arch/arm/kvm/arm.c
> >>>> +++ b/arch/arm/kvm/arm.c
> >>>> @@ -281,15 +281,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >>>>       vcpu->cpu = cpu;
> >>>>       vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
> >>>>
> >>>> -     /*
> >>>> -      * Check whether this vcpu requires the cache to be flushed on
> >>>> -      * this physical CPU. This is a consequence of doing dcache
> >>>> -      * operations by set/way on this vcpu. We do it here to be in
> >>>> -      * a non-preemptible section.
> >>>> -      */
> >>>> -     if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
> >>>> -             flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
> >>>> -
> >>>>       kvm_arm_set_running_vcpu(vcpu);
> >>>>  }
> >>>>
> >>>> @@ -541,7 +532,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
> >>>>               ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
> >>>>
> >>>>               vcpu->mode = OUTSIDE_GUEST_MODE;
> >>>> -             vcpu->arch.last_pcpu = smp_processor_id();
> >>>>               kvm_guest_exit();
> >>>>               trace_kvm_exit(*vcpu_pc(vcpu));
> >>>>               /*
> >>>> diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
> >>>> index 7928dbd..0afcc00 100644
> >>>> --- a/arch/arm/kvm/coproc.c
> >>>> +++ b/arch/arm/kvm/coproc.c
> >>>> @@ -189,82 +189,40 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu,
> >>>>       return true;
> >>>>  }
> >>>>
> >>>> -/* See note at ARM ARM B1.14.4 */
> >>>> +/*
> >>>> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
> >>>> + */
> >>>>  static bool access_dcsw(struct kvm_vcpu *vcpu,
> >>>>                       const struct coproc_params *p,
> >>>>                       const struct coproc_reg *r)
> >>>>  {
> >>>> -     unsigned long val;
> >>>> -     int cpu;
> >>>> -
> >>>>       if (!p->is_write)
> >>>>               return read_from_write_only(vcpu, p);
> >>>>
> >>>> -     cpu = get_cpu();
> >>>> -
> >>>> -     cpumask_setall(&vcpu->arch.require_dcache_flush);
> >>>> -     cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
> >>>> -
> >>>> -     /* If we were already preempted, take the long way around */
> >>>> -     if (cpu != vcpu->arch.last_pcpu) {
> >>>> -             flush_cache_all();
> >>>> -             goto done;
> >>>> -     }
> >>>> -
> >>>> -     val = *vcpu_reg(vcpu, p->Rt1);
> >>>> -
> >>>> -     switch (p->CRm) {
> >>>> -     case 6:                 /* Upgrade DCISW to DCCISW, as per HCR.SWIO */
> >>>> -     case 14:                /* DCCISW */
> >>>> -             asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val));
> >>>> -             break;
> >>>> -
> >>>> -     case 10:                /* DCCSW */
> >>>> -             asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val));
> >>>> -             break;
> >>>> -     }
> >>>> -
> >>>> -done:
> >>>> -     put_cpu();
> >>>> -
> >>>> +     kvm_set_way_flush(vcpu);
> >>>>       return true;
> >>>>  }
> >>>>
> >>>>  /*
> >>>>   * Generic accessor for VM registers. Only called as long as HCR_TVM
> >>>> - * is set.
> >>>> + * is set.  If the guest enables the MMU, we stop trapping the VM
> >>>> + * sys_regs and leave it in complete control of the caches.
> >>>> + *
> >>>> + * Used by the cpu-specific code.
> >>>>   */
> >>>>  static bool access_vm_reg(struct kvm_vcpu *vcpu,
> >>>>                         const struct coproc_params *p,
> >>>>                         const struct coproc_reg *r)
> >>>>  {
> >>>> +     bool was_enabled = vcpu_has_cache_enabled(vcpu);
> >>>> +
> >>>>       BUG_ON(!p->is_write);
> >>>>
> >>>>       vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
> >>>>       if (p->is_64bit)
> >>>>               vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
> >>>>
> >>>> -     return true;
> >>>> -}
> >>>> -
> >>>> -/*
> >>>> - * SCTLR accessor. Only called as long as HCR_TVM is set.  If the
> >>>> - * guest enables the MMU, we stop trapping the VM sys_regs and leave
> >>>> - * it in complete control of the caches.
> >>>> - *
> >>>> - * Used by the cpu-specific code.
> >>>> - */
> >>>> -bool access_sctlr(struct kvm_vcpu *vcpu,
> >>>> -               const struct coproc_params *p,
> >>>> -               const struct coproc_reg *r)
> >>>> -{
> >>>
> >>> I think you left in a prototype in arch/arm/kvm/coproc.h
> >>>
> >>>> -     access_vm_reg(vcpu, p, r);
> >>>> -
> >>>> -     if (vcpu_has_cache_enabled(vcpu)) {     /* MMU+Caches enabled? */
> >>>> -             vcpu->arch.hcr &= ~HCR_TVM;
> >>>> -             stage2_flush_vm(vcpu->kvm);
> >>>> -     }
> >>>> -
> >>>> +     kvm_toggle_cache(vcpu, was_enabled);
> >>>>       return true;
> >>>>  }
> >>>>
> >>>> diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c
> >>>> index e6f4ae4..a713675 100644
> >>>> --- a/arch/arm/kvm/coproc_a15.c
> >>>> +++ b/arch/arm/kvm/coproc_a15.c
> >>>> @@ -34,7 +34,7 @@
> >>>>  static const struct coproc_reg a15_regs[] = {
> >>>>       /* SCTLR: swapped by interrupt.S. */
> >>>>       { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
> >>>> -                     access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
> >>>> +                     access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 },
> >>>>  };
> >>>>
> >>>>  static struct kvm_coproc_target_table a15_target_table = {
> >>>> diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c
> >>>> index 17fc7cd..b19e46d 100644
> >>>> --- a/arch/arm/kvm/coproc_a7.c
> >>>> +++ b/arch/arm/kvm/coproc_a7.c
> >>>> @@ -37,7 +37,7 @@
> >>>>  static const struct coproc_reg a7_regs[] = {
> >>>>       /* SCTLR: swapped by interrupt.S. */
> >>>>       { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
> >>>> -                     access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
> >>>> +                     access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 },
> >>>
> >>> I'm confused how you tested this, this doesn't seem to compile for me
> >>> (access_vm_reg is a static function).
> >>
> >> That's worrying. It looks like I dropped a fixup while rebasing the
> >> series. Nothing major, but still... I'll fix that.
> >>
> > 
> > no worries, just wanted to make sure we have tested the right thing.
> > 
> >>>>  };
> >>>>
> >>>>  static struct kvm_coproc_target_table a7_target_table = {
> >>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >>>> index 1dc9778..106737e 100644
> >>>> --- a/arch/arm/kvm/mmu.c
> >>>> +++ b/arch/arm/kvm/mmu.c
> >>>> @@ -278,7 +278,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
> >>>>   * Go through the stage 2 page tables and invalidate any cache lines
> >>>>   * backing memory already mapped to the VM.
> >>>>   */
> >>>> -void stage2_flush_vm(struct kvm *kvm)
> >>>> +static void stage2_flush_vm(struct kvm *kvm)
> >>>>  {
> >>>>       struct kvm_memslots *slots;
> >>>>       struct kvm_memory_slot *memslot;
> >>>> @@ -1411,3 +1411,71 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> >>>>       unmap_stage2_range(kvm, gpa, size);
> >>>>       spin_unlock(&kvm->mmu_lock);
> >>>>  }
> >>>> +
> >>>> +/*
> >>>> + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
> >>>> + *
> >>>> + * Main problems:
> >>>> + * - S/W ops are local to a CPU (not broadcast)
> >>>> + * - We have line migration behind our back (speculation)
> >>>> + * - System caches don't support S/W at all (damn!)
> >>>> + *
> >>>> + * In the face of the above, the best we can do is to try and convert
> >>>> + * S/W ops to VA ops. Because the guest is not allowed to infer the
> >>>> + * S/W to PA mapping, it can only use S/W to nuke the whole cache,
> >>>> + * which is a rather good thing for us.
> >>>> + *
> >>>> + * Also, it is only used when turning caches on/off ("The expected
> >>>> + * usage of the cache maintenance instructions that operate by set/way
> >>>> + * is associated with the cache maintenance instructions associated
> >>>> + * with the powerdown and powerup of caches, if this is required by
> >>>> + * the implementation.").
> >>>> + *
> >>>> + * We use the following policy:
> >>>> + *
> >>>> + * - If we trap a S/W operation, we enable VM trapping to detect
> >>>> + *   caches being turned on/off, and do a full clean.
> >>>> + *
> >>>> + * - We flush the caches on both caches being turned on and off.
> >>>> + *
> >>>> + * - Once the caches are enabled, we stop trapping VM ops.
> >>>> + */
> >>>> +void kvm_set_way_flush(struct kvm_vcpu *vcpu)
> >>>> +{
> >>>> +     unsigned long hcr = vcpu_get_hcr(vcpu);
> >>>> +
> >>>> +     /*
> >>>> +      * If this is the first time we do a S/W operation
> >>>> +      * (i.e. HCR_TVM not set) flush the whole memory, and set the
> >>>> +      * VM trapping.
> >>>
> >>> what about when the guest boots, wouldn't it be using S/W operations to
> >>> flush the whole cache before turning it on?  But we ignore this case now
> >>> because we're already trapping VM regs so this is deferred until caches
> >>> are turned on.
> >>
> >> Exactly. Doing S/W ops with caches off can always be postponed until we
> >> turn it on (this is the point where it actually matters).
> >>
> >>> However, when we are turning off caches, we both flush the cache when
> >>> doing the actual S/W operation and when the caches are then turned off.
> >>> Why?
> >>
> >> Think of it as a "belt and braces" measure. The guest tries to do
> >> something *at that point*, and we should respect that. It doesn't mean
> >> it makes sense from our point of view though. This is what the 32bit
> >> kernel does, BTW (have a look at early_cachepolicy -- Mark Rutland is
> >> trying to get rid of this).
> > 
> > ok, and we don't need the "belt and braces" when the MMU is off because
> > the guest is not accessing the caches anyhow?
> 
> Yes. We don't hit in the cache (since they are off), so we can safely
> defer this until caches are turned on.
> 
> > So shouldn't we be detecting the guest MMU state instead? But we found
> > this to be unstable before (for a still unknown reason), so we instead
> > rely on always having trapping TVM set when the MMU is off?
> 
> Tracking the MMU state would be the ideal thing to do. It is not so much
> that it is unreliable, but that detecting the MMU being turned off is
> extremely expensive (TVM traps all accessed to the VM regs).

I meant that when we trap on S/W, we should decide on flushing 
based on whether the guest has turned the MMU off, and not trying to
infer this from our setting of HCR_TVM.  The only reason I can see for
doing this is that we are somehow not properly detecting the guest MMU
state.

> 
> Also, as you mentioned, there is still a code path I don't really
> understand in the 32bit decompressor, where we somehow failed to notice
> that the MMU had been turned off (it was with another version of the
> patch, and things have changed quite a bit since).
> 

hmm, the thing I don't really get by looking at the code now is why we
can avoid doing the flush if the TVM is set, this feels at least very
unintuitive.

> > So is there any conceivable guest (UEFI or something else) that could
> > correctly be turning off the MMU without calling invalidate by set/way
> > and creating a situation that we don't catch now?  Or is that not a
> > problem because in the worst case we'll just be flushing too often?
> 
> If the guest flushes by VA, then we're safe. If it doesn't flush at all,
> it is a blatant bug. In both cases, trapping the MMU access doesn't buy
> us much.
> 
> > What about the opposite case, if we have trapping set, but we actually
> > do need to flush the caches, how are we sure this never happens?
> 
> You lost me here. What is the exact case?
> 

If we have HCR_TVM set, and the guest does a flush by SW, we currently
do nothing.

So for example:

1. Guest calls flush by SW
2. We flush and enable trapping
3. guest manipulates memory, and flushes again
4. We do nothing because trapping is enabled.
5. The guest touches some VM register (e.g. TTBR), we don't flush because
   now_enable == was_enabled == true.  We stop trapping.

does this mean the guest is so broken that we don't care?

> >>
> >> So you can think of these as two different, rather unrelated events:
> >> - Guest does S/W ops with caches on: we flush, knowing that this is a
> >> best effort thing, probably doomed, but hey...
> > 
> > ok, so Linux is basically doing something architecturally invalid?
> 
> Indeed. That area of the code is fairly old and covers all versions of
> the architecture, without much distinctions as to what is valid where...
> 
> >> - Guest turns its caches off: we flush, knowing that this is the right
> >> time to do it.
> >>
> >> Hope this helps,
> >>
> > 
> > Yes a bit, but this does feel incredibly brittle, but I know it may be
> > the best option we have....
> 
> Yup, this is probably the worse part of the architecture. We have to
> work our way through both architectural issues (inherited from previous
> revisions), and legacy code that has hardly evolved since ARMv4 (and
> basically works by luck, and the lack of a system cache).
> 
Yikes!

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 66ce176..7b01523 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -38,6 +38,16 @@  static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 	vcpu->arch.hcr = HCR_GUEST_MASK;
 }
 
+static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.hcr;
+}
+
+static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
+{
+	vcpu->arch.hcr = hcr;
+}
+
 static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
 {
 	return 1;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 254e065..04b4ea0 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -125,9 +125,6 @@  struct kvm_vcpu_arch {
 	 * Anything that is not used directly from assembly code goes
 	 * here.
 	 */
-	/* dcache set/way operation pending */
-	int last_pcpu;
-	cpumask_t require_dcache_flush;
 
 	/* Don't run the guest on this vcpu */
 	bool pause;
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 63e0ecc..286644c 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -190,7 +190,8 @@  static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
 
 #define kvm_virt_to_phys(x)		virt_to_idmap((unsigned long)(x))
 
-void stage2_flush_vm(struct kvm *kvm);
+void kvm_set_way_flush(struct kvm_vcpu *vcpu);
+void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
 #endif	/* !__ASSEMBLY__ */
 
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 2d6d910..0b0d58a 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -281,15 +281,6 @@  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	vcpu->cpu = cpu;
 	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
 
-	/*
-	 * Check whether this vcpu requires the cache to be flushed on
-	 * this physical CPU. This is a consequence of doing dcache
-	 * operations by set/way on this vcpu. We do it here to be in
-	 * a non-preemptible section.
-	 */
-	if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
-		flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
-
 	kvm_arm_set_running_vcpu(vcpu);
 }
 
@@ -541,7 +532,6 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
 
 		vcpu->mode = OUTSIDE_GUEST_MODE;
-		vcpu->arch.last_pcpu = smp_processor_id();
 		kvm_guest_exit();
 		trace_kvm_exit(*vcpu_pc(vcpu));
 		/*
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 7928dbd..0afcc00 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -189,82 +189,40 @@  static bool access_l2ectlr(struct kvm_vcpu *vcpu,
 	return true;
 }
 
-/* See note at ARM ARM B1.14.4 */
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ */
 static bool access_dcsw(struct kvm_vcpu *vcpu,
 			const struct coproc_params *p,
 			const struct coproc_reg *r)
 {
-	unsigned long val;
-	int cpu;
-
 	if (!p->is_write)
 		return read_from_write_only(vcpu, p);
 
-	cpu = get_cpu();
-
-	cpumask_setall(&vcpu->arch.require_dcache_flush);
-	cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
-
-	/* If we were already preempted, take the long way around */
-	if (cpu != vcpu->arch.last_pcpu) {
-		flush_cache_all();
-		goto done;
-	}
-
-	val = *vcpu_reg(vcpu, p->Rt1);
-
-	switch (p->CRm) {
-	case 6:			/* Upgrade DCISW to DCCISW, as per HCR.SWIO */
-	case 14:		/* DCCISW */
-		asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val));
-		break;
-
-	case 10:		/* DCCSW */
-		asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val));
-		break;
-	}
-
-done:
-	put_cpu();
-
+	kvm_set_way_flush(vcpu);
 	return true;
 }
 
 /*
  * Generic accessor for VM registers. Only called as long as HCR_TVM
- * is set.
+ * is set.  If the guest enables the MMU, we stop trapping the VM
+ * sys_regs and leave it in complete control of the caches.
+ *
+ * Used by the cpu-specific code.
  */
 static bool access_vm_reg(struct kvm_vcpu *vcpu,
 			  const struct coproc_params *p,
 			  const struct coproc_reg *r)
 {
+	bool was_enabled = vcpu_has_cache_enabled(vcpu);
+
 	BUG_ON(!p->is_write);
 
 	vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
 	if (p->is_64bit)
 		vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
 
-	return true;
-}
-
-/*
- * SCTLR accessor. Only called as long as HCR_TVM is set.  If the
- * guest enables the MMU, we stop trapping the VM sys_regs and leave
- * it in complete control of the caches.
- *
- * Used by the cpu-specific code.
- */
-bool access_sctlr(struct kvm_vcpu *vcpu,
-		  const struct coproc_params *p,
-		  const struct coproc_reg *r)
-{
-	access_vm_reg(vcpu, p, r);
-
-	if (vcpu_has_cache_enabled(vcpu)) {	/* MMU+Caches enabled? */
-		vcpu->arch.hcr &= ~HCR_TVM;
-		stage2_flush_vm(vcpu->kvm);
-	}
-
+	kvm_toggle_cache(vcpu, was_enabled);
 	return true;
 }
 
diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c
index e6f4ae4..a713675 100644
--- a/arch/arm/kvm/coproc_a15.c
+++ b/arch/arm/kvm/coproc_a15.c
@@ -34,7 +34,7 @@ 
 static const struct coproc_reg a15_regs[] = {
 	/* SCTLR: swapped by interrupt.S. */
 	{ CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
-			access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
+			access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 },
 };
 
 static struct kvm_coproc_target_table a15_target_table = {
diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c
index 17fc7cd..b19e46d 100644
--- a/arch/arm/kvm/coproc_a7.c
+++ b/arch/arm/kvm/coproc_a7.c
@@ -37,7 +37,7 @@ 
 static const struct coproc_reg a7_regs[] = {
 	/* SCTLR: swapped by interrupt.S. */
 	{ CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
-			access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
+			access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 },
 };
 
 static struct kvm_coproc_target_table a7_target_table = {
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1dc9778..106737e 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -278,7 +278,7 @@  static void stage2_flush_memslot(struct kvm *kvm,
  * Go through the stage 2 page tables and invalidate any cache lines
  * backing memory already mapped to the VM.
  */
-void stage2_flush_vm(struct kvm *kvm)
+static void stage2_flush_vm(struct kvm *kvm)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
@@ -1411,3 +1411,71 @@  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 	unmap_stage2_range(kvm, gpa, size);
 	spin_unlock(&kvm->mmu_lock);
 }
+
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ *
+ * Main problems:
+ * - S/W ops are local to a CPU (not broadcast)
+ * - We have line migration behind our back (speculation)
+ * - System caches don't support S/W at all (damn!)
+ *
+ * In the face of the above, the best we can do is to try and convert
+ * S/W ops to VA ops. Because the guest is not allowed to infer the
+ * S/W to PA mapping, it can only use S/W to nuke the whole cache,
+ * which is a rather good thing for us.
+ *
+ * Also, it is only used when turning caches on/off ("The expected
+ * usage of the cache maintenance instructions that operate by set/way
+ * is associated with the cache maintenance instructions associated
+ * with the powerdown and powerup of caches, if this is required by
+ * the implementation.").
+ *
+ * We use the following policy:
+ *
+ * - If we trap a S/W operation, we enable VM trapping to detect
+ *   caches being turned on/off, and do a full clean.
+ *
+ * - We flush the caches on both caches being turned on and off.
+ *
+ * - Once the caches are enabled, we stop trapping VM ops.
+ */
+void kvm_set_way_flush(struct kvm_vcpu *vcpu)
+{
+	unsigned long hcr = vcpu_get_hcr(vcpu);
+
+	/*
+	 * If this is the first time we do a S/W operation
+	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
+	 * VM trapping.
+	 *
+	 * Otherwise, rely on the VM trapping to wait for the MMU +
+	 * Caches to be turned off. At that point, we'll be able to
+	 * clean the caches again.
+	 */
+	if (!(hcr & HCR_TVM)) {
+		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
+					vcpu_has_cache_enabled(vcpu));
+		stage2_flush_vm(vcpu->kvm);
+		vcpu_set_hcr(vcpu, hcr | HCR_TVM);
+	}
+}
+
+void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
+{
+	bool now_enabled = vcpu_has_cache_enabled(vcpu);
+
+	/*
+	 * If switching the MMU+caches on, need to invalidate the caches.
+	 * If switching it off, need to clean the caches.
+	 * Clean + invalidate does the trick always.
+	 */
+	if (now_enabled != was_enabled)
+		stage2_flush_vm(vcpu->kvm);
+
+	/* Caches are now on, stop trapping VM ops (until a S/W op) */
+	if (now_enabled)
+		vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);
+
+	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
+}
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index b1d640f..b6a6e71 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -223,6 +223,45 @@  TRACE_EVENT(kvm_hvc,
 		  __entry->vcpu_pc, __entry->r0, __entry->imm)
 );
 
+TRACE_EVENT(kvm_set_way_flush,
+	    TP_PROTO(unsigned long vcpu_pc, bool cache),
+	    TP_ARGS(vcpu_pc, cache),
+
+	    TP_STRUCT__entry(
+		    __field(	unsigned long,	vcpu_pc		)
+		    __field(	bool,		cache		)
+	    ),
+
+	    TP_fast_assign(
+		    __entry->vcpu_pc		= vcpu_pc;
+		    __entry->cache		= cache;
+	    ),
+
+	    TP_printk("S/W flush at 0x%016lx (cache %s)",
+		      __entry->vcpu_pc, __entry->cache ? "on" : "off")
+);
+
+TRACE_EVENT(kvm_toggle_cache,
+	    TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
+	    TP_ARGS(vcpu_pc, was, now),
+
+	    TP_STRUCT__entry(
+		    __field(	unsigned long,	vcpu_pc		)
+		    __field(	bool,		was		)
+		    __field(	bool,		now		)
+	    ),
+
+	    TP_fast_assign(
+		    __entry->vcpu_pc		= vcpu_pc;
+		    __entry->was		= was;
+		    __entry->now		= now;
+	    ),
+
+	    TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
+		      __entry->vcpu_pc, __entry->was ? "on" : "off",
+		      __entry->now ? "on" : "off")
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 865a7e2..3cb4c85 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -45,6 +45,16 @@  static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 		vcpu->arch.hcr_el2 &= ~HCR_RW;
 }
 
+static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.hcr_el2;
+}
+
+static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
+{
+	vcpu->arch.hcr_el2 = hcr;
+}
+
 static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
 {
 	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 0b7dfdb..acd101a 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -116,9 +116,6 @@  struct kvm_vcpu_arch {
 	 * Anything that is not used directly from assembly code goes
 	 * here.
 	 */
-	/* dcache set/way operation pending */
-	int last_pcpu;
-	cpumask_t require_dcache_flush;
 
 	/* Don't run the guest */
 	bool pause;
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 14a74f1..92d22e9 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -260,7 +260,8 @@  static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
 
 #define kvm_virt_to_phys(x)		__virt_to_phys((unsigned long)(x))
 
-void stage2_flush_vm(struct kvm *kvm);
+void kvm_set_way_flush(struct kvm_vcpu *vcpu);
+void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 3d7c2df..95daa73 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -69,68 +69,31 @@  static u32 get_ccsidr(u32 csselr)
 	return ccsidr;
 }
 
-static void do_dc_cisw(u32 val)
-{
-	asm volatile("dc cisw, %x0" : : "r" (val));
-	dsb(ish);
-}
-
-static void do_dc_csw(u32 val)
-{
-	asm volatile("dc csw, %x0" : : "r" (val));
-	dsb(ish);
-}
-
-/* See note at ARM ARM B1.14.4 */
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ */
 static bool access_dcsw(struct kvm_vcpu *vcpu,
 			const struct sys_reg_params *p,
 			const struct sys_reg_desc *r)
 {
-	unsigned long val;
-	int cpu;
-
 	if (!p->is_write)
 		return read_from_write_only(vcpu, p);
 
-	cpu = get_cpu();
-
-	cpumask_setall(&vcpu->arch.require_dcache_flush);
-	cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
-
-	/* If we were already preempted, take the long way around */
-	if (cpu != vcpu->arch.last_pcpu) {
-		flush_cache_all();
-		goto done;
-	}
-
-	val = *vcpu_reg(vcpu, p->Rt);
-
-	switch (p->CRm) {
-	case 6:			/* Upgrade DCISW to DCCISW, as per HCR.SWIO */
-	case 14:		/* DCCISW */
-		do_dc_cisw(val);
-		break;
-
-	case 10:		/* DCCSW */
-		do_dc_csw(val);
-		break;
-	}
-
-done:
-	put_cpu();
-
+	kvm_set_way_flush(vcpu);
 	return true;
 }
 
 /*
  * Generic accessor for VM registers. Only called as long as HCR_TVM
- * is set.
+ * is set. If the guest enables the MMU, we stop trapping the VM
+ * sys_regs and leave it in complete control of the caches.
  */
 static bool access_vm_reg(struct kvm_vcpu *vcpu,
 			  const struct sys_reg_params *p,
 			  const struct sys_reg_desc *r)
 {
 	unsigned long val;
+	bool was_enabled = vcpu_has_cache_enabled(vcpu);
 
 	BUG_ON(!p->is_write);
 
@@ -143,25 +106,7 @@  static bool access_vm_reg(struct kvm_vcpu *vcpu,
 		vcpu_cp15_64_low(vcpu, r->reg) = val & 0xffffffffUL;
 	}
 
-	return true;
-}
-
-/*
- * SCTLR_EL1 accessor. Only called as long as HCR_TVM is set.  If the
- * guest enables the MMU, we stop trapping the VM sys_regs and leave
- * it in complete control of the caches.
- */
-static bool access_sctlr(struct kvm_vcpu *vcpu,
-			 const struct sys_reg_params *p,
-			 const struct sys_reg_desc *r)
-{
-	access_vm_reg(vcpu, p, r);
-
-	if (vcpu_has_cache_enabled(vcpu)) {	/* MMU+Caches enabled? */
-		vcpu->arch.hcr_el2 &= ~HCR_TVM;
-		stage2_flush_vm(vcpu->kvm);
-	}
-
+	kvm_toggle_cache(vcpu, was_enabled);
 	return true;
 }
 
@@ -377,7 +322,7 @@  static const struct sys_reg_desc sys_reg_descs[] = {
 	  NULL, reset_mpidr, MPIDR_EL1 },
 	/* SCTLR_EL1 */
 	{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000),
-	  access_sctlr, reset_val, SCTLR_EL1, 0x00C50078 },
+	  access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
 	/* CPACR_EL1 */
 	{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010),
 	  NULL, reset_val, CPACR_EL1, 0 },
@@ -657,7 +602,7 @@  static const struct sys_reg_desc cp14_64_regs[] = {
  * register).
  */
 static const struct sys_reg_desc cp15_regs[] = {
-	{ Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_sctlr, NULL, c1_SCTLR },
+	{ Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR },
 	{ Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
 	{ Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
 	{ Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR },