
[4/4] arm/arm64: KVM: use kernel mapping to perform invalidation on page fault

Message ID 1420718349-24152-5-git-send-email-marc.zyngier@arm.com (mailing list archive)
State New, archived

Commit Message

Marc Zyngier Jan. 8, 2015, 11:59 a.m. UTC
When handling a fault in stage-2, we need to resync I$ and D$, just
to be sure we don't leave any old cache line behind.

That's very good, except that we do so using the *user* address.
Under heavy load (swapping like crazy), we may end up in a situation
where the page gets mapped in stage-2 while being unmapped from
userspace by another CPU.

At that point, the DC/IC instructions can generate a fault, which
we handle with kvm->mmu_lock held. The box quickly deadlocks, user
is unhappy.

Instead, perform this invalidation through the kernel mapping,
which is guaranteed to be present. The box is much happier, and so
am I.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm/include/asm/kvm_mmu.h   | 37 ++++++++++++++++++++++++++++---------
 arch/arm/kvm/mmu.c               | 12 ++++++++----
 arch/arm64/include/asm/kvm_mmu.h | 13 ++++++++-----
 3 files changed, 44 insertions(+), 18 deletions(-)

Comments

Peter Maydell Jan. 8, 2015, 12:30 p.m. UTC | #1
On 8 January 2015 at 11:59, Marc Zyngier <marc.zyngier@arm.com> wrote:
> @@ -180,10 +177,32 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>          *
>          * VIVT caches are tagged using both the ASID and the VMID and doesn't
>          * need any kind of flushing (DDI 0406C.b - Page B3-1392).
> +        *
> +        * We need to do this through a kernel mapping (using the
> +        * user-space mapping has proved to be the wrong
> +        * solution). For that, we need to kmap one page at a time,
> +        * and iterate over the range.
>          */
> -       if (icache_is_pipt()) {
> -               __cpuc_coherent_user_range(hva, hva + size);
> -       } else if (!icache_is_vivt_asid_tagged()) {
> +
> +       VM_BUG_ON(size & PAGE_MASK);
> +
> +       while (size) {
> +               void *va = kmap_atomic_pfn(pfn);
> +
> +               if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
> +                       kvm_flush_dcache_to_poc(va, PAGE_SIZE);
> +
> +               if (icache_is_pipt())
> +                       __cpuc_coherent_user_range((unsigned long)va,
> +                                                  (unsigned long)va + PAGE_SIZE);
> +
> +               size -= PAGE_SIZE;
> +               pfn++;
> +
> +               kunmap_atomic(va);
> +       }

If (vcpu_has_cache_enabled(vcpu) && !ipa_uncached && !icache_is_pipt())
then we're going to run round this loop mapping and unmapping without
actually doing anything. Is it worth hoisting that check out of the
loop? (I think it's going to be the common case for a non-PIPT icache,
right?)

> +       if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
>                 /* any kind of VIPT cache */
>                 __flush_icache_all();
>         }

Can you remind me why it's OK not to flush the icache for an
ASID tagged VIVT icache? Making this page coherent might actually
be revealing a change in the instructions associated with the VA,
mightn't it?

thanks
-- PMM
Marc Zyngier Jan. 8, 2015, 1:07 p.m. UTC | #2
On 08/01/15 12:30, Peter Maydell wrote:
> On 8 January 2015 at 11:59, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> @@ -180,10 +177,32 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>>          *
>>          * VIVT caches are tagged using both the ASID and the VMID and doesn't
>>          * need any kind of flushing (DDI 0406C.b - Page B3-1392).
>> +        *
>> +        * We need to do this through a kernel mapping (using the
>> +        * user-space mapping has proved to be the wrong
>> +        * solution). For that, we need to kmap one page at a time,
>> +        * and iterate over the range.
>>          */
>> -       if (icache_is_pipt()) {
>> -               __cpuc_coherent_user_range(hva, hva + size);
>> -       } else if (!icache_is_vivt_asid_tagged()) {
>> +
>> +       VM_BUG_ON(size & PAGE_MASK);
>> +
>> +       while (size) {
>> +               void *va = kmap_atomic_pfn(pfn);
>> +
>> +               if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
>> +                       kvm_flush_dcache_to_poc(va, PAGE_SIZE);
>> +
>> +               if (icache_is_pipt())
>> +                       __cpuc_coherent_user_range((unsigned long)va,
>> +                                                  (unsigned long)va + PAGE_SIZE);
>> +
>> +               size -= PAGE_SIZE;
>> +               pfn++;
>> +
>> +               kunmap_atomic(va);
>> +       }
> 
> If (vcpu_has_cache_enabled(vcpu) && !ipa_uncached && !icache_is_pipt())
> then we're going to run round this loop mapping and unmapping without
> actually doing anything. Is it worth hoisting that check out of the
> loop? (I think it's going to be the common case for a non-PIPT icache,
> right?)

Yup, that's a valid optimization.
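A rough sketch of what that hoisting could look like (an illustration only, not the respun patch; the helper names come from the hunk quoted above, and the size-alignment check is omitted):

static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
					       unsigned long size,
					       bool ipa_uncached)
{
	bool need_flush = !vcpu_has_cache_enabled(vcpu) || ipa_uncached;

	/* Nothing to do per page? Skip the kmap/kunmap dance entirely. */
	if (need_flush || icache_is_pipt()) {
		while (size) {
			void *va = kmap_atomic_pfn(pfn);

			if (need_flush)
				kvm_flush_dcache_to_poc(va, PAGE_SIZE);

			if (icache_is_pipt())
				__cpuc_coherent_user_range((unsigned long)va,
							   (unsigned long)va + PAGE_SIZE);

			size -= PAGE_SIZE;
			pfn++;

			kunmap_atomic(va);
		}
	}

	if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
		/* any kind of VIPT cache */
		__flush_icache_all();
	}
}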

>> +       if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
>>                 /* any kind of VIPT cache */
>>                 __flush_icache_all();
>>         }
> 
> Can you remind me why it's OK not to flush the icache for an
> ASID tagged VIVT icache? Making this page coherent might actually
> be revealing a change in the instructions associated with the VA,
> mightn't it?

ASID-tagged VIVT icaches are also VMID tagged. It is thus impossible for
stale cache lines to come with a new page. And if by synchronizing the
caches you obtain a different instruction stream, it means you've
restored the wrong page.

Thanks,

	M.
Peter Maydell Jan. 8, 2015, 1:16 p.m. UTC | #3
On 8 January 2015 at 13:07, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> Can you remind me why it's OK not to flush the icache for an
>> ASID tagged VIVT icache? Making this page coherent might actually
>> be revealing a change in the instructions associated with the VA,
>> mightn't it?
>
> ASID cached VIVT icaches are also VMID tagged. It is thus impossible for
> stale cache lines to come with a new page. And if by synchronizing the
> caches you obtain a different instruction stream, it means you've
> restored the wrong page.

...is that true even if the dirty data in the dcache comes from
the userspace process doing DMA or writing the initial boot
image or whatever?

-- PMM
Marc Zyngier Jan. 8, 2015, 3:06 p.m. UTC | #4
On 08/01/15 13:16, Peter Maydell wrote:
> On 8 January 2015 at 13:07, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>> Can you remind me why it's OK not to flush the icache for an
>>> ASID tagged VIVT icache? Making this page coherent might actually
>>> be revealing a change in the instructions associated with the VA,
>>> mightn't it?
>>
>> ASID cached VIVT icaches are also VMID tagged. It is thus impossible for
>> stale cache lines to come with a new page. And if by synchronizing the
>> caches you obtain a different instruction stream, it means you've
>> restored the wrong page.
> 
> ...is that true even if the dirty data in the dcache comes from
> the userspace process doing DMA or writing the initial boot
> image or whatever?

We perform this on a page that is being brought in stage-2. Two cases:

- This is a page mapped for the first time: the icache should be
invalid for this page (the guest should have invalidated it in the first
place),

- This is a page that we bring back from swap: the page must match the
one that has been swapped out. If it has been DMA'ed in in the meantime,
then the guest surely has flushed its icache if it intends to branch to
it, hasn't it?

I have the feeling I'm missing something from your question...

	M.
Peter Maydell Jan. 8, 2015, 3:21 p.m. UTC | #5
On 8 January 2015 at 15:06, Marc Zyngier <marc.zyngier@arm.com> wrote:
> On 08/01/15 13:16, Peter Maydell wrote:
>>> ASID cached VIVT icaches are also VMID tagged. It is thus impossible for
>>> stale cache lines to come with a new page. And if by synchronizing the
>>> caches you obtain a different instruction stream, it means you've
>>> restored the wrong page.
>>
>> ...is that true even if the dirty data in the dcache comes from
>> the userspace process doing DMA or writing the initial boot
>> image or whatever?
>
> We perform this on a page that is being brought in stage-2. Two cases:
>
> - This is a page is mapped for the first time: the icache should be
> invalid for this page (the guest should have invalidated it the first
> place),

If this is the first instruction in the guest (ie we've just
(warm) reset the VM and are running the kernel as loaded into the guest
by QEMU/kvmtool) then the guest can't have invalidated the icache,
and QEMU can't do the invalidate because it doesn't have the vaddr
and VMID of the guest.

> - This is a page that we bring back from swap: the page must match the
> one that has been swapped out. If it has been DMA'ed in in the meantime,
> then the guest surely has flushed its icache if it intends to branch to
> it, hasn't it?

I agree that for the DMA case the guest will have done the invalidate.

-- PMM
Christoffer Dall Jan. 9, 2015, 12:50 p.m. UTC | #6
On Thu, Jan 08, 2015 at 03:21:50PM +0000, Peter Maydell wrote:
> On 8 January 2015 at 15:06, Marc Zyngier <marc.zyngier@arm.com> wrote:
> > On 08/01/15 13:16, Peter Maydell wrote:
> >>> ASID cached VIVT icaches are also VMID tagged. It is thus impossible for
> >>> stale cache lines to come with a new page. And if by synchronizing the
> >>> caches you obtain a different instruction stream, it means you've
> >>> restored the wrong page.
> >>
> >> ...is that true even if the dirty data in the dcache comes from
> >> the userspace process doing DMA or writing the initial boot
> >> image or whatever?
> >
> > We perform this on a page that is being brought in stage-2. Two cases:
> >
> > - This is a page is mapped for the first time: the icache should be
> > invalid for this page (the guest should have invalidated it the first
> > place),
> 
> If this is the first instruction in the guest (ie we've just
> (warm) reset the VM and are running the kernel as loaded into the guest
> by QEMU/kvmtool) then the guest can't have invalidated the icache,
> and QEMU can't do the invalidate because it doesn't have the vaddr
> and VMID of the guest.
> 
The guest must clean its icache before turning on the MMU, no?

Whenever we reuse a VMID (rollover), we flush the entire icache for that
vmid.

-Christoffer
Christoffer Dall Jan. 9, 2015, 12:51 p.m. UTC | #7
On Thu, Jan 08, 2015 at 11:59:09AM +0000, Marc Zyngier wrote:
> When handling a fault in stage-2, we need to resync I$ and D$, just
> to be sure we don't leave any old cache line behind.
> 
> That's very good, except that we do so using the *user* address.
> Under heavy load (swapping like crazy), we may end up in a situation
> where the page gets mapped in stage-2 while being unmapped from
> userspace by another CPU.
> 
> At that point, the DC/IC instructions can generate a fault, which
> we handle with kvm->mmu_lock held. The box quickly deadlocks, user
> is unhappy.
> 
> Instead, perform this invalidation through the kernel mapping,
> which is guaranteed to be present. The box is much happier, and so
> am I.
> 
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>

This looks good to me!

Thanks,
-Christoffer
Peter Maydell Jan. 9, 2015, 1:03 p.m. UTC | #8
On 9 January 2015 at 12:50, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Thu, Jan 08, 2015 at 03:21:50PM +0000, Peter Maydell wrote:
>> If this is the first instruction in the guest (ie we've just
>> (warm) reset the VM and are running the kernel as loaded into the guest
>> by QEMU/kvmtool) then the guest can't have invalidated the icache,
>> and QEMU can't do the invalidate because it doesn't have the vaddr
>> and VMID of the guest.
>>
> The guest must clean its icache before turning on the MMU, no?

Yes, but to execute the "clean icache" insn inside the guest,
the guest is executing instructions, so we'd better not have
stale info for that code...

> Whenever we reuse a VMID (rollover), we flush the entire icache for that
> vmid.

When we reset a cpu by re-calling KVM_ARM_VCPU_INIT, that doesn't
mean we get a new VMID for it, though, does it? I thought that
what causes the icache flush to happen for the reset guest is
that we unmap all of stage 2 and then fault it back in, via
this code. That works for PIPT (we flush the range) and for
VIPT (we do a full icache flush), but at the moment for VIVT
ASID tagged we assume we can do nothing, and I don't think that's
right for this case (though it is right for "faulted because
page was swapped out" and OK for "page was written by DMA").

-- PMM
Marc Zyngier Jan. 9, 2015, 2:16 p.m. UTC | #9
On 09/01/15 13:03, Peter Maydell wrote:
> On 9 January 2015 at 12:50, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
>> On Thu, Jan 08, 2015 at 03:21:50PM +0000, Peter Maydell wrote:
>>> If this is the first instruction in the guest (ie we've just
>>> (warm) reset the VM and are running the kernel as loaded into the guest
>>> by QEMU/kvmtool) then the guest can't have invalidated the icache,
>>> and QEMU can't do the invalidate because it doesn't have the vaddr
>>> and VMID of the guest.
>>>
>> The guest must clean its icache before turning on the MMU, no?
> 
> Yes, but to execute the "clean icache" insn inside the guest,
> the guest is executing instructions, so we'd better not have
> stale info for that code...

But we never start a guest with caches on. It has to enable them on its own.

>> Whenever we reuse a VMID (rollover), we flush the entire icache for that
>> vmid.
> 
> When we reset a cpu by re-calling KVM_ARM_VCPU_INIT, that doesn't
> mean we get a new VMID for it, though, does it? I thought that
> what causes the icache flush to happen for the reset guest is
> that we unmap all of stage 2 and then fault it back in, via
> this code. That works for PIPT (we flush the range) and for
> VIPT (we do a full icache flush), but at the moment for VIVT
> ASID tagged we assume we can do nothing, and I don't think that's
> right for this case (though it is right for "faulted because
> page was swapped out" and OK for "page was written by DMA").

When we reset the guest, we also turn its icache off. Before
turning it back on, the guest has to invalidate it (the ARM ARM doesn't
seem to define the state of the cache out of reset).

	M.
Peter Maydell Jan. 9, 2015, 3:28 p.m. UTC | #10
On 9 January 2015 at 14:16, Marc Zyngier <marc.zyngier@arm.com> wrote:
> On 09/01/15 13:03, Peter Maydell wrote:
>> When we reset a cpu by re-calling KVM_ARM_VCPU_INIT, that doesn't
>> mean we get a new VMID for it, though, does it? I thought that
>> what causes the icache flush to happen for the reset guest is
>> that we unmap all of stage 2 and then fault it back in, via
>> this code. That works for PIPT (we flush the range) and for
>> VIPT (we do a full icache flush), but at the moment for VIVT
>> ASID tagged we assume we can do nothing, and I don't think that's
>> right for this case (though it is right for "faulted because
>> page was swapped out" and OK for "page was written by DMA").
>
> When we reset the guest, we also turn both its Icache off. Before
> turning it back on, the guest has to invalidate it (the ARM ARM doesn't
> seem to define the state of the cache out of reset).

But implementations are allowed to hit in the cache even
when the cache is disabled. In particular, setting the guest
SCTLR_EL1.I to 0 means "iside accesses are Normal Noncacheable
for stage 1 attributes" and (v8 ARM ARM D3.4.6) these can
be held in the instruction cache. So the guest is required
to do an icache-invalidate for any instructions that it writes
*itself* (or DMAs in) even with the icache off. But it cannot
possibly do so for its own initial startup code, and it must
be the job of KVM to do that for it. (You can think of this as
"the VCPU provided by KVM always invalidates the icache on reset
and does not require an impdef magic cache-init routine as
described by D3.4.4" if you like.)

-- PMM
Marc Zyngier Jan. 9, 2015, 5:18 p.m. UTC | #11
On 09/01/15 15:28, Peter Maydell wrote:
> On 9 January 2015 at 14:16, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> On 09/01/15 13:03, Peter Maydell wrote:
>>> When we reset a cpu by re-calling KVM_ARM_VCPU_INIT, that doesn't
>>> mean we get a new VMID for it, though, does it? I thought that
>>> what causes the icache flush to happen for the reset guest is
>>> that we unmap all of stage 2 and then fault it back in, via
>>> this code. That works for PIPT (we flush the range) and for
>>> VIPT (we do a full icache flush), but at the moment for VIVT
>>> ASID tagged we assume we can do nothing, and I don't think that's
>>> right for this case (though it is right for "faulted because
>>> page was swapped out" and OK for "page was written by DMA").
>>
>> When we reset the guest, we also turn both its Icache off. Before
>> turning it back on, the guest has to invalidate it (the ARM ARM doesn't
>> seem to define the state of the cache out of reset).
> 
> But implementations are allowed to hit in the cache even
> when the cache is disabled. In particular, setting the guest
> SCTLR_EL1.I to 0 means "iside accesses are Normal Noncacheable
> for stage 1 attributes" and (v8 ARM ARM D3.4.6) these can
> be held in the instruction cache. So the guest is required
> to do an icache-invalidate for any instructions that it writes
> *itself* (or DMAs in) even with the icache off. But it cannot
> possibly do so for its own initial startup code, and it must
> be the job of KVM to do that for it. (You can think of this as
> "the VCPU provided by KVM always invalidates the icache on reset
> and does not require an impdef magic cache-init routine as
> described by D3.4.4" if you like.)

Right. I think I finally see your point. We've been safe so far only
because no ASID-tagged VIVT icache platform is virtualisation capable, AFAIK.

Probably best to just nuke the icache always for non-PIPT caches, then.

I'll fold that in when I respin the series.

Thanks,

	M.
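A sketch of what "nuke the icache always for non-PIPT caches" could look like at the tail of __coherent_cache_guest_page() (illustration only; whether the respin actually dropped the ASID-tagged exemption is not shown here):

	/*
	 * Sketch: treat ASID-tagged VIVT like any other aliasing icache and
	 * invalidate it all, so a warm-reset guest never executes stale lines.
	 */
	if (!icache_is_pipt()) {
		/* any kind of VIPT or ASID-tagged VIVT cache */
		__flush_icache_all();
	}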
Christoffer Dall Jan. 11, 2015, 12:33 p.m. UTC | #12
On Fri, Jan 09, 2015 at 03:28:58PM +0000, Peter Maydell wrote:
> On 9 January 2015 at 14:16, Marc Zyngier <marc.zyngier@arm.com> wrote:
> > On 09/01/15 13:03, Peter Maydell wrote:
> >> When we reset a cpu by re-calling KVM_ARM_VCPU_INIT, that doesn't
> >> mean we get a new VMID for it, though, does it? I thought that
> >> what causes the icache flush to happen for the reset guest is
> >> that we unmap all of stage 2 and then fault it back in, via
> >> this code. That works for PIPT (we flush the range) and for
> >> VIPT (we do a full icache flush), but at the moment for VIVT
> >> ASID tagged we assume we can do nothing, and I don't think that's
> >> right for this case (though it is right for "faulted because
> >> page was swapped out" and OK for "page was written by DMA").
> >
> > When we reset the guest, we also turn both its Icache off. Before
> > turning it back on, the guest has to invalidate it (the ARM ARM doesn't
> > seem to define the state of the cache out of reset).
> 
> But implementations are allowed to hit in the cache even
> when the cache is disabled. In particular, setting the guest

But how can it hit anything when the icache for the used VMID is
guaranteed to be clear (maybe that requires another full icache
invalidate for that VMID for PSCI reset)?

-Christoffer

> SCTLR_EL1.I to 0 means "iside accesses are Normal Noncacheable
> for stage 1 attributes" and (v8 ARM ARM D3.4.6) these can
> be held in the instruction cache. So the guest is required
> to do an icache-invalidate for any instructions that it writes
> *itself* (or DMAs in) even with the icache off. But it cannot
> possibly do so for its own initial startup code, and it must
> be the job of KVM to do that for it. (You can think of this as
> "the VCPU provided by KVM always invalidates the icache on reset
> and does not require an impdef magic cache-init routine as
> described by D3.4.4" if you like.)
> 
Peter Maydell Jan. 11, 2015, 5:37 p.m. UTC | #13
On 11 January 2015 at 12:33, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Fri, Jan 09, 2015 at 03:28:58PM +0000, Peter Maydell wrote:
>> But implementations are allowed to hit in the cache even
>> when the cache is disabled. In particular, setting the guest
>
> But how can it hit anything when the icache for the used VMID is
> guaranteed to be clear (maybe that requires another full icache
> invalidate for that VMID for PSCI reset)?

The point is that at the moment we don't do anything to
guarantee that we've cleared the icache. (Plus could there be
stale data in the icache for this physical CPU for this VMID
because we've run some other vCPU on it? Or does the process
of rescheduling vCPUs across pCPUs and guest ASID management
deal with that?)

You probably want to clear the icache on vcpu (re-)init rather
than reset, though (no guarantee that userspace is going to
handle system resets via PSCI).

-- PMM
Christoffer Dall Jan. 11, 2015, 5:58 p.m. UTC | #14
On Sun, Jan 11, 2015 at 05:37:52PM +0000, Peter Maydell wrote:
> On 11 January 2015 at 12:33, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
> > On Fri, Jan 09, 2015 at 03:28:58PM +0000, Peter Maydell wrote:
> >> But implementations are allowed to hit in the cache even
> >> when the cache is disabled. In particular, setting the guest
> >
> > But how can it hit anything when the icache for the used VMID is
> > guaranteed to be clear (maybe that requires another full icache
> > invalidate for that VMID for PSCI reset)?
> 
> The point is that at the moment we don't do anything to
> guarantee that we've cleared the icache. 

that's not entirely accurate: I assume all of the icache is
invalidated/cleaned at system bring-up time, and every time we re-use a
VMID (when we start a VMID rollover) we invalidate the entire icache.

> (Plus could there be
> stale data in the icache for this physical CPU for this VMID
> because we've run some other vCPU on it? Or does the process
> of rescheduling vCPUs across pCPUs and guest ASID management
> deal with that?)

we don't clear the icache for vCPUs migrating onto other pCPUs but
invalidating the icache on a page fault won't guarantee that either.  Do
we really need to do that?

> 
> You probably want to clear the icache on vcpu (re-)init rather
> than reset, though (no guarantee that userspace is going to
> handle system resets via PSCI).
> 
Yes, good point.

-Christoffer
Peter Maydell Jan. 11, 2015, 6:27 p.m. UTC | #15
On 11 January 2015 at 17:58, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Sun, Jan 11, 2015 at 05:37:52PM +0000, Peter Maydell wrote:
>> On 11 January 2015 at 12:33, Christoffer Dall
>> <christoffer.dall@linaro.org> wrote:
>> > On Fri, Jan 09, 2015 at 03:28:58PM +0000, Peter Maydell wrote:
>> >> But implementations are allowed to hit in the cache even
>> >> when the cache is disabled. In particular, setting the guest
>> >
>> > But how can it hit anything when the icache for the used VMID is
>> > guaranteed to be clear (maybe that requires another full icache
>> > invalidate for that VMID for PSCI reset)?
>>
>> The point is that at the moment we don't do anything to
>> guarantee that we've cleared the icache.
>
> that's not entirely accurate, I assume all of the icache is
> invalidated/cleaned at system bring-up time, and every time we re-use a
> VMID (when we start a VMID rollover) we invalidate the entire icache.

Right, but that doesn't catch the VM reset case, which is the
one we're talking about.

>> (Plus could there be
>> stale data in the icache for this physical CPU for this VMID
>> because we've run some other vCPU on it? Or does the process
>> of rescheduling vCPUs across pCPUs and guest ASID management
>> deal with that?)
>
> we don't clear the icache for vCPUs migrating onto other pCPUs but
> invalidating the icache on a page fault won't guarantee that either.  Do
> we really need to do that?

I don't think we do, but I haven't thought through exactly
why we don't yet :-)

-- PMM
Christoffer Dall Jan. 11, 2015, 6:38 p.m. UTC | #16
On Sun, Jan 11, 2015 at 06:27:35PM +0000, Peter Maydell wrote:
> On 11 January 2015 at 17:58, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
> > On Sun, Jan 11, 2015 at 05:37:52PM +0000, Peter Maydell wrote:
> >> On 11 January 2015 at 12:33, Christoffer Dall
> >> <christoffer.dall@linaro.org> wrote:
> >> > On Fri, Jan 09, 2015 at 03:28:58PM +0000, Peter Maydell wrote:
> >> >> But implementations are allowed to hit in the cache even
> >> >> when the cache is disabled. In particular, setting the guest
> >> >
> >> > But how can it hit anything when the icache for the used VMID is
> >> > guaranteed to be clear (maybe that requires another full icache
> >> > invalidate for that VMID for PSCI reset)?
> >>
> >> The point is that at the moment we don't do anything to
> >> guarantee that we've cleared the icache.
> >
> > that's not entirely accurate, I assume all of the icache is
> > invalidated/cleaned at system bring-up time, and every time we re-use a
> > VMID (when we start a VMID rollover) we invalidate the entire icache.
> 
> Right, but that doesn't catch the VM reset case, which is the
> one we're talking about.
> 

ok, so that's what you meant by warm reset, I see.

Then I would think we should add that single invalidate on vcpu init
rather than flushing the icache on every page fault?

> >> (Plus could there be
> >> stale data in the icache for this physical CPU for this VMID
> >> because we've run some other vCPU on it? Or does the process
> >> of rescheduling vCPUs across pCPUs and guest ASID management
> >> deal with that?)
> >
> > we don't clear the icache for vCPUs migrating onto other pCPUs but
> > invalidating the icache on a page fault won't guarantee that either.  Do
> > we really need to do that?
> 
> I don't think we do, but I haven't thought through exactly
> why we don't yet :-)
> 

So once you start a secondary vCPU, it can then hit in the icache
on what the primary vCPU put there, which I guess is different behavior
from a physical secondary core coming out of reset with the MMU off and
never hitting the icache, right?

And is this not also a different behavior from a native system once the
vCPUs have turned their MMUs on, but we just don't happen to observe it
as being a problem?

In any case, I don't have a great solution for how to solve this except
for always invalidating the icache when we migrate a vCPU to a pCPU, but
that's really nasty...

-Christoffer
Marc Zyngier Jan. 12, 2015, 9:58 a.m. UTC | #17
On 11/01/15 18:38, Christoffer Dall wrote:
> On Sun, Jan 11, 2015 at 06:27:35PM +0000, Peter Maydell wrote:
>> On 11 January 2015 at 17:58, Christoffer Dall
>> <christoffer.dall@linaro.org> wrote:
>>> On Sun, Jan 11, 2015 at 05:37:52PM +0000, Peter Maydell wrote:
>>>> On 11 January 2015 at 12:33, Christoffer Dall
>>>> <christoffer.dall@linaro.org> wrote:
>>>>> On Fri, Jan 09, 2015 at 03:28:58PM +0000, Peter Maydell wrote:
>>>>>> But implementations are allowed to hit in the cache even
>>>>>> when the cache is disabled. In particular, setting the guest
>>>>>
>>>>> But how can it hit anything when the icache for the used VMID is
>>>>> guaranteed to be clear (maybe that requires another full icache
>>>>> invalidate for that VMID for PSCI reset)?
>>>>
>>>> The point is that at the moment we don't do anything to
>>>> guarantee that we've cleared the icache.
>>>
>>> that's not entirely accurate, I assume all of the icache is
>>> invalidated/cleaned at system bring-up time, and every time we re-use a
>>> VMID (when we start a VMID rollover) we invalidate the entire icache.
>>
>> Right, but that doesn't catch the VM reset case, which is the
>> one we're talking about.
>>
> 
> ok, so that's what you meant by warm reset, I see.
> 
> Then I would think we should add that single invalidate on vcpu init
> rather than flushing the icache on every page fault?
> 
>>>> (Plus could there be
>>>> stale data in the icache for this physical CPU for this VMID
>>>> because we've run some other vCPU on it? Or does the process
>>>> of rescheduling vCPUs across pCPUs and guest ASID management
>>>> deal with that?)
>>>
>>> we don't clear the icache for vCPUs migrating onto other pCPUs but
>>> invalidating the icache on a page fault won't guarantee that either.  Do
>>> we really need to do that?
>>
>> I don't think we do, but I haven't thought through exactly
>> why we don't yet :-)
>>
> 
> So once you start a secondary vCPU that one can then hit in the icache
> from what the primary vCPU put there which I guess is different behavior
> from a physical secondary core coming out of reset with the MMU off and
> never hitting the icache, right?
> 
> And is this not also a different behavior from a native system once the
> vCPUs have turned their MMUs on, but we just don't happen to observe it
> as being a problem?
> 
> In any case, I don't have a great solution for how to solve this except
> for always invalidating the icache when we migrate a vCPU to a pCPU, but
> that's really nasty...

No, it only needs to happen once per vcpu, on any CPU. IC IALLUIS is
broadcast across CPUs, so once it has taken place on the first CPU this
vcpu runs on, we're good.

Thanks,

	M.
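For illustration, the "single invalidate on vcpu init" idea could look roughly like this; the helper name and call site are assumptions, not code from this series:

/* Hypothetical helper, assumed to be called once when a vcpu is (re-)initialised. */
static void kvm_vcpu_nuke_icache(struct kvm_vcpu *vcpu)
{
	/*
	 * The invalidate-all is broadcast (IC IALLUIS on v7 SMP), so it does
	 * not matter which pCPU this vcpu happens to run on first.
	 */
	__flush_icache_all();
}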
Christoffer Dall Jan. 12, 2015, 8:10 p.m. UTC | #18
On Mon, Jan 12, 2015 at 09:58:30AM +0000, Marc Zyngier wrote:
> On 11/01/15 18:38, Christoffer Dall wrote:
> > On Sun, Jan 11, 2015 at 06:27:35PM +0000, Peter Maydell wrote:
> >> On 11 January 2015 at 17:58, Christoffer Dall
> >> <christoffer.dall@linaro.org> wrote:
> >>> On Sun, Jan 11, 2015 at 05:37:52PM +0000, Peter Maydell wrote:
> >>>> On 11 January 2015 at 12:33, Christoffer Dall
> >>>> <christoffer.dall@linaro.org> wrote:
> >>>>> On Fri, Jan 09, 2015 at 03:28:58PM +0000, Peter Maydell wrote:
> >>>>>> But implementations are allowed to hit in the cache even
> >>>>>> when the cache is disabled. In particular, setting the guest
> >>>>>
> >>>>> But how can it hit anything when the icache for the used VMID is
> >>>>> guaranteed to be clear (maybe that requires another full icache
> >>>>> invalidate for that VMID for PSCI reset)?
> >>>>
> >>>> The point is that at the moment we don't do anything to
> >>>> guarantee that we've cleared the icache.
> >>>
> >>> that's not entirely accurate, I assume all of the icache is
> >>> invalidated/cleaned at system bring-up time, and every time we re-use a
> >>> VMID (when we start a VMID rollover) we invalidate the entire icache.
> >>
> >> Right, but that doesn't catch the VM reset case, which is the
> >> one we're talking about.
> >>
> > 
> > ok, so that's what you meant by warm reset, I see.
> > 
> > Then I would think we should add that single invalidate on vcpu init
> > rather than flushing the icache on every page fault?
> > 
> >>>> (Plus could there be
> >>>> stale data in the icache for this physical CPU for this VMID
> >>>> because we've run some other vCPU on it? Or does the process
> >>>> of rescheduling vCPUs across pCPUs and guest ASID management
> >>>> deal with that?)
> >>>
> >>> we don't clear the icache for vCPUs migrating onto other pCPUs but
> >>> invalidating the icache on a page fault won't guarantee that either.  Do
> >>> we really need to do that?
> >>
> >> I don't think we do, but I haven't thought through exactly
> >> why we don't yet :-)
> >>
> > 
> > So once you start a secondary vCPU that one can then hit in the icache
> > from what the primary vCPU put there which I guess is different behavior
> > from a physical secondary core coming out of reset with the MMU off and
> > never hitting the icache, right?
> > 
> > And is this not also a different behavior from a native system once the
> > vCPUs have turned their MMUs on, but we just don't happen to observe it
> > as being a problem?
> > 
> > In any case, I don't have a great solution for how to solve this except
> > for always invalidating the icache when we migrate a vCPU to a pCPU, but
> > that's really nasty...
> 
> No, it only needs to happen once per vcpu, on any CPU. IC IALLUIS is
> broadcast across CPUs, so once it has taken place on the first CPU this
> vcpu runs on, we're good.
> 
But if you compare strictly to a native system, wouldn't a vCPU be able
to hit in the icache suddenly if migrated onto a pCPU that has run code
for the same VM (with the same VMID) without having turned the MMU on?

-Christoffer
Marc Zyngier Jan. 13, 2015, 11:38 a.m. UTC | #19
On 12/01/15 20:10, Christoffer Dall wrote:
> On Mon, Jan 12, 2015 at 09:58:30AM +0000, Marc Zyngier wrote:
>> On 11/01/15 18:38, Christoffer Dall wrote:
>>> On Sun, Jan 11, 2015 at 06:27:35PM +0000, Peter Maydell wrote:
>>>> On 11 January 2015 at 17:58, Christoffer Dall
>>>> <christoffer.dall@linaro.org> wrote:
>>>>> On Sun, Jan 11, 2015 at 05:37:52PM +0000, Peter Maydell wrote:
>>>>>> On 11 January 2015 at 12:33, Christoffer Dall
>>>>>> <christoffer.dall@linaro.org> wrote:
>>>>>>> On Fri, Jan 09, 2015 at 03:28:58PM +0000, Peter Maydell wrote:
>>>>>>>> But implementations are allowed to hit in the cache even
>>>>>>>> when the cache is disabled. In particular, setting the guest
>>>>>>>
>>>>>>> But how can it hit anything when the icache for the used VMID is
>>>>>>> guaranteed to be clear (maybe that requires another full icache
>>>>>>> invalidate for that VMID for PSCI reset)?
>>>>>>
>>>>>> The point is that at the moment we don't do anything to
>>>>>> guarantee that we've cleared the icache.
>>>>>
>>>>> that's not entirely accurate, I assume all of the icache is
>>>>> invalidated/cleaned at system bring-up time, and every time we re-use a
>>>>> VMID (when we start a VMID rollover) we invalidate the entire icache.
>>>>
>>>> Right, but that doesn't catch the VM reset case, which is the
>>>> one we're talking about.
>>>>
>>>
>>> ok, so that's what you meant by warm reset, I see.
>>>
>>> Then I would think we should add that single invalidate on vcpu init
>>> rather than flushing the icache on every page fault?
>>>
>>>>>> (Plus could there be
>>>>>> stale data in the icache for this physical CPU for this VMID
>>>>>> because we've run some other vCPU on it? Or does the process
>>>>>> of rescheduling vCPUs across pCPUs and guest ASID management
>>>>>> deal with that?)
>>>>>
>>>>> we don't clear the icache for vCPUs migrating onto other pCPUs but
>>>>> invalidating the icache on a page fault won't guarantee that either.  Do
>>>>> we really need to do that?
>>>>
>>>> I don't think we do, but I haven't thought through exactly
>>>> why we don't yet :-)
>>>>
>>>
>>> So once you start a secondary vCPU that one can then hit in the icache
>>> from what the primary vCPU put there which I guess is different behavior
>>> from a physical secondary core coming out of reset with the MMU off and
>>> never hitting the icache, right?
>>>
>>> And is this not also a different behavior from a native system once the
>>> vCPUs have turned their MMUs on, but we just don't happen to observe it
>>> as being a problem?
>>>
>>> In any case, I don't have a great solution for how to solve this except
>>> for always invalidating the icache when we migrate a vCPU to a pCPU, but
>>> that's really nasty...
>>
>> No, it only needs to happen once per vcpu, on any CPU. IC IALLUIS is
>> broadcast across CPUs, so once it has taken place on the first CPU this
>> vcpu runs on, we're good.
>>
> But if you compare strictly to a native system, wouldn't a vCPU be able
> to hit in the icache suddenly if migrated onto a pCPU that has run code
> for the same VM (with the VMID) wihtout having tuned the MMU on?

Hmmm. Yes. ASID-tagged VIVT icaches are really turning my brain into
jelly, and that's not a pretty thing to see.

So we're back to your initial approach: Each time a vcpu is migrated to
another CPU while its MMU/icache is off, we nuke the icache.

Do we want that in now, or do we keep that for later, when we actually
see such an implementation?

	M.
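For illustration only, the scheme above (invalidate when a vcpu whose MMU/caches are still off is migrated to another pCPU) might be shaped like this; the helper name, its call site in the vcpu_load path, and the use of vcpu_has_cache_enabled() as the "caches off" test are all assumptions:

/* Hypothetical helper, assumed to run from the vcpu_load path. */
static void vcpu_nuke_icache_on_migration(struct kvm_vcpu *vcpu, int cpu)
{
	/*
	 * If this vcpu last ran on a different pCPU and its stage-1
	 * MMU/caches are still off, another vcpu may have left stale
	 * ASID-tagged VIVT lines for our VMID on this pCPU.
	 */
	if (vcpu->cpu != cpu && !vcpu_has_cache_enabled(vcpu))
		__flush_icache_all();
}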
Christoffer Dall Jan. 13, 2015, 12:04 p.m. UTC | #20
On Tue, Jan 13, 2015 at 11:38:54AM +0000, Marc Zyngier wrote:
> On 12/01/15 20:10, Christoffer Dall wrote:
> > On Mon, Jan 12, 2015 at 09:58:30AM +0000, Marc Zyngier wrote:
> >> On 11/01/15 18:38, Christoffer Dall wrote:
> >>> On Sun, Jan 11, 2015 at 06:27:35PM +0000, Peter Maydell wrote:
> >>>> On 11 January 2015 at 17:58, Christoffer Dall
> >>>> <christoffer.dall@linaro.org> wrote:
> >>>>> On Sun, Jan 11, 2015 at 05:37:52PM +0000, Peter Maydell wrote:
> >>>>>> On 11 January 2015 at 12:33, Christoffer Dall
> >>>>>> <christoffer.dall@linaro.org> wrote:
> >>>>>>> On Fri, Jan 09, 2015 at 03:28:58PM +0000, Peter Maydell wrote:
> >>>>>>>> But implementations are allowed to hit in the cache even
> >>>>>>>> when the cache is disabled. In particular, setting the guest
> >>>>>>>
> >>>>>>> But how can it hit anything when the icache for the used VMID is
> >>>>>>> guaranteed to be clear (maybe that requires another full icache
> >>>>>>> invalidate for that VMID for PSCI reset)?
> >>>>>>
> >>>>>> The point is that at the moment we don't do anything to
> >>>>>> guarantee that we've cleared the icache.
> >>>>>
> >>>>> that's not entirely accurate, I assume all of the icache is
> >>>>> invalidated/cleaned at system bring-up time, and every time we re-use a
> >>>>> VMID (when we start a VMID rollover) we invalidate the entire icache.
> >>>>
> >>>> Right, but that doesn't catch the VM reset case, which is the
> >>>> one we're talking about.
> >>>>
> >>>
> >>> ok, so that's what you meant by warm reset, I see.
> >>>
> >>> Then I would think we should add that single invalidate on vcpu init
> >>> rather than flushing the icache on every page fault?
> >>>
> >>>>>> (Plus could there be
> >>>>>> stale data in the icache for this physical CPU for this VMID
> >>>>>> because we've run some other vCPU on it? Or does the process
> >>>>>> of rescheduling vCPUs across pCPUs and guest ASID management
> >>>>>> deal with that?)
> >>>>>
> >>>>> we don't clear the icache for vCPUs migrating onto other pCPUs but
> >>>>> invalidating the icache on a page fault won't guarantee that either.  Do
> >>>>> we really need to do that?
> >>>>
> >>>> I don't think we do, but I haven't thought through exactly
> >>>> why we don't yet :-)
> >>>>
> >>>
> >>> So once you start a secondary vCPU that one can then hit in the icache
> >>> from what the primary vCPU put there which I guess is different behavior
> >>> from a physical secondary core coming out of reset with the MMU off and
> >>> never hitting the icache, right?
> >>>
> >>> And is this not also a different behavior from a native system once the
> >>> vCPUs have turned their MMUs on, but we just don't happen to observe it
> >>> as being a problem?
> >>>
> >>> In any case, I don't have a great solution for how to solve this except
> >>> for always invalidating the icache when we migrate a vCPU to a pCPU, but
> >>> that's really nasty...
> >>
> >> No, it only needs to happen once per vcpu, on any CPU. IC IALLUIS is
> >> broadcast across CPUs, so once it has taken place on the first CPU this
> >> vcpu runs on, we're good.
> >>
> > But if you compare strictly to a native system, wouldn't a vCPU be able
> > to hit in the icache suddenly if migrated onto a pCPU that has run code
> > for the same VM (with the VMID) wihtout having tuned the MMU on?
> 
> Hmmm. Yes. ASID-tagged VIVT icache are really turning my brain into
> jelly, and that's not a pretty thing to see.
> 
> So we're back to your initial approach: Each time a vcpu is migrated to
> another CPU while its MMU/icache is off, we nuke the icache.
> 
> Do we want that in now, or do we keep that for later, when we actually
> see such an implementation?
> 
I don't think we want that in there now; if we can't test it anyway,
we're likely not to get it 100% correct.

Additionally, I haven't been able to think of a reasonable guest
scenario where this breaks.  Once the guest turns on its MMU it should
deal with the necessary icache invalidation itself (I think), so we're
really talking about situations where the stage-1 MMU is off, and I
gather that mostly you'll be seeing a single core doing any heavy
lifting and then secondary cores basically coming up, only seeing valid
entries in the icache, and doing the necessary invalidate + turn-on-MMU
stuff.

But I haven't spent days thinking about this yet.

-Christoffer
Peter Maydell Jan. 13, 2015, 12:12 p.m. UTC | #21
On 13 January 2015 at 12:04, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> Additionally, I haven't been able to think of a reasonable guest
> scenario where this breaks.  Once the guest turns on its MMU it should
> deal with the necessary icache invalidation itself (I think), so we're
> really talking about situations where the stage-1 MMU is off, and I
> gather that mostly you'll be seeing a single core doing any heavy
> lifting and then secondary cores basically coming up, only seeing valid
> entries in the icache, and doing the necessary invalidat+turn on mmu
> stuff.

The trouble with that is that as the secondary comes up, before it
turns on its icache, its VA->PA mapping is the identity map; whereas
the primary vCPU's VA->PA mapping is "whatever the guest kernel's
usual mapping is". If the kernel has some mapping other than identity
for the VA which is wherever the secondary-CPU-startup-to-MMU-enable
code lives (which seems quite likely), then you have potential problems.

-- PMM
Christoffer Dall Jan. 13, 2015, 1:35 p.m. UTC | #22
On Tue, Jan 13, 2015 at 12:12:41PM +0000, Peter Maydell wrote:
> On 13 January 2015 at 12:04, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
> > Additionally, I haven't been able to think of a reasonable guest
> > scenario where this breaks.  Once the guest turns on its MMU it should
> > deal with the necessary icache invalidation itself (I think), so we're
> > really talking about situations where the stage-1 MMU is off, and I
> > gather that mostly you'll be seeing a single core doing any heavy
> > lifting and then secondary cores basically coming up, only seeing valid
> > entries in the icache, and doing the necessary invalidat+turn on mmu
> > stuff.
> 
> The trouble with that is that as the secondary comes up, before it
> turns on its icache its VA->PA mapping is the identity map; whereas
> the primary vCPU's VA->PA mapping is "whatever the guest kernel's
> usual mapping is". If the kernel has some mapping other than identity
> for the VA which is wherever the secondary-CPU-startup-to-MMU-enable
> code lives (which seems quite likely), then you have potential problems.
> 
Wouldn't a guest (and I believe Linux does this) reserve ASID 0 for
additional cores and use ASID 1+++ for itself?

Or do the potential hits in the icache with the stage-1 MMU turned off
hit on all ASIDs?

-Christoffer
Peter Maydell Jan. 13, 2015, 1:41 p.m. UTC | #23
On 13 January 2015 at 13:35, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> Wouldn't a guest (and I believe Linux does this) reserve ASID 0 for
> additional cores and use ASID 1+++ for itself?

If the guest reserves an ASID for "MMU disabled" then yes, that would
work. The question of course is whether all guests do that...

-- PMM
Christoffer Dall Jan. 13, 2015, 1:49 p.m. UTC | #24
On Tue, Jan 13, 2015 at 01:41:03PM +0000, Peter Maydell wrote:
> On 13 January 2015 at 13:35, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
> > Wouldn't a guest (and I believe Linux does this) reserve ASID 0 for
> > additional cores and use ASID 1+++ for itself?
> 
> If the guest reserves an ASID for "MMU disabled" then yes, that would
> work. The question of course is whether all guests do that...
> 
Which ASID would match for "MMU disabled"?  Did you come across that in
the ARM ARM somewhere?

The fact that Linux does it would indicate that this may be a
requirement on real hardware too and therefore all guests have to do
it...

-Christoffer
Mark Rutland Jan. 15, 2015, noon UTC | #25
On Tue, Jan 13, 2015 at 01:35:16PM +0000, Christoffer Dall wrote:
> On Tue, Jan 13, 2015 at 12:12:41PM +0000, Peter Maydell wrote:
> > On 13 January 2015 at 12:04, Christoffer Dall
> > <christoffer.dall@linaro.org> wrote:
> > > Additionally, I haven't been able to think of a reasonable guest
> > > scenario where this breaks.  Once the guest turns on its MMU it should
> > > deal with the necessary icache invalidation itself (I think), so we're
> > > really talking about situations where the stage-1 MMU is off, and I
> > > gather that mostly you'll be seeing a single core doing any heavy
> > > lifting and then secondary cores basically coming up, only seeing valid
> > > entries in the icache, and doing the necessary invalidat+turn on mmu
> > > stuff.
> > 
> > The trouble with that is that as the secondary comes up, before it
> > turns on its icache its VA->PA mapping is the identity map; whereas
> > the primary vCPU's VA->PA mapping is "whatever the guest kernel's
> > usual mapping is". If the kernel has some mapping other than identity
> > for the VA which is wherever the secondary-CPU-startup-to-MMU-enable
> > code lives (which seems quite likely), then you have potential problems.
> > 
> Wouldn't a guest (and I believe Linux does this) reserve ASID 0 for
> additional cores and use ASID 1+++ for itself?

Not on arm since 52af9c6cd863fe37 (ARM: 6943/1: mm: use TTBR1 instead of
reserved context ID) and 45b95235b0ac86ce (ARM: 6944/1: mm: allow ASID 0
to be allocated to tasks). The swapper_pg_dir uses global mappings since
d427958a46af24f7 (ARM: 6942/1: mm: make TTBR1 always point to
swapper_pg_dir on ARMv6/7).

Similarly on arm64 the swapper_pg_dir and idmap_pg_dir use global
mappings and we don't reserve any ASIDs for use by the kernel.

Thanks,
Mark.
Christoffer Dall Jan. 15, 2015, 1 p.m. UTC | #26
Hi Mark,

On Thu, Jan 15, 2015 at 12:00:20PM +0000, Mark Rutland wrote:
> On Tue, Jan 13, 2015 at 01:35:16PM +0000, Christoffer Dall wrote:
> > On Tue, Jan 13, 2015 at 12:12:41PM +0000, Peter Maydell wrote:
> > > On 13 January 2015 at 12:04, Christoffer Dall
> > > <christoffer.dall@linaro.org> wrote:
> > > > Additionally, I haven't been able to think of a reasonable guest
> > > > scenario where this breaks.  Once the guest turns on its MMU it should
> > > > deal with the necessary icache invalidation itself (I think), so we're
> > > > really talking about situations where the stage-1 MMU is off, and I
> > > > gather that mostly you'll be seeing a single core doing any heavy
> > > > lifting and then secondary cores basically coming up, only seeing valid
> > > > entries in the icache, and doing the necessary invalidat+turn on mmu
> > > > stuff.
> > > 
> > > The trouble with that is that as the secondary comes up, before it
> > > turns on its icache its VA->PA mapping is the identity map; whereas
> > > the primary vCPU's VA->PA mapping is "whatever the guest kernel's
> > > usual mapping is". If the kernel has some mapping other than identity
> > > for the VA which is wherever the secondary-CPU-startup-to-MMU-enable
> > > code lives (which seems quite likely), then you have potential problems.
> > > 
> > Wouldn't a guest (and I believe Linux does this) reserve ASID 0 for
> > additional cores and use ASID 1+++ for itself?
> 
> Not on arm since 52af9c6cd863fe37 (ARM: 6943/1: mm: use TTBR1 instead of
> reserved context ID) and 45b95235b0ac86ce (ARM: 6944/1: mm: allow ASID 0
> to be allocated to tasks). The swapper_pg_dir uses global mappings since
> d427958a46af24f7 (ARM: 6942/1: mm: make TTBR1 always point to
> swapper_pg_dir on ARMv6/7).

thanks for the pointer, been a while since I looked at that code.

> 
> Similarly on arm64 the swapper_pg_dir and idmap_pg_dir use global
> mappings and we don't reserve any ASIDs for use by the kernel.
> 

Do you happen to know which ASIDs are matched in the icache when the MMU
is off?

-Christoffer
Mark Rutland Jan. 15, 2015, 3:47 p.m. UTC | #27
On Thu, Jan 15, 2015 at 01:00:58PM +0000, Christoffer Dall wrote:
> Hi Mark,
> 
> On Thu, Jan 15, 2015 at 12:00:20PM +0000, Mark Rutland wrote:
> > On Tue, Jan 13, 2015 at 01:35:16PM +0000, Christoffer Dall wrote:
> > > On Tue, Jan 13, 2015 at 12:12:41PM +0000, Peter Maydell wrote:
> > > > On 13 January 2015 at 12:04, Christoffer Dall
> > > > <christoffer.dall@linaro.org> wrote:
> > > > > Additionally, I haven't been able to think of a reasonable guest
> > > > > scenario where this breaks.  Once the guest turns on its MMU it should
> > > > > deal with the necessary icache invalidation itself (I think), so we're
> > > > > really talking about situations where the stage-1 MMU is off, and I
> > > > > gather that mostly you'll be seeing a single core doing any heavy
> > > > > lifting and then secondary cores basically coming up, only seeing valid
> > > > > entries in the icache, and doing the necessary invalidat+turn on mmu
> > > > > stuff.
> > > > 
> > > > The trouble with that is that as the secondary comes up, before it
> > > > turns on its icache its VA->PA mapping is the identity map; whereas
> > > > the primary vCPU's VA->PA mapping is "whatever the guest kernel's
> > > > usual mapping is". If the kernel has some mapping other than identity
> > > > for the VA which is wherever the secondary-CPU-startup-to-MMU-enable
> > > > code lives (which seems quite likely), then you have potential problems.
> > > > 
> > > Wouldn't a guest (and I believe Linux does this) reserve ASID 0 for
> > > additional cores and use ASID 1+++ for itself?
> > 
> > Not on arm since 52af9c6cd863fe37 (ARM: 6943/1: mm: use TTBR1 instead of
> > reserved context ID) and 45b95235b0ac86ce (ARM: 6944/1: mm: allow ASID 0
> > to be allocated to tasks). The swapper_pg_dir uses global mappings since
> > d427958a46af24f7 (ARM: 6942/1: mm: make TTBR1 always point to
> > swapper_pg_dir on ARMv6/7).
> 
> thanks for the pointer, been a while since I looked at that code.
> 
> > 
> > Similarly on arm64 the swapper_pg_dir and idmap_pg_dir use global
> > mappings and we don't reserve any ASIDs for use by the kernel.
> > 
> 
> Do you happen to know which ASIDs are matched in the icache when the MMU
> is off?

Unfortunately not. I only seem to have derived as much as anyone else
here by reading the ARM ARM.

Thanks,
Mark.

Patch

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 7ceb836..be6bc72 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -162,13 +162,10 @@  static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 	return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
 }
 
-static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
-					     unsigned long size,
-					     bool ipa_uncached)
+static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+				 	       unsigned long size,
+					       bool ipa_uncached)
 {
-	if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
-		kvm_flush_dcache_to_poc((void *)hva, size);
-	
 	/*
 	 * If we are going to insert an instruction page and the icache is
 	 * either VIPT or PIPT, there is a potential problem where the host
@@ -180,10 +177,32 @@  static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
 	 *
 	 * VIVT caches are tagged using both the ASID and the VMID and doesn't
 	 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
+	 *
+	 * We need to do this through a kernel mapping (using the
+	 * user-space mapping has proved to be the wrong
+	 * solution). For that, we need to kmap one page at a time,
+	 * and iterate over the range.
 	 */
-	if (icache_is_pipt()) {
-		__cpuc_coherent_user_range(hva, hva + size);
-	} else if (!icache_is_vivt_asid_tagged()) {
+
+	VM_BUG_ON(size & PAGE_MASK);
+
+	while (size) {
+		void *va = kmap_atomic_pfn(pfn);
+
+		if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
+			kvm_flush_dcache_to_poc(va, PAGE_SIZE);
+	
+		if (icache_is_pipt())
+			__cpuc_coherent_user_range((unsigned long)va,
+						   (unsigned long)va + PAGE_SIZE);
+
+		size -= PAGE_SIZE;
+		pfn++;
+
+		kunmap_atomic(va);
+	}
+
+	if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
 		/* any kind of VIPT cache */
 		__flush_icache_all();
 	}
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1f5b793..4da6504 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -921,6 +921,12 @@  static bool kvm_is_device_pfn(unsigned long pfn)
 	return !pfn_valid(pfn);
 }
 
+static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+				      unsigned long size, bool uncached)
+{
+	__coherent_cache_guest_page(vcpu, pfn, size, uncached);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -1010,8 +1016,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_s2pmd_writable(&new_pmd);
 			kvm_set_pfn_dirty(pfn);
 		}
-		coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE,
-					  fault_ipa_uncached);
+		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
@@ -1019,8 +1024,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
-		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
-					  fault_ipa_uncached);
+		coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
 			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
 	}
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b7419f5..970e11e 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -243,15 +243,18 @@  static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 	return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
-					     unsigned long size,
-					     bool ipa_uncached)
+static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+					       unsigned long size,
+					       bool ipa_uncached)
 {
+	void *va = page_address(pfn_to_page(pfn));
+
 	if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
-		kvm_flush_dcache_to_poc((void *)hva, size);
+		kvm_flush_dcache_to_poc(va, size);
 
 	if (!icache_is_aliasing()) {		/* PIPT */
-		flush_icache_range(hva, hva + size);
+		flush_icache_range((unsigned long)va,
+				   (unsigned long)va + size);
 	} else if (!icache_is_aivivt()) {	/* non ASID-tagged VIVT */
 		/* any kind of VIPT cache */
 		__flush_icache_all();