
[RFC,03/39] KVM: x86/xen: register shared_info page

Message ID 20190220201609.28290-4-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Series x86/KVM: Xen HVM guest support

Commit Message

Joao Martins Feb. 20, 2019, 8:15 p.m. UTC
We add a new ioctl, XEN_HVM_SHARED_INFO, to allow the hypervisor
to know where the guest's shared info page is.
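
Userspace usage looks roughly like this (vm_fd and shared_info_gpa are
placeholders; the struct and attribute names are the uapi additions below):

	struct kvm_xen_hvm_attr xha = {
		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
		.u.shared_info.gfn = shared_info_gpa >> 12,	/* gpa -> gfn */
	};

	if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &xha))
		err(1, "KVM_XEN_HVM_SET_ATTR");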

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 arch/x86/include/asm/kvm_host.h |  3 +++
 arch/x86/kvm/x86.c              | 21 +++++++++++++++
 arch/x86/kvm/xen.c              | 60 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/xen.h              |  4 +++
 include/uapi/linux/kvm.h        | 15 +++++++++++
 5 files changed, 103 insertions(+)

Comments

David Woodhouse Dec. 1, 2020, 1:07 p.m. UTC | #1
On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
> +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
> +{
> +       struct shared_info *shared_info;
> +       struct page *page;
> +
> +       page = gfn_to_page(kvm, gfn);
> +       if (is_error_page(page))
> +               return -EINVAL;
> +
> +       kvm->arch.xen.shinfo_addr = gfn;
> +
> +       shared_info = page_to_virt(page);
> +       memset(shared_info, 0, sizeof(struct shared_info));
> +       kvm->arch.xen.shinfo = shared_info;
> +       return 0;
> +}
> +

Hm.

How come we get to pin the page and directly dereference it every time,
while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
instead?

If that was allowed, wouldn't it have been a much simpler fix for
CVE-2019-3016? What am I missing?

Should I rework these to use kvm_write_guest_cached()?
Ankur Arora Dec. 2, 2020, 12:40 a.m. UTC | #2
On 2020-12-01 5:07 a.m., David Woodhouse wrote:
> On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
>> +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
>> +{
>> +       struct shared_info *shared_info;
>> +       struct page *page;
>> +
>> +       page = gfn_to_page(kvm, gfn);
>> +       if (is_error_page(page))
>> +               return -EINVAL;
>> +
>> +       kvm->arch.xen.shinfo_addr = gfn;
>> +
>> +       shared_info = page_to_virt(page);
>> +       memset(shared_info, 0, sizeof(struct shared_info));
>> +       kvm->arch.xen.shinfo = shared_info;
>> +       return 0;
>> +}
>> +
> 
> Hm.
> 
> How come we get to pin the page and directly dereference it every time,
> while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
> instead?

So looking at my WIP trees from the time, this is something that
we went back and forth on as well with using just a pinned page or a
persistent kvm_vcpu_map().

I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
as shared_info is created early and is not expected to change during the
lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
kvm_vcpu_unmap() dance or do some kind of synchronization.

That said, I don't think this code explicitly disallows any updates
to shared_info.

> 
> If that was allowed, wouldn't it have been a much simpler fix for
> CVE-2019-3016? What am I missing?

Agreed.

Perhaps, Paolo can chime in with why KVM never uses pinned page
and always prefers to do cached mappings instead?

> 
> Should I rework these to use kvm_write_guest_cached()?

kvm_vcpu_map() would be better. The event channel logic does RMW operations
on shared_info->vcpu_info.
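
Roughly (a sketch only; kvm_xen_inject_upcall() is a hypothetical stand-in
for the actual vector injection), delivering an event is a chain of atomic
RMWs on guest memory:

	/* Mark the port pending; nothing more to do if it already was. */
	if (test_and_set_bit(port, (unsigned long *)shinfo->evtchn_pending))
		return;
	/* A masked port stays pending but doesn't raise an upcall. */
	if (test_bit(port, (unsigned long *)shinfo->evtchn_mask))
		return;
	set_bit(port / BITS_PER_LONG,
		(unsigned long *)&vcpu_info->evtchn_pending_sel);
	if (!xchg(&vcpu_info->evtchn_upcall_pending, 1))
		kvm_xen_inject_upcall(vcpu);	/* hypothetical helper */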


Ankur

> 
>
David Woodhouse Dec. 2, 2020, 1:26 a.m. UTC | #3
On Tue, 2020-12-01 at 16:40 -0800, Ankur Arora wrote:
> On 2020-12-01 5:07 a.m., David Woodhouse wrote:
> > On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
> > > +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
> > > +{
> > > +       struct shared_info *shared_info;
> > > +       struct page *page;
> > > +
> > > +       page = gfn_to_page(kvm, gfn);
> > > +       if (is_error_page(page))
> > > +               return -EINVAL;
> > > +
> > > +       kvm->arch.xen.shinfo_addr = gfn;
> > > +
> > > +       shared_info = page_to_virt(page);
> > > +       memset(shared_info, 0, sizeof(struct shared_info));
> > > +       kvm->arch.xen.shinfo = shared_info;
> > > +       return 0;
> > > +}
> > > +
> > 
> > Hm.
> > 
> > How come we get to pin the page and directly dereference it every time,
> > while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
> > instead?
> 
> So looking at my WIP trees from the time, this is something that
> we went back and forth on as well with using just a pinned page or a
> persistent kvm_vcpu_map().

OK, thanks.

> I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
> as shared_info is created early and is not expected to change during the
> lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
> MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
> kvm_vcpu_unmap() dance or do some kind of synchronization.
> 
> That said, I don't think this code explicitly disallows any updates
> to shared_info.

It needs to allow updates as well as disabling the shared_info pages.
We're going to need that to implement SHUTDOWN_soft_reset for kexec.

> > 
> > If that was allowed, wouldn't it have been a much simpler fix for
> > CVE-2019-3016? What am I missing?
> 
> Agreed.
> 
> Perhaps, Paolo can chime in with why KVM never uses pinned page
> and always prefers to do cached mappings instead?
> 
> > 
> > Should I rework these to use kvm_write_guest_cached()?
> 
> kvm_vcpu_map() would be better. The event channel logic does RMW operations
> on shared_info->vcpu_info.

I've ported the shared_info/vcpu_info parts and made a test case, and
was going back through to make it use kvm_write_guest_cached(). But I
should probably plug on through the evtchn bits before I do that.

I also don't see much locking on the cache, and can't convince myself
that accessing the shared_info page from multiple CPUs with
kvm_write_guest_cached() or kvm_map_gfn() is sane unless they each have
their own cache.

What I have so far is at

https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xenpv

I'll do the event channel support tomorrow and hook it up to my actual
VMM to give it some more serious testing.
Ankur Arora Dec. 2, 2020, 5:17 a.m. UTC | #4
On 2020-12-01 5:26 p.m., David Woodhouse wrote:
> On Tue, 2020-12-01 at 16:40 -0800, Ankur Arora wrote:
>> On 2020-12-01 5:07 a.m., David Woodhouse wrote:
>>> On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
>>>> +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
>>>> +{
>>>> +       struct shared_info *shared_info;
>>>> +       struct page *page;
>>>> +
>>>> +       page = gfn_to_page(kvm, gfn);
>>>> +       if (is_error_page(page))
>>>> +               return -EINVAL;
>>>> +
>>>> +       kvm->arch.xen.shinfo_addr = gfn;
>>>> +
>>>> +       shared_info = page_to_virt(page);
>>>> +       memset(shared_info, 0, sizeof(struct shared_info));
>>>> +       kvm->arch.xen.shinfo = shared_info;
>>>> +       return 0;
>>>> +}
>>>> +
>>>
>>> Hm.
>>>
>>> How come we get to pin the page and directly dereference it every time,
>>> while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
>>> instead?
>>
>> So looking at my WIP trees from the time, this is something that
>> we went back and forth on as well with using just a pinned page or a
>> persistent kvm_vcpu_map().
> 
> OK, thanks.
> 
>> I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
>> as shared_info is created early and is not expected to change during the
>> lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
>> MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
>> kvm_vcpu_unmap() dance or do some kind of synchronization.
>>
>> That said, I don't think this code explicitly disallows any updates
>> to shared_info.
> 
> It needs to allow updates as well as disabling the shared_info pages.
> We're going to need that to implement SHUTDOWN_soft_reset for kexec.
True.

> 
>>>
>>> If that was allowed, wouldn't it have been a much simpler fix for
>>> CVE-2019-3016? What am I missing?
>>
>> Agreed.
>>
>> Perhaps, Paolo can chime in with why KVM never uses pinned page
>> and always prefers to do cached mappings instead?
>>
>>>
>>> Should I rework these to use kvm_write_guest_cached()?
>>
>> kvm_vcpu_map() would be better. The event channel logic does RMW operations
>> on shared_info->vcpu_info.
> 
> I've ported the shared_info/vcpu_info parts and made a test case, and
> was going back through to make it use kvm_write_guest_cached(). But I
> should probably plug on through the evtchn bits before I do that.
> 
> I also don't see much locking on the cache, and can't convince myself
> that accessing the shared_info page from multiple CPUs with
> kvm_write_guest_cached() or kvm_map_gfn() is sane unless they each have
> their own cache.

I think you could get a VCPU specific cache with kvm_vcpu_map().

> 
> What I have so far is at
> 
> https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xenpv

Thanks. Will take a look there.

Ankur

> 
> I'll do the event channel support tomorrow and hook it up to my actual
> VMM to give it some more serious testing.
>
Joao Martins Dec. 2, 2020, 10:44 a.m. UTC | #5
[late response - was on holiday yesterday]

On 12/2/20 12:40 AM, Ankur Arora wrote:
> On 2020-12-01 5:07 a.m., David Woodhouse wrote:
>> On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
>>> +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
>>> +{
>>> +       struct shared_info *shared_info;
>>> +       struct page *page;
>>> +
>>> +       page = gfn_to_page(kvm, gfn);
>>> +       if (is_error_page(page))
>>> +               return -EINVAL;
>>> +
>>> +       kvm->arch.xen.shinfo_addr = gfn;
>>> +
>>> +       shared_info = page_to_virt(page);
>>> +       memset(shared_info, 0, sizeof(struct shared_info));
>>> +       kvm->arch.xen.shinfo = shared_info;
>>> +       return 0;
>>> +}
>>> +
>>
>> Hm.
>>
>> How come we get to pin the page and directly dereference it every time,
>> while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
>> instead?
> 
> So looking at my WIP trees from the time, this is something that
> we went back and forth on as well with using just a pinned page or a
> persistent kvm_vcpu_map().
> 
> I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
> as shared_info is created early and is not expected to change during the
> lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
> MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
> kvm_vcpu_unmap() dance or do some kind of synchronization.
> 
> That said, I don't think this code explicitly disallows any updates
> to shared_info.
> 
>>
>> If that was allowed, wouldn't it have been a much simpler fix for
>> CVE-2019-3016? What am I missing?
> 
> Agreed.
> 
> Perhaps, Paolo can chime in with why KVM never uses pinned page
> and always prefers to do cached mappings instead?
> 
Part of the CVE fix was to not use cached versions.

It's not a long-term pin of the page, unlike what we try to do here (partly due to the
nature of the pages we are mapping), but we still map the gpa, RMW the steal time
struct, and then unmap the page.

See record_steal_time() -- but more specifically commit b043138246 ("x86/KVM: Make sure
KVM_VCPU_FLUSH_TLB flag is not missed").

But I am not sure it's a good idea to follow the same approach as record_steal_time()
given that this is a fairly sensitive code path for event channels.
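
The shape of that fix, from memory (a sketch, not the literal
record_steal_time() code -- the real thing goes through kvm_map_gfn() with a
per-vCPU gfn_to_pfn_cache, and helper names may differ by tree):

	struct kvm_host_map map;
	struct kvm_steal_time *st;

	/* Transient mapping: map the gpa, do the atomic RMW, unmap again. */
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
		return;

	st = map.hva + offset_in_page(gpa);

	/*
	 * Atomic RMW so a KVM_VCPU_FLUSH_TLB flag set by the guest in
	 * between isn't lost -- that was the CVE-2019-3016 fix.
	 */
	if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
		kvm_vcpu_flush_tlb_guest(vcpu);

	kvm_vcpu_unmap(vcpu, &map, true);	/* true == mark dirty */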

>>
>> Should I rework these to use kvm_write_guest_cached()?
> 
> kvm_vcpu_map() would be better. The event channel logic does RMW operations
> on shared_info->vcpu_info.
> 
Indeed, yes.

Ankur, IIRC we saw missed event channel notifications when we were using the
{write,read}_cached() version of the patch.

But I can't remember what it was due to -- either the evtchn_pending bit or the mask
word -- which would make it not inject an upcall.

	Joao
Joao Martins Dec. 2, 2020, 10:50 a.m. UTC | #6
On 12/2/20 5:17 AM, Ankur Arora wrote:
> On 2020-12-01 5:26 p.m., David Woodhouse wrote:
>> On Tue, 2020-12-01 at 16:40 -0800, Ankur Arora wrote:
>>> On 2020-12-01 5:07 a.m., David Woodhouse wrote:

[...]

>>>> If that was allowed, wouldn't it have been a much simpler fix for
>>>> CVE-2019-3016? What am I missing?
>>>
>>> Agreed.
>>>
>>> Perhaps, Paolo can chime in with why KVM never uses pinned page
>>> and always prefers to do cached mappings instead?
>>>
>>>>
>>>> Should I rework these to use kvm_write_guest_cached()?
>>>
>>> kvm_vcpu_map() would be better. The event channel logic does RMW operations
>>> on shared_info->vcpu_info.
>>
>> I've ported the shared_info/vcpu_info parts and made a test case, and
>> was going back through to make it use kvm_write_guest_cached(). But I
>> should probably plug on through the evtchn bits before I do that.
>>
>> I also don't see much locking on the cache, and can't convince myself
>> that accessing the shared_info page from multiple CPUs with
>> kvm_write_guest_cached() or kvm_map_gfn() is sane unless they each have
>> their own cache.
> 
> I think you could get a VCPU specific cache with kvm_vcpu_map().
> 

The steal clock handling creates such a mapping cache (struct gfn_to_pfn_cache).

	Joao
David Woodhouse Dec. 2, 2020, 12:20 p.m. UTC | #7
On Wed, 2020-12-02 at 10:44 +0000, Joao Martins wrote:
> [late response - was on holiday yesterday]
> 
> On 12/2/20 12:40 AM, Ankur Arora wrote:
> > On 2020-12-01 5:07 a.m., David Woodhouse wrote:
> > > On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
> > > > +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
> > > > +{
> > > > +       struct shared_info *shared_info;
> > > > +       struct page *page;
> > > > +
> > > > +       page = gfn_to_page(kvm, gfn);
> > > > +       if (is_error_page(page))
> > > > +               return -EINVAL;
> > > > +
> > > > +       kvm->arch.xen.shinfo_addr = gfn;
> > > > +
> > > > +       shared_info = page_to_virt(page);
> > > > +       memset(shared_info, 0, sizeof(struct shared_info));
> > > > +       kvm->arch.xen.shinfo = shared_info;
> > > > +       return 0;
> > > > +}
> > > > +
> > > 
> > > Hm.
> > > 
> > > How come we get to pin the page and directly dereference it every time,
> > > while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
> > > instead?
> > 
> > So looking at my WIP trees from the time, this is something that
> > we went back and forth on as well with using just a pinned page or a
> > persistent kvm_vcpu_map().
> > 
> > I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
> > as shared_info is created early and is not expected to change during the
> > lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
> > MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
> > kvm_vcpu_unmap() dance or do some kind of synchronization.
> > 
> > That said, I don't think this code explicitly disallows any updates
> > to shared_info.
> > 
> > > 
> > > If that was allowed, wouldn't it have been a much simpler fix for
> > > CVE-2019-3016? What am I missing?
> > 
> > Agreed.
> > 
> > Perhaps, Paolo can chime in with why KVM never uses pinned page
> > and always prefers to do cached mappings instead?
> > 
> 
> Part of the CVE fix was to not use cached versions.
> 
> It's not a long-term pin of the page, unlike what we try to do here (partly due to the
> nature of the pages we are mapping), but we still map the gpa, RMW the steal time
> struct, and then unmap the page.
> 
> See record_steal_time() -- but more specifically commit b043138246 ("x86/KVM: Make sure
> KVM_VCPU_FLUSH_TLB flag is not missed").
> 
> But I am not sure it's a good idea to follow the same approach as record_steal_time()
> given that this is a fairly sensitive code path for event channels.

Right. We definitely need to use atomic RMW operations (like the CVE
fix did) so the page needs to be *mapped*.

My question was about a permanent pinned mapping vs. the
map/unmap-as-we-need-it that record_steal_time() does.

On IRC, Paolo told me that permanent pinning causes problems for memory
hotplug, and pointed me at the trick we do with an MMU notifier and
kvm_vcpu_reload_apic_access_page().

I'm going to stick with the pinning we have for the moment, and just
fix up the fact that it leaks the pinned pages if the guest sets the
shared_info address more than once.

At some point the apic page MMU notifier thing can be made generic, and
we can use that for this and for KVM steal time too.
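
i.e. something like this in kvm_xen_shared_info_init() on top of the patch
below (just a sketch):

	page = gfn_to_page(kvm, gfn);
	if (is_error_page(page))
		return -EINVAL;

	/* Don't leak a previously pinned shared_info page. */
	if (kvm->arch.xen.shinfo)
		put_page(virt_to_page(kvm->arch.xen.shinfo));

	kvm->arch.xen.shinfo_addr = gfn;
	kvm->arch.xen.shinfo = page_to_virt(page);
	memset(kvm->arch.xen.shinfo, 0, sizeof(struct shared_info));
	return 0;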
Ankur Arora Dec. 2, 2020, 8:32 p.m. UTC | #8
On 2020-12-02 4:20 a.m., David Woodhouse wrote:
> On Wed, 2020-12-02 at 10:44 +0000, Joao Martins wrote:
>> [late response - was on holiday yesterday]
>>
>> On 12/2/20 12:40 AM, Ankur Arora wrote:
>>> On 2020-12-01 5:07 a.m., David Woodhouse wrote:
>>>> On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
>>>>> +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
>>>>> +{
>>>>> +       struct shared_info *shared_info;
>>>>> +       struct page *page;
>>>>> +
>>>>> +       page = gfn_to_page(kvm, gfn);
>>>>> +       if (is_error_page(page))
>>>>> +               return -EINVAL;
>>>>> +
>>>>> +       kvm->arch.xen.shinfo_addr = gfn;
>>>>> +
>>>>> +       shared_info = page_to_virt(page);
>>>>> +       memset(shared_info, 0, sizeof(struct shared_info));
>>>>> +       kvm->arch.xen.shinfo = shared_info;
>>>>> +       return 0;
>>>>> +}
>>>>> +
>>>>
>>>> Hm.
>>>>
>>>> How come we get to pin the page and directly dereference it every time,
>>>> while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
>>>> instead?
>>>
>>> So looking at my WIP trees from the time, this is something that
>>> we went back and forth on as well with using just a pinned page or a
>>> persistent kvm_vcpu_map().
>>>
>>> I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
>>> as shared_info is created early and is not expected to change during the
>>> lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
>>> MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
>>> kvm_vcpu_unmap() dance or do some kind of synchronization.
>>>
>>> That said, I don't think this code explicitly disallows any updates
>>> to shared_info.
>>>
>>>>
>>>> If that was allowed, wouldn't it have been a much simpler fix for
>>>> CVE-2019-3016? What am I missing?
>>>
>>> Agreed.
>>>
>>> Perhaps, Paolo can chime in with why KVM never uses pinned page
>>> and always prefers to do cached mappings instead?
>>>
>>
>> Part of the CVE fix was to not use cached versions.
>>
>> It's not a long-term pin of the page, unlike what we try to do here (partly due to the
>> nature of the pages we are mapping), but we still map the gpa, RMW the steal time
>> struct, and then unmap the page.
>>
>> See record_steal_time() -- but more specifically commit b043138246 ("x86/KVM: Make sure
>> KVM_VCPU_FLUSH_TLB flag is not missed").
>>
>> But I am not sure it's a good idea to follow the same approach as record_steal_time()
>> given that this is a fairly sensitive code path for event channels.
> 
> Right. We definitely need to use atomic RMW operations (like the CVE
> fix did) so the page needs to be *mapped*.
> 
> My question was about a permanent pinned mapping vs. the
> map/unmap-as-we-need-it that record_steal_time() does.
> 
> On IRC, Paolo told me that permanent pinning causes problems for memory
> hotplug, and pointed me at the trick we do with an MMU notifier and
> kvm_vcpu_reload_apic_access_page().

Okay that answers my question. Thanks for clearing that up.

Not sure of a good place to document this but it would be good to
have this written down somewhere. Maybe kvm_map_gfn()?

> 
> I'm going to stick with the pinning we have for the moment, and just
> fix up the fact that it leaks the pinned pages if the guest sets the
> shared_info address more than once.
> 
> At some point the apic page MMU notifier thing can be made generic, and
> we can use that for this and for KVM steal time too.
> 

Yeah, that's something that'll definitely be good to have.

Ankur
Ankur Arora Dec. 2, 2020, 8:33 p.m. UTC | #9
On 2020-12-02 2:44 a.m., Joao Martins wrote:
> [late response - was on holiday yesterday]
> 
> On 12/2/20 12:40 AM, Ankur Arora wrote:
>> On 2020-12-01 5:07 a.m., David Woodhouse wrote:
>>> On Wed, 2019-02-20 at 20:15 +0000, Joao Martins wrote:
>>>> +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
>>>> +{
>>>> +       struct shared_info *shared_info;
>>>> +       struct page *page;
>>>> +
>>>> +       page = gfn_to_page(kvm, gfn);
>>>> +       if (is_error_page(page))
>>>> +               return -EINVAL;
>>>> +
>>>> +       kvm->arch.xen.shinfo_addr = gfn;
>>>> +
>>>> +       shared_info = page_to_virt(page);
>>>> +       memset(shared_info, 0, sizeof(struct shared_info));
>>>> +       kvm->arch.xen.shinfo = shared_info;
>>>> +       return 0;
>>>> +}
>>>> +
>>>
>>> Hm.
>>>
>>> How come we get to pin the page and directly dereference it every time,
>>> while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
>>> instead?
>>
>> So looking at my WIP trees from the time, this is something that
>> we went back and forth on as well with using just a pinned page or a
>> persistent kvm_vcpu_map().
>>
>> I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
>> as shared_info is created early and is not expected to change during the
>> lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
>> MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
>> kvm_vcpu_unmap() dance or do some kind of synchronization.
>>
>> That said, I don't think this code explicitly disallows any updates
>> to shared_info.
>>
>>>
>>> If that was allowed, wouldn't it have been a much simpler fix for
>>> CVE-2019-3016? What am I missing?
>>
>> Agreed.
>>
>> Perhaps, Paolo can chime in with why KVM never uses pinned page
>> and always prefers to do cached mappings instead?
>>
> Part of the CVE fix was to not use cached versions.
> 
> It's not a long-term pin of the page, unlike what we try to do here (partly due to the
> nature of the pages we are mapping), but we still map the gpa, RMW the steal time
> struct, and then unmap the page.
> 
> See record_steal_time() -- but more specifically commit b043138246 ("x86/KVM: Make sure
> KVM_VCPU_FLUSH_TLB flag is not missed").
> 
> But I am not sure it's a good idea to follow the same approach as record_steal_time()
> given that this is a fairly sensitive code path for event channels.
> 
>>>
>>> Should I rework these to use kvm_write_guest_cached()?
>>
>> kvm_vcpu_map() would be better. The event channel logic does RMW operations
>> on shared_info->vcpu_info.
>>
> Indeed, yes.
> 
> Ankur, IIRC we saw missed event channel notifications when we were using the
> {write,read}_cached() version of the patch.
> 
> But I can't remember what it was due to -- either the evtchn_pending bit or the mask
> word -- which would make it not inject an upcall.

If memory serves, it was the mask. Though I don't think that we had
kvm_{write,read}_cached in use at that point -- given that they were
definitely not RMW safe.


Ankur

> 
> 	Joao
>
David Woodhouse Dec. 3, 2020, 10:16 a.m. UTC | #10
On Wed, 2020-12-02 at 12:32 -0800, Ankur Arora wrote:
> > On IRC, Paolo told me that permanent pinning causes problems for memory
> > hotplug, and pointed me at the trick we do with an MMU notifier and
> > kvm_vcpu_reload_apic_access_page().
> 
> Okay that answers my question. Thanks for clearing that up.
> 
> Not sure of a good place to document this but it would be good to
> have this written down somewhere. Maybe kvm_map_gfn()?

Trying not to get too distracted by polishing this part, so I can
continue with making more things actually work. But I took a quick look
at the reload_apic_access_page() thing.

AFAICT it works because the access is only from *within* the vCPU, in
guest mode.

So all the notifier has to do is kick all CPUs, which happens when it
calls kvm_make_all_cpus_request(). Thus we are guaranteed that all CPUs
are *out* of guest mode by the time...

    ...er... maybe not by the time the notifier returns, because all 
    we've done is *send* the IPI and we don't know the other CPUs have 
    actually stopped running the guest yet? 

    Maybe there's some explanation of why the actual TLB shootdown 
    truly *will* occur before the page goes away, and some ordering 
    rules which mean our reschedule IPI will happen first? Something 
    like that ideally would have been in a comment in the MMU notifier.

Anyway, I'm not going to fret about that because it clearly doesn't
work for the pvclock/shinfo cases anyway; those are accessed from the
kernel *outside* guest mode. I think we can use a similar trick but
just protect the accesses with kvm->srcu.

The code in my tree now uses kvm_map_gfn() at setup time and leaves the
shinfo page mapped. A bit like the KVM pvclock code, except that I
don't pointlessly map and unmap all the time while leaving the page
pinned anyway.

So all we need to do is kvm_release_pfn() right after mapping it,
leaving the map->hva "unsafe".

Then we hook in to the MMU notifier to unmap it when it goes away (in
RCU style; clearing the pointer variable, synchronize_srcu(), and
*then* unmap, possibly in different range_start/range/range_end
callbacks).

The *users* of such mappings will need an RCU read lock, and will need
to be able to fault the GFN back in when they need it.
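
A reader would then look something like this (sketch only; the pointer is
the same kvm->arch.xen.shinfo as in the patch, and kvm_xen_remap_shinfo()
is a hypothetical refault helper):

	struct shared_info *shinfo;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	shinfo = srcu_dereference(kvm->arch.xen.shinfo, &kvm->srcu);
	if (!shinfo)
		/* Invalidated by the MMU notifier; fault the GFN back in. */
		shinfo = kvm_xen_remap_shinfo(kvm);
	if (shinfo)
		set_bit(port, (unsigned long *)shinfo->evtchn_pending);
	srcu_read_unlock(&kvm->srcu, idx);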

For now, though, I'm content that converting the Xen shinfo support to
kvm_map_gfn() is the right way to go, and the memory hotplug support is
an incremental addition on top of it.

And I'm even more comforted by the observation that the KVM pvclock
support *already* pins its pages, so I'm not even failing to meet the
existing bar :)
Sean Christopherson Dec. 4, 2020, 5:30 p.m. UTC | #11
On Thu, Dec 03, 2020, David Woodhouse wrote:
> On Wed, 2020-12-02 at 12:32 -0800, Ankur Arora wrote:
> > > On IRC, Paolo told me that permanent pinning causes problems for memory
> > > hotplug, and pointed me at the trick we do with an MMU notifier and
> > > kvm_vcpu_reload_apic_access_page().
> > 
> > Okay that answers my question. Thanks for clearing that up.
> > 
> > Not sure of a good place to document this but it would be good to
> > have this written down somewhere. Maybe kvm_map_gfn()?
> 
> Trying not to get too distracted by polishing this part, so I can
> continue with making more things actually work. But I took a quick look
> at the reload_apic_access_page() thing.
> 
> AFAICT it works because the access is only from *within* the vCPU, in
> guest mode.
> 
> So all the notifier has to do is kick all CPUs, which happens when it
> calls kvm_make_all_cpus_request(). Thus we are guaranteed that all CPUs
> are *out* of guest mode by the time...
> 
>     ...er... maybe not by the time the notifier returns, because all 
>     we've done is *send* the IPI and we don't know the other CPUs have 
>     actually stopped running the guest yet? 
> 
>     Maybe there's some explanation of why the actual TLB shootdown 
>     truly *will* occur before the page goes away, and some ordering 
>     rules which mean our reschedule IPI will happen first? Something 
>     like that ideally would have been in a comment in the MMU notifier.

KVM_REQ_APIC_PAGE_RELOAD is tagged with KVM_REQUEST_WAIT, which means that
kvm_kick_many_cpus() and thus smp_call_function_many() will have @wait=true,
i.e. the sender will wait for the SMP function call to finish on the target CPUs.
David Woodhouse Dec. 12, 2020, 12:07 p.m. UTC | #12
On Tue, 2020-12-01 at 16:40 -0800, Ankur Arora wrote:
> > How come we get to pin the page and directly dereference it every time,
> > while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
> > instead?
> 
> So looking at my WIP trees from the time, this is something that
> we went back and forth on as well with using just a pinned page or a
> persistent kvm_vcpu_map().

I have a new plan.

Screw pinning it and mapping it into kernel space just because we need
to do atomic operations on it. 

Futexes can do atomic operations on userspace; so can we. We just add
atomic_cmpxchg_user() and use that.

We can add kvm_cmpxchg_guest_offset_cached() and use that from places
like record_steal_time().

That works nicely for all of the Xen support needs too, I think — since
test-and-set and test-and-clear bit operations all want to be built on
cmpxchg.
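
E.g. a test-and-set on guest memory becomes something like this (sketch;
atomic_cmpxchg_user() doesn't exist yet, so I'm assuming it returns 0 on
success, a negative error on fault, and a positive value -- with *old
refreshed -- on mismatch):

/* Sketch of a guest test-and-set built on a user-space cmpxchg. */
static int kvm_test_and_set_guest_bit(unsigned long __user *uaddr, int nr)
{
	unsigned long old, new;
	int ret;

	uaddr += nr / BITS_PER_LONG;
	if (get_user(old, uaddr))
		return -EFAULT;

	for (;;) {
		new = old | BIT(nr % BITS_PER_LONG);
		if (new == old)
			return 1;	/* bit was already set */
		ret = atomic_cmpxchg_user(uaddr, &old, new);
		if (ret <= 0)
			return ret;	/* 0: we set it; <0: fault */
		/* Lost the race; 'old' was refreshed, try again. */
	}
}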

The one thing I can think of offhand that it doesn't address easily is
the testing of vcpu_info->evtchn_upcall_pending in
kvm_xen_cpu_has_interrupt(), which really does want to be fast so I'm
not sure I want to be using copy_from_user() for that.


But I think I can open-code that to do the cache checking that
kvm_read_guest_offset_cached() does, and use __get_user() directly in
the fast path, falling back to actually calling
kvm_read_guest_offset_cached() when it needs to. That will suffice
until/unless we grow more use cases which would make it worth providing
that as a kvm_get_guest...() set of functions for generic use.
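
Something like this, roughly (the cache field name is made up, and the
validity check just mirrors what kvm_read_guest_offset_cached() does
internally before it trusts ghc->hva):

	struct gfn_to_hva_cache *ghc = &vcpu->arch.xen.vcpu_info_cache;
	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
	u8 pending;

	if (likely(ghc->generation == kvm_memslots(vcpu->kvm)->generation &&
		   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
		/* Fast path: read the byte straight through the cached HVA. */
		if (!__get_user(pending, (u8 __user *)ghc->hva + offset))
			return pending;
	}

	/* Slow path: revalidate the cache and go through the accessor. */
	if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &pending, offset, 1))
		return 0;

	return pending;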

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0f469ce439c0..befc0e37f162 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -843,6 +843,9 @@  struct kvm_hv {
 /* Xen emulation context */
 struct kvm_xen {
 	u64 xen_hypercall;
+
+	gfn_t shinfo_addr;
+	struct shared_info *shinfo;
 };
 
 enum kvm_irqchip_mode {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index be8def385e3f..1eda96304180 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4793,6 +4793,26 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_XEN_HVM_GET_ATTR: {
+		struct kvm_xen_hvm_attr xha;
+
+		r = -EFAULT;
+		if (copy_from_user(&xha, argp, sizeof(xha)))
+			goto out;
+		r = kvm_xen_hvm_get_attr(kvm, &xha);
+		if (copy_to_user(argp, &xha, sizeof(xha)))
+			goto out;
+		break;
+	}
+	case KVM_XEN_HVM_SET_ATTR: {
+		struct kvm_xen_hvm_attr xha;
+
+		r = -EFAULT;
+		if (copy_from_user(&xha, argp, sizeof(xha)))
+			goto out;
+		r = kvm_xen_hvm_set_attr(kvm, &xha);
+		break;
+	}
 	case KVM_SET_CLOCK: {
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
@@ -9279,6 +9299,7 @@  void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_mmu_uninit_vm(kvm);
 	kvm_page_track_cleanup(kvm);
 	kvm_hv_destroy_vm(kvm);
+	kvm_xen_destroy_vm(kvm);
 }
 
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 76f0e4b812d2..4df223bd3cd7 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -11,9 +11,61 @@ 
 #include <linux/kvm_host.h>
 
 #include <trace/events/kvm.h>
+#include <xen/interface/xen.h>
 
 #include "trace.h"
 
+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+	struct shared_info *shared_info;
+	struct page *page;
+
+	page = gfn_to_page(kvm, gfn);
+	if (is_error_page(page))
+		return -EINVAL;
+
+	kvm->arch.xen.shinfo_addr = gfn;
+
+	shared_info = page_to_virt(page);
+	memset(shared_info, 0, sizeof(struct shared_info));
+	kvm->arch.xen.shinfo = shared_info;
+	return 0;
+}
+
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+	int r = -ENOENT;
+
+	switch (data->type) {
+	case KVM_XEN_ATTR_TYPE_SHARED_INFO: {
+		gfn_t gfn = data->u.shared_info.gfn;
+
+		r = kvm_xen_shared_info_init(kvm, gfn);
+		break;
+	}
+	default:
+		break;
+	}
+
+	return r;
+}
+
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+	int r = -ENOENT;
+
+	switch (data->type) {
+	case KVM_XEN_ATTR_TYPE_SHARED_INFO: {
+		data->u.shared_info.gfn = kvm->arch.xen.shinfo_addr;
+		break;
+	}
+	default:
+		break;
+	}
+
+	return r;
+}
+
 bool kvm_xen_hypercall_enabled(struct kvm *kvm)
 {
 	return READ_ONCE(kvm->arch.xen.xen_hypercall);
@@ -77,3 +129,11 @@  int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+	struct kvm_xen *xen = &kvm->arch.xen;
+
+	if (xen->shinfo)
+		put_page(virt_to_page(xen->shinfo));
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index a2ae079c3ef3..bb38edf383fe 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -3,8 +3,12 @@ 
 #ifndef __ARCH_X86_KVM_XEN_H__
 #define __ARCH_X86_KVM_XEN_H__
 
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
 bool kvm_xen_hypercall_enabled(struct kvm *kvm);
 bool kvm_xen_hypercall_set(struct kvm *kvm);
 int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
 
+void kvm_xen_destroy_vm(struct kvm *kvm);
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d07520c216a1..de2168d235af 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1455,6 +1455,21 @@  struct kvm_enc_region {
 /* Available with KVM_CAP_HYPERV_CPUID */
 #define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2)
 
+#define KVM_XEN_HVM_GET_ATTR        _IOWR(KVMIO,  0xc2, struct kvm_xen_hvm_attr)
+#define KVM_XEN_HVM_SET_ATTR        _IOW(KVMIO,  0xc3, struct kvm_xen_hvm_attr)
+
+struct kvm_xen_hvm_attr {
+	__u16 type;
+
+	union {
+		struct {
+			__u64 gfn;
+		} shared_info;
+	} u;
+};
+
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO       0x0
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */