Message ID | 20240115125707.1183-12-paul@xen.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: xen: update shared_info and vcpu_info handling | expand |
On Mon, Jan 15, 2024, Paul Durrant wrote: > @@ -638,20 +637,32 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) > } > break; > > - case KVM_XEN_ATTR_TYPE_SHARED_INFO: { > + case KVM_XEN_ATTR_TYPE_SHARED_INFO: > + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { > int idx; > > mutex_lock(&kvm->arch.xen.xen_lock); > > idx = srcu_read_lock(&kvm->srcu); > > - if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { > - kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); > - r = 0; > + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { > + if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { > + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); > + r = 0; > + } else { > + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, > + gfn_to_gpa(data->u.shared_info.gfn), > + PAGE_SIZE); > + } > } else { > - r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, > - gfn_to_gpa(data->u.shared_info.gfn), > - PAGE_SIZE); > + if (data->u.shared_info.hva == 0) { I know I said I don't care about the KVM Xen ABI, but I still think using '0' as "invalid" is ridiculous. More importantly, this code needs to check that HVA is a userspace pointer. Because __kvm_set_memory_region() performs the address checks, KVM assumes any hva that it gets out of a memslot, i.e. from a gfn, is a safe userspace address. kvm_is_error_hva() will catch most addresses, but I'm pretty sure there's still a small window where userspace could use this to write kernel memory. > + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); > + r = 0; > + } else { > + r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache, > + data->u.shared_info.hva, > + PAGE_SIZE); > + }
On 07/02/2024 04:10, Sean Christopherson wrote: > On Mon, Jan 15, 2024, Paul Durrant wrote: >> @@ -638,20 +637,32 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) >> } >> break; >> >> - case KVM_XEN_ATTR_TYPE_SHARED_INFO: { >> + case KVM_XEN_ATTR_TYPE_SHARED_INFO: >> + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { >> int idx; >> >> mutex_lock(&kvm->arch.xen.xen_lock); >> >> idx = srcu_read_lock(&kvm->srcu); >> >> - if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { >> - kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); >> - r = 0; >> + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { >> + if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { >> + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); >> + r = 0; >> + } else { >> + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, >> + gfn_to_gpa(data->u.shared_info.gfn), >> + PAGE_SIZE); >> + } >> } else { >> - r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, >> - gfn_to_gpa(data->u.shared_info.gfn), >> - PAGE_SIZE); >> + if (data->u.shared_info.hva == 0) { > > I know I said I don't care about the KVM Xen ABI, but I still think using '0' as > "invalid" is ridiculous. > I don't have any massive preference; we could define a KVM_XEN_INVALID_HVA too. > More importantly, this code needs to check that HVA is a userspace pointer. > Because __kvm_set_memory_region() performs the address checks, KVM assumes any > hva that it gets out of a memslot, i.e. from a gfn, is a safe userspace address. > > kvm_is_error_hva() will catch most addresses, but I'm pretty sure there's still > a small window where userspace could use this to write kernel memory. Ok. Good point. I'll add some appropriate checks. Paul > >> + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); >> + r = 0; >> + } else { >> + r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache, >> + data->u.shared_info.hva, >> + PAGE_SIZE); >> + }
On 07/02/2024 04:10, Sean Christopherson wrote: > On Mon, Jan 15, 2024, Paul Durrant wrote: >> @@ -638,20 +637,32 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) >> } >> break; >> >> - case KVM_XEN_ATTR_TYPE_SHARED_INFO: { >> + case KVM_XEN_ATTR_TYPE_SHARED_INFO: >> + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { >> int idx; >> >> mutex_lock(&kvm->arch.xen.xen_lock); >> >> idx = srcu_read_lock(&kvm->srcu); >> >> - if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { >> - kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); >> - r = 0; >> + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { >> + if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { >> + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); >> + r = 0; >> + } else { >> + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, >> + gfn_to_gpa(data->u.shared_info.gfn), >> + PAGE_SIZE); >> + } >> } else { >> - r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, >> - gfn_to_gpa(data->u.shared_info.gfn), >> - PAGE_SIZE); >> + if (data->u.shared_info.hva == 0) { > > I know I said I don't care about the KVM Xen ABI, but I still think using '0' as > "invalid" is ridiculous. > With the benefit of some sleep, I'm wondering why 0 is a 'ridiculous' invalid value for a *virtual* address? Surely it's essentially a numerical cast of the canonically invalid NULL pointer? Paul
On Thu, Feb 08, 2024, Paul Durrant wrote: > On 07/02/2024 04:10, Sean Christopherson wrote: > > On Mon, Jan 15, 2024, Paul Durrant wrote: > > > @@ -638,20 +637,32 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) > > > } > > > break; > > > - case KVM_XEN_ATTR_TYPE_SHARED_INFO: { > > > + case KVM_XEN_ATTR_TYPE_SHARED_INFO: > > > + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { > > > int idx; > > > mutex_lock(&kvm->arch.xen.xen_lock); > > > idx = srcu_read_lock(&kvm->srcu); > > > - if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { > > > - kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); > > > - r = 0; > > > + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { > > > + if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { > > > + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); > > > + r = 0; > > > + } else { > > > + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, > > > + gfn_to_gpa(data->u.shared_info.gfn), > > > + PAGE_SIZE); > > > + } > > > } else { > > > - r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, > > > - gfn_to_gpa(data->u.shared_info.gfn), > > > - PAGE_SIZE); > > > + if (data->u.shared_info.hva == 0) { > > > > I know I said I don't care about the KVM Xen ABI, but I still think using '0' as > > "invalid" is ridiculous. > > > > With the benefit of some sleep, I'm wondering why 0 is a 'ridiculous' > invalid value for a *virtual* address? Surely it's essentially a numerical > cast of the canonically invalid NULL pointer? It's legal to mmap() virtual address '0', albeit not by default: config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" depends on MMU default 4096 help This is the portion of low virtual memory which should be protected from userspace allocation. Keeping a user from writing to low pages can help reduce the impact of kernel NULL pointer bugs. For most ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. Programs which use vm86 functionality or have some need to map this low address space will need CAP_SYS_RAWIO or disable this protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. Obviously it's equally ridiculous that userspace would ever mmap() '0' and pass that as the shared_info, but given that this is x86-only, there are architecturally illegal addresses that can be used, at least until Intel adds LA64 ;-)
On 08/02/2024 16:48, Sean Christopherson wrote: > On Thu, Feb 08, 2024, Paul Durrant wrote: >> On 07/02/2024 04:10, Sean Christopherson wrote: >>> On Mon, Jan 15, 2024, Paul Durrant wrote: >>>> @@ -638,20 +637,32 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) >>>> } >>>> break; >>>> - case KVM_XEN_ATTR_TYPE_SHARED_INFO: { >>>> + case KVM_XEN_ATTR_TYPE_SHARED_INFO: >>>> + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { >>>> int idx; >>>> mutex_lock(&kvm->arch.xen.xen_lock); >>>> idx = srcu_read_lock(&kvm->srcu); >>>> - if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { >>>> - kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); >>>> - r = 0; >>>> + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { >>>> + if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { >>>> + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); >>>> + r = 0; >>>> + } else { >>>> + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, >>>> + gfn_to_gpa(data->u.shared_info.gfn), >>>> + PAGE_SIZE); >>>> + } >>>> } else { >>>> - r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, >>>> - gfn_to_gpa(data->u.shared_info.gfn), >>>> - PAGE_SIZE); >>>> + if (data->u.shared_info.hva == 0) { >>> >>> I know I said I don't care about the KVM Xen ABI, but I still think using '0' as >>> "invalid" is ridiculous. >>> >> >> With the benefit of some sleep, I'm wondering why 0 is a 'ridiculous' >> invalid value for a *virtual* address? Surely it's essentially a numerical >> cast of the canonically invalid NULL pointer? > > It's legal to mmap() virtual address '0', albeit not by default: > > config DEFAULT_MMAP_MIN_ADDR > int "Low address space to protect from user allocation" > depends on MMU > default 4096 > help > This is the portion of low virtual memory which should be protected > from userspace allocation. Keeping a user from writing to low pages > can help reduce the impact of kernel NULL pointer bugs. > > For most ppc64 and x86 users with lots of address space > a value of 65536 is reasonable and should cause no problems. > On arm and other archs it should not be higher than 32768. > Programs which use vm86 functionality or have some need to map > this low address space will need CAP_SYS_RAWIO or disable this > protection by setting the value to 0. > > This value can be changed after boot using the > /proc/sys/vm/mmap_min_addr tunable. > > > Obviously it's equally ridiculous that userspace would ever mmap() '0' and pass > that as the shared_info, but given that this is x86-only, there are architecturally > illegal addresses that can be used, at least until Intel adds LA64 ;-) Ok. Thanks for the reference.
On Thu, 2024-02-08 at 16:51 +0000, Paul Durrant wrote: > On 08/02/2024 16:48, Sean Christopherson wrote: > > On Thu, Feb 08, 2024, Paul Durrant wrote: > > > With the benefit of some sleep, I'm wondering why 0 is a 'ridiculous' > > > invalid value for a *virtual* address? Surely it's essentially a numerical > > > cast of the canonically invalid NULL pointer? > > > > It's legal to mmap() virtual address '0', albeit not by default: Well yes, to make dosemu work. But if you attempt to actually *do* that in C code, the compiler itself doesn't cope... $ cat foo.c int foo(int *bar) { if (bar) return 0; return *bar; } $ gcc -O2 -S -o- foo.c ... foo: .LFB0: .cfi_startproc endbr64 testq %rdi, %rdi je .L4 xorl %eax, %eax ret .p2align 4,,10 .p2align 3 .L4: movl 0, %eax ud2 .cfi_endproc .LFE0: .size foo, .-foo Note the ud2 instead of actually trying to dereference it. Using anything except NULL as the "no value" value doesn't make sense to me. It violates the principle of least surprise and would be a really bad API.
On Thu, Feb 08, 2024, David Woodhouse wrote: > Using anything except NULL as the "no value" value doesn't make sense > to me. It violates the principle of least surprise and would be a > really bad API. I'm a-ok with using '0'. My only request is to check for "!hva" as opposed to "hva == 0", both because that's preferred kernel style, and because it better conveys that it's really checking for !NULL as opposed to address '0'. Side topic, I think the code will end up in a more readable state if the GFN vs. HVA sub-commands are handled in separate case statements, especially if/when xen_lock goes away. E.g. something like this: case KVM_XEN_ATTR_TYPE_SHARED_INFO: { int idx; if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); r = 0; break; } idx = srcu_read_lock(&kvm->srcu); r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, gfn_to_gpa(data->u.shared_info.gfn), PAGE_SIZE); if (!r && kvm->arch.xen.shinfo_cache.active) r = kvm_xen_shared_info_init(kvm); srcu_read_unlock(&kvm->srcu, idx); break; } case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { unsigned long hva = data->u.shared_info.hva; if (hva != untagged_addr(hva) || !access_ok((void __user *)hva) || !PAGE_ALIGNED(hva)) { r = -EINVAL; break; } if (!hva) { kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); r = 0; break; } r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache, hva, PAGE_SIZE); if (!r && kvm->arch.xen.shinfo_cache.active) r = kvm_xen_shared_info_init(kvm); break; } Side topic #2, the above requires that __kvm_gpc_refresh() not grab kvm_memslots() in the "using an hva" path, but I think that'd actually be a good thing as it would make it a bit more clear that using an hva bypasses memslots by design.
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 3ec0b7a455a0..3372be85b335 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -372,7 +372,7 @@ The bits in the dirty bitmap are cleared before the ioctl returns, unless KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, see the description of the capability. -Note that the Xen shared info page, if configured, shall always be assumed +Note that the Xen shared_info page, if configured, shall always be assumed to be dirty. KVM will not explicitly mark it such. @@ -5487,8 +5487,9 @@ KVM_PV_ASYNC_CLEANUP_PERFORM __u8 long_mode; __u8 vector; __u8 runstate_update_flag; - struct { + union { __u64 gfn; + __u64 hva; } shared_info; struct { __u32 send_port; @@ -5516,10 +5517,10 @@ type values: KVM_XEN_ATTR_TYPE_LONG_MODE Sets the ABI mode of the VM to 32-bit or 64-bit (long mode). This - determines the layout of the shared info pages exposed to the VM. + determines the layout of the shared_info page exposed to the VM. KVM_XEN_ATTR_TYPE_SHARED_INFO - Sets the guest physical frame number at which the Xen "shared info" + Sets the guest physical frame number at which the Xen shared_info page resides. Note that although Xen places vcpu_info for the first 32 vCPUs in the shared_info page, KVM does not automatically do so and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used @@ -5528,7 +5529,7 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO not be aware of the Xen CPU id which is used as the index into the vcpu_info[] array, so may know the correct default location. - Note that the shared info page may be constantly written to by KVM; + Note that the shared_info page may be constantly written to by KVM; it contains the event channel bitmap used to deliver interrupts to a Xen guest, amongst other things. It is exempt from dirty tracking mechanisms — KVM will not explicitly mark the page as dirty each @@ -5537,9 +5538,21 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO any vCPU has been running or any event channel interrupts can be routed to the guest. - Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared info + Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared_info page. +KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA + If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the + Xen capabilities, then this attribute may be used to set the + userspace address at which the shared_info page resides, which + will always be fixed in the VMM regardless of where it is mapped + in guest physical address space. This attribute should be used in + preference to KVM_XEN_ATTR_TYPE_SHARED_INFO as it avoids + unnecessary invalidation of an internal cache when the page is + re-mapped in guest physcial address space. + + Setting the hva to zero will disable the shared_info page. + KVM_XEN_ATTR_TYPE_UPCALL_VECTOR Sets the exception vector used to deliver Xen event channel upcalls. This is the HVM-wide vector injected directly by the hypervisor diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index d595d476a5b3..813d63a8703a 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -617,7 +617,6 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; - switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { @@ -638,20 +637,32 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) } break; - case KVM_XEN_ATTR_TYPE_SHARED_INFO: { + case KVM_XEN_ATTR_TYPE_SHARED_INFO: + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { int idx; mutex_lock(&kvm->arch.xen.xen_lock); idx = srcu_read_lock(&kvm->srcu); - if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { - kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); - r = 0; + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { + if (data->u.shared_info.gfn == KVM_XEN_INVALID_GFN) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, + gfn_to_gpa(data->u.shared_info.gfn), + PAGE_SIZE); + } } else { - r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, - gfn_to_gpa(data->u.shared_info.gfn), - PAGE_SIZE); + if (data->u.shared_info.hva == 0) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache, + data->u.shared_info.hva, + PAGE_SIZE); + } } srcu_read_unlock(&kvm->srcu, idx); @@ -715,13 +726,23 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_cache.active) + if (kvm->arch.xen.shinfo_cache.active && + kvm->arch.xen.shinfo_cache.gpa != KVM_XEN_INVALID_GPA) data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); else data->u.shared_info.gfn = KVM_XEN_INVALID_GFN; r = 0; break; + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: + if (kvm->arch.xen.shinfo_cache.active && + kvm->arch.xen.shinfo_cache.gpa == KVM_XEN_INVALID_GPA) + data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva; + else + data->u.shared_info.hva = 0; + r = 0; + break; + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: data->u.vector = kvm->arch.xen.upcall_vector; r = 0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c3308536482b..ac5caba313d1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1246,6 +1246,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) struct kvm_xen_hvm_config { __u32 flags; @@ -1744,9 +1745,10 @@ struct kvm_xen_hvm_attr { __u8 long_mode; __u8 vector; __u8 runstate_update_flag; - struct { + union { __u64 gfn; #define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; } shared_info; struct { __u32 send_port; @@ -1788,6 +1790,8 @@ struct kvm_xen_hvm_attr { #define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ #define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)