Message ID | 1629192673-9911-4-git-send-email-robert.hu@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM/x86/nVMX: Add field existence support in VMCS12 | expand |
On Tue, Aug 17, 2021, Robert Hoo wrote: > In vmcs12_{read,write}_any(), check the field exist or not. If not, return > failure. Hence their function prototype changed a little accordingly. > In handle_vm{read,write}(), above function's caller, check return value, if > failed, emulate nested vmx fail with instruction error of > VMXERR_UNSUPPORTED_VMCS_COMPONENT. > > Signed-off-by: Robert Hoo <robert.hu@linux.intel.com> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> Assuming Yu is a co-author, this needs to be: Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> Signed-off-by: Robert Hoo <robert.hu@linux.intel.com> See "When to use Acked-by:, Cc:, and Co-developed-by:" in Documentation/process/submitting-patches.rst. > --- > arch/x86/kvm/vmx/nested.c | 20 ++++++++++++------ > arch/x86/kvm/vmx/vmcs12.h | 43 ++++++++++++++++++++++++++++++--------- > 2 files changed, 47 insertions(+), 16 deletions(-) > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c > index b8121f8f6d96..9a35953ede22 100644 > --- a/arch/x86/kvm/vmx/nested.c > +++ b/arch/x86/kvm/vmx/nested.c > @@ -1547,7 +1547,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) > for (i = 0; i < max_shadow_read_write_fields; i++) { > field = shadow_read_write_fields[i]; > val = __vmcs_readl(field.encoding); > - vmcs12_write_any(vmcs12, field.encoding, field.offset, val); > + vmcs12_write_any(vmcs12, field.encoding, field.offset, val, > + vmx->nested.vmcs12_field_existence_bitmap); There is no need to perform existence checks when KVM is copying to/from vmcs12, the checks are only needed for VMREAD and VMWRITE. Architecturally, the VMCS is an opaque blob, software cannot rely on any assumptions about its layout or data, i.e. KVM is free to read/write whatever it wants. VMREAD and VMWRITE need to be enforced because architecturally they are defined to fail if the field does not exist. 
Limiting this to VMREAD/VMWRITE means we shouldn't need a bitmap and can use a more static lookup, e.g. a switch statement. And an idea to optimize for fields that unconditionally exist would be to use bit 0 in the field->offset table to denote conditional fields, e.g. the VMREAD/VMRITE lookups could be something like: diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bc6327950657..ef8c48f80d1a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5064,7 +5064,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - offset = vmcs_field_to_offset(field); + offset = vmcs_field_to_offset(vmx, field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); @@ -5167,7 +5167,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - offset = vmcs_field_to_offset(field); + offset = vmcs_field_to_offset(vmx, field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 2a45f026ee11..3c27631e0119 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -364,7 +364,8 @@ static inline void vmx_check_vmcs12_offsets(void) extern const unsigned short vmcs_field_to_offset_table[]; extern const unsigned int nr_vmcs12_fields; -static inline short vmcs_field_to_offset(unsigned long field) +static inline short vmcs_field_to_offset(struct vcpu_vmx *vmx, + unsigned long field) { unsigned short offset; unsigned int index; @@ -378,9 +379,10 @@ static inline short vmcs_field_to_offset(unsigned long field) index = array_index_nospec(index, nr_vmcs12_fields); offset = vmcs_field_to_offset_table[index]; - if (offset == 0) + if (offset == 0 || + ((offset & 1) && !vmcs12_field_exists(vmx, field))) return -ENOENT; - return offset; + 
return offset & ~1; } static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field,
On Tue, 2021-08-17 at 15:54 +0000, Sean Christopherson wrote: > On Tue, Aug 17, 2021, Robert Hoo wrote: > > In vmcs12_{read,write}_any(), check the field exist or not. If not, > > return > > failure. Hence their function prototype changed a little > > accordingly. > > In handle_vm{read,write}(), above function's caller, check return > > value, if > > failed, emulate nested vmx fail with instruction error of > > VMXERR_UNSUPPORTED_VMCS_COMPONENT. > > > > Signed-off-by: Robert Hoo <robert.hu@linux.intel.com> > > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > > Assuming Yu is a co-author, this needs to be: > > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Robert Hoo <robert.hu@linux.intel.com> > > See "When to use Acked-by:, Cc:, and Co-developed-by:" in > Documentation/process/submitting-patches.rst. OK, thanks. > > > --- > > arch/x86/kvm/vmx/nested.c | 20 ++++++++++++------ > > arch/x86/kvm/vmx/vmcs12.h | 43 ++++++++++++++++++++++++++++++----- > > ---- > > 2 files changed, 47 insertions(+), 16 deletions(-) > > > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c > > index b8121f8f6d96..9a35953ede22 100644 > > --- a/arch/x86/kvm/vmx/nested.c > > +++ b/arch/x86/kvm/vmx/nested.c > > @@ -1547,7 +1547,8 @@ static void copy_shadow_to_vmcs12(struct > > vcpu_vmx *vmx) > > for (i = 0; i < max_shadow_read_write_fields; i++) { > > field = shadow_read_write_fields[i]; > > val = __vmcs_readl(field.encoding); > > - vmcs12_write_any(vmcs12, field.encoding, field.offset, > > val); > > + vmcs12_write_any(vmcs12, field.encoding, field.offset, > > val, > > + vmx- > > >nested.vmcs12_field_existence_bitmap); > > There is no need to perform existence checks when KVM is copying > to/from vmcs12, > the checks are only needed for VMREAD and VMWRITE. Architecturally, > the VMCS is > an opaque blob, software cannot rely on any assumptions about its > layout or data, > i.e. 
KVM is free to read/write whatever it wants. VMREAD and > VMWRITE need to be > enforced because architecturally they are defined to fail if the > field does not exist. OK, agree. > > Limiting this to VMREAD/VMWRITE means we shouldn't need a bitmap and > can use a > more static lookup, e.g. a switch statement. Emm, hard for me to choose: Your approach sounds more efficient for CPU: Once VMX MSR's updated, no bother to update the bitmap. Each field's existence check will directly consult related VMX MSR. Well, the switch statement will be long... My this implementation: once VMX MSR's updated, the update needs to be passed to bitmap, this is 1 extra step comparing to aforementioned above. But, later, when query field existence, especially the those consulting vm{entry,exit}_ctrl, they usually would have to consult both MSRs if otherwise no bitmap, and we cannot guarantee if in the future there's no more complicated dependencies. If using bitmap, this consult is just 1-bit reading. If no bitmap, several MSR's read and compare happen. And, VMX MSR --> bitmap, usually happens only once when vCPU model is settled. But VMRead/VMWrite might happen frequently, depends on guest itself. I'd rather leave complicated comparison in former than in later. > And an idea to optimize for fields > that unconditionally exist would be to use bit 0 in the field->offset > table to > denote conditional fields, e.g. the VMREAD/VMRITE lookups could be > something like: Though all fields offset is even today, can we assert no new odd-offset field won't be added some day? And, what if some day, some field's conditional/unconditional existence depends on CPU model? 
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c > index bc6327950657..ef8c48f80d1a 100644 > --- a/arch/x86/kvm/vmx/nested.c > +++ b/arch/x86/kvm/vmx/nested.c > @@ -5064,7 +5064,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) > /* Decode instruction info and find the field to read */ > field = kvm_register_read(vcpu, (((instr_info) >> 28) & > 0xf)); > > - offset = vmcs_field_to_offset(field); > + offset = vmcs_field_to_offset(vmx, field); > if (offset < 0) > return nested_vmx_fail(vcpu, > VMXERR_UNSUPPORTED_VMCS_COMPONENT); > > @@ -5167,7 +5167,7 @@ static int handle_vmwrite(struct kvm_vcpu > *vcpu) > > field = kvm_register_read(vcpu, (((instr_info) >> 28) & > 0xf)); > > - offset = vmcs_field_to_offset(field); > + offset = vmcs_field_to_offset(vmx, field); > if (offset < 0) > return nested_vmx_fail(vcpu, > VMXERR_UNSUPPORTED_VMCS_COMPONENT); > > diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h > index 2a45f026ee11..3c27631e0119 100644 > --- a/arch/x86/kvm/vmx/vmcs12.h > +++ b/arch/x86/kvm/vmx/vmcs12.h > @@ -364,7 +364,8 @@ static inline void vmx_check_vmcs12_offsets(void) > extern const unsigned short vmcs_field_to_offset_table[]; > extern const unsigned int nr_vmcs12_fields; > > -static inline short vmcs_field_to_offset(unsigned long field) > +static inline short vmcs_field_to_offset(struct vcpu_vmx *vmx, > + unsigned long field) > { > unsigned short offset; > unsigned int index; > @@ -378,9 +379,10 @@ static inline short > vmcs_field_to_offset(unsigned long field) > > index = array_index_nospec(index, nr_vmcs12_fields); > offset = vmcs_field_to_offset_table[index]; > - if (offset == 0) > + if (offset == 0 || > + ((offset & 1) && !vmcs12_field_exists(vmx, field))) > return -ENOENT; > - return offset; > + return offset & ~1; > } > > static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned > long field,
On Wed, Aug 18, 2021, Robert Hoo wrote: > > Limiting this to VMREAD/VMWRITE means we shouldn't need a bitmap and > > can use a more static lookup, e.g. a switch statement. > Emm, hard for me to choose: > > Your approach sounds more efficient for CPU: Once VMX MSR's updated, no > bother to update the bitmap. Each field's existence check will directly > consult related VMX MSR. Well, the switch statement will be long... How long? Honest question, off the top of my head I don't have a feel for how many fields conditionally exist. > My this implementation: once VMX MSR's updated, the update needs to be > passed to bitmap, this is 1 extra step comparing to aforementioned > above. But, later, when query field existence, especially the those > consulting vm{entry,exit}_ctrl, they usually would have to consult both > MSRs if otherwise no bitmap, and we cannot guarantee if in the future > there's no more complicated dependencies. If using bitmap, this consult > is just 1-bit reading. If no bitmap, several MSR's read and compare > happen. Yes, but the bitmap is per-VM and likely may or may not be cache-hot for back-to-back VMREAD/VMWRITE to different fields, whereas the shadow controls are much more likely to reside somewhere in the caches. > And, VMX MSR --> bitmap, usually happens only once when vCPU model is > settled. But VMRead/VMWrite might happen frequently, depends on guest > itself. I'd rather leave complicated comparison in former than in > later. I'm not terribly concerned about the runtime performance, it's the extra per-VM allocation for something that's not thaaat interesting that I don't like. And for performance, most of the frequently accessed VMCS fields will be shadowed anyways, i.e. won't VM-Exit in the first place. And that brings up another wrinkle. The shadow VMCS bitmaps are global across all VMs, e.g. if the preemption timer is supported in hardware but hidden from L1, then a misbehaving L1 can VMREAD/VMWRITE the field even with this patch. 
If it was just the preemption timer we could consider disabling shadow VMCS for the VM if the timer exists but is hidden from L1, but GUEST_PML_INDEX and GUEST_INTR_STATUS are also conditional :-( Maybe there's a middle ground, e.g. let userspace tell KVM which fields it plans on exposing to L1, use that to build the bitmaps, and disable shadow VMCS if userspace creates VMs that don't match the specified configuration. Burning three more pages per VM isn't very enticing... This is quite the complicated mess for something I'm guessing no one actually cares about. At what point do we chalk this up as a virtualization hole and sweep it under the rug?
On Wed, Aug 18, 2021 at 4:11 PM Sean Christopherson <seanjc@google.com> wrote: > > On Wed, Aug 18, 2021, Robert Hoo wrote: > > > Limiting this to VMREAD/VMWRITE means we shouldn't need a bitmap and > > > can use a more static lookup, e.g. a switch statement. > > Emm, hard for me to choose: > > > > Your approach sounds more efficient for CPU: Once VMX MSR's updated, no > > bother to update the bitmap. Each field's existence check will directly > > consult related VMX MSR. Well, the switch statement will be long... > > How long? Honest question, off the top of my head I don't have a feel for how > many fields conditionally exist. > > > My this implementation: once VMX MSR's updated, the update needs to be > > passed to bitmap, this is 1 extra step comparing to aforementioned > > above. But, later, when query field existence, especially the those > > consulting vm{entry,exit}_ctrl, they usually would have to consult both > > MSRs if otherwise no bitmap, and we cannot guarantee if in the future > > there's no more complicated dependencies. If using bitmap, this consult > > is just 1-bit reading. If no bitmap, several MSR's read and compare > > happen. > > Yes, but the bitmap is per-VM and likely may or may not be cache-hot for back-to-back > VMREAD/VMWRITE to different fields, whereas the shadow controls are much more likely > to reside somewhere in the caches. > > > And, VMX MSR --> bitmap, usually happens only once when vCPU model is > > settled. But VMRead/VMWrite might happen frequently, depends on guest > > itself. I'd rather leave complicated comparison in former than in > > later. > > I'm not terribly concerned about the runtime performance, it's the extra per-VM > allocation for something that's not thaaat interesting that I don't like. > > And for performance, most of the frequently accessed VMCS fields will be shadowed > anyways, i.e. won't VM-Exit in the first place. > > And that brings up another wrinkle. 
The shadow VMCS bitmaps are global across > all VMs, e.g. if the preemption timer is supported in hardware but hidden from > L1, then a misbehaving L1 can VMREAD/VMWRITE the field even with this patch. > If it was just the preemption timer we could consider disabling shadow VMCS for > the VM ifthe timer exists but is hidden from L1, but GUEST_PML_INDEX and > GUEST_INTR_STATUS are also conditional :-( > > Maybe there's a middle ground, e.g. let userspace tell KVM which fields it plans > on exposing to L1, use that to build the bitmaps, and disable shadow VMCS if > userspace creates VMs that don't match the specified configuration. Burning > three more pages per VM isn't very enticing... > > This is quite the complicated mess for something I'm guessing no one actually > cares about. At what point do we chalk this up as a virtualization hole and > sweep it under the rug? Good point! Note that hardware doesn't even get this right. See erratum CF77 in https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/xeon-e7-v2-spec-update.pdf. I'd cut and paste the text here, but Intel won't allow that.
On Wed, Aug 18, 2021, Jim Mattson wrote: > On Wed, Aug 18, 2021 at 4:11 PM Sean Christopherson <seanjc@google.com> wrote: > > This is quite the complicated mess for something I'm guessing no one actually > > cares about. At what point do we chalk this up as a virtualization hole and > > sweep it under the rug? > > Good point! Note that hardware doesn't even get this right. See > erratum CF77 in > https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/xeon-e7-v2-spec-update.pdf. > I'd cut and paste the text here, but Intel won't allow that. Ha! KVM's behavior is a feature, not a bug, we're just matching hardware! ;-)
On Wed, 2021-08-18 at 23:10 +0000, Sean Christopherson wrote: > On Wed, Aug 18, 2021, Robert Hoo wrote: > > > Limiting this to VMREAD/VMWRITE means we shouldn't need a bitmap > > > and > > > can use a more static lookup, e.g. a switch statement. > > > > Emm, hard for me to choose: > > > > Your approach sounds more efficient for CPU: Once VMX MSR's > > updated, no > > bother to update the bitmap. Each field's existence check will > > directly > > consult related VMX MSR. Well, the switch statement will be long... > > How long? Honest question, off the top of my head I don't have a > feel for how > many fields conditionally exist. Per my just manual count, ~51 fields till today. > > > My this implementation: once VMX MSR's updated, the update needs to > > be > > passed to bitmap, this is 1 extra step comparing to aforementioned > > above. But, later, when query field existence, especially the those > > consulting vm{entry,exit}_ctrl, they usually would have to consult > > both > > MSRs if otherwise no bitmap, and we cannot guarantee if in the > > future > > there's no more complicated dependencies. If using bitmap, this > > consult > > is just 1-bit reading. If no bitmap, several MSR's read and compare > > happen. > > Yes, but the bitmap is per-VM and likely may or may not be cache-hot > for back-to-back > VMREAD/VMWRITE to different fields, whereas the shadow controls are > much more likely > to reside somewhere in the caches. Sorry I don't quite understand the "shadow controls" here. Do you mean shadow VMCS? what does field existence to do with shadow VMCS? emm, here you indeed remind me a questions: what if L1 VMREAD/VMWRITE a shadow field that doesn't exist? If your here "shadow controls" means nested_vmx.nested_vmx_msrs, they're like bitmap, per-vCPU, I think no essential difference for their cache hit possibilities. BTW, till current VMCS12 size, the bitmap can be contained in a cache line. 
> > > And, VMX MSR --> bitmap, usually happens only once when vCPU model > > is > > settled. But VMRead/VMWrite might happen frequently, depends on > > guest > > itself. I'd rather leave complicated comparison in former than in > > later. > > I'm not terribly concerned about the runtime performance, it's the > extra per-VM > allocation for something that's not thaaat interesting that I don't > like. OK, it's even further, per-vCPU/vmx ;) > > And for performance, most of the frequently accessed VMCS fields will > be shadowed > anyways, i.e. won't VM-Exit in the first place. > > And that brings up another wrinkle. The shadow VMCS bitmaps are > global across > all VMs, OK, that's the problem. Ideally, it should be per-VM or per-vCPU, but that means each VM/vCPU will consume 2 more pages for vm{read,write} bitmap. > e.g. if the preemption timer is supported in hardware but hidden from > L1, then a misbehaving L1 can VMREAD/VMWRITE the field even with this > patch. > If it was just the preemption timer we could consider disabling > shadow VMCS for > the VM ifthe timer exists but is hidden from L1, but GUEST_PML_INDEX > and > GUEST_INTR_STATUS are also conditional :-( Yes, if the vm{read,write}-bitmap is KVM global, cannot implement field existence with shadow VMCS functioning. I don't think it's right. It just did't cause any trouble until we consider today's field existence implementation. If we stringently implement this per spec, i.e. each VMCS has its own vm{read,write}-bitmap, or at least each VM has its own, then doable. > > Maybe there's a middle ground, e.g. let userspace tell KVM which > fields it plans > on exposing to L1, use that to build the bitmaps, and disable shadow > VMCS if > userspace creates VMs that don't match the specified configuration. Here "specific configuration" means: if KVM vm{write,read}-bitmap enables some L1 non-exist field shadow read/write, we turn of shadow VMCS for that VM, right? 
I guess user would rather abandon this field existence check for VMCS shadowing. > Burning > three more pages per VM isn't very enticing... Why 3 more? I count 2 more pages, i.e. vm{read,write}-bitmap. And, just 2 pages (8KB) per VM isn't huge consumption, is it? ;) > > This is quite the complicated mess for something I'm guessing no one > actually > cares about. At what point do we chalk this up as a virtualization > hole and > sweep it under the rug? Yes, too complicated, beyond my imagination of vmcs12 field existence implementation at first. I guess perhaps the original guy who hard coded nested_msr.vmcs_enum had tried this before ;)
On Thu, Aug 19, 2021, Robert Hoo wrote: > On Wed, 2021-08-18 at 23:10 +0000, Sean Christopherson wrote: > > > My this implementation: once VMX MSR's updated, the update needs to be > > > passed to bitmap, this is 1 extra step comparing to aforementioned above. > > > But, later, when query field existence, especially the those consulting > > > vm{entry,exit}_ctrl, they usually would have to consult both MSRs if > > > otherwise no bitmap, and we cannot guarantee if in the future there's no > > > more complicated dependencies. If using bitmap, this consult is just > > > 1-bit reading. If no bitmap, several MSR's read and compare happen. > > > > Yes, but the bitmap is per-VM and likely may or may not be cache-hot for > > back-to-back VMREAD/VMWRITE to different fields, whereas the shadow > > controls are much more likely to reside somewhere in the caches. > > Sorry I don't quite understand the "shadow controls" here. Do you mean > shadow VMCS? what does field existence to do with shadow VMCS? vmcs->controls_shadow.* > emm, here you indeed remind me a questions: what if L1 VMREAD/VMWRITE a > shadow field that doesn't exist? Doesn't exist in hardware? KVM will intercept the access by leaving the corresponding bit set in the VMREAD/VMWRITE bitmaps. This is handled by init_vmcs_shadow_fields(). Note, KVM will still incorrectly emulate the access, but on the plus side that means L2 will see consistent behavior regardless of underlying hardware.
On Wed, 2021-09-01 at 20:42 +0000, Sean Christopherson wrote: > On Thu, Aug 19, 2021, Robert Hoo wrote: > > On Wed, 2021-08-18 at 23:10 +0000, Sean Christopherson wrote: > > > > My this implementation: once VMX MSR's updated, the update > > > > needs to be > > > > passed to bitmap, this is 1 extra step comparing to > > > > aforementioned above. > > > > But, later, when query field existence, especially the those > > > > consulting > > > > vm{entry,exit}_ctrl, they usually would have to consult both > > > > MSRs if > > > > otherwise no bitmap, and we cannot guarantee if in the future > > > > there's no > > > > more complicated dependencies. If using bitmap, this consult is > > > > just > > > > 1-bit reading. If no bitmap, several MSR's read and compare > > > > happen. > > > > > > Yes, but the bitmap is per-VM and likely may or may not be cache- > > > hot for > > > back-to-back VMREAD/VMWRITE to different fields, whereas the > > > shadow > > > controls are much more likely to reside somewhere in the caches. > > > > Sorry I don't quite understand the "shadow controls" here. Do you > > mean > > shadow VMCS? what does field existence to do with shadow VMCS? > > vmcs->controls_shadow.* OK, I see now. But I still don't understand why is these shadow controls related to field existence. They not in handle_vm{read,write}() path. Would you shed more light? Thanks. > > > emm, here you indeed remind me a questions: what if L1 > > VMREAD/VMWRITE a > > shadow field that doesn't exist? > > Doesn't exist in hardware? I mean doesn't exist in VMCS12, per the bitmap. In this case, when L1 read/write the field, it is shadowed, won't be trapped by vmx. > KVM will intercept the access by leaving the > corresponding bit set in the VMREAD/VMWRITE bitmaps. This is handled > by > init_vmcs_shadow_fields(). Note, KVM will still incorrectly emulate > the access, > but on the plus side that means L2 will see consistent behavior > regardless of > underlying hardware.
On Fri, Sep 03, 2021, Robert Hoo wrote: > On Wed, 2021-09-01 at 20:42 +0000, Sean Christopherson wrote: > > On Thu, Aug 19, 2021, Robert Hoo wrote: > > > On Wed, 2021-08-18 at 23:10 +0000, Sean Christopherson wrote: > > > > > My this implementation: once VMX MSR's updated, the update needs to > > > > > be passed to bitmap, this is 1 extra step comparing to aforementioned > > > > > above. But, later, when query field existence, especially the those > > > > > consulting vm{entry,exit}_ctrl, they usually would have to consult > > > > > both MSRs if otherwise no bitmap, and we cannot guarantee if in the > > > > > future there's no more complicated dependencies. If using bitmap, > > > > > this consult is just 1-bit reading. If no bitmap, several MSR's read > > > > > and compare happen. > > > > > > > > Yes, but the bitmap is per-VM and likely may or may not be cache- hot > > > > for back-to-back VMREAD/VMWRITE to different fields, whereas the shadow > > > > controls are much more likely to reside somewhere in the caches. > > > > > > Sorry I don't quite understand the "shadow controls" here. Do you mean > > > shadow VMCS? what does field existence to do with shadow VMCS? > > > > vmcs->controls_shadow.* > > OK, I see now. But I still don't understand why is these shadow > controls related to field existence. They not in > handle_vm{read,write}() path. Would you shed more light? Thanks. Hmm, you're confused because my comment about the controls shadows is nonsensical. I conflated the vmcs->controls_shadow.* with vmx->nested.msrs.*. Sorry for the confusion :-/
On Fri, 2021-09-03 at 15:11 +0000, Sean Christopherson wrote:
> ...
Hi Sean,
Sorry for the late reply. Multitasking, you know ;-)
The discussion about this patch has gone on for a long time and has
diverged, actually. Let me summarize our previous discussions, so that we
can converge and settle on a direction.
* When copying to/from the shadow VMCS, there is no need to validate field existence.
-- I agree.
* Now that only VMREAD/VMWRITE need to validate field existence, we can
use a static check instead of a bitmap.
* And borrow bit 0 in the field->offset table to denote conditional
fields.
Because:
The shadow controls are more likely to be cache-hot than the
bitmap.
The bitmap is per-VMX, and the additional memory allocation is not
worthwhile.
Robert argued:
I still prefer to use a bitmap to denote conditional fields.
If we used a static switch-case check rather than a bitmap, the
switch-case would be very long: to date there are ~51 conditional fields.
Though it is very unlikely, we cannot guarantee that bit 0 of a
field->offset table entry will never be used in the future.
From the perspective of runtime efficiency, reading the bitmap is better
than doing a static check every time.
From the perspective of cache-hit chance, the shadow controls (or
nested_vmx_msrs) and the bitmap both live in the nested structure, so I
don't think there is an essential difference between them.
The bitmap is just 62 bytes long now; I think that's tolerable. :)
* Interaction with Shadow VMCS -- for shadowed fields, we cannot
trap their reads/writes, and therefore cannot check their existence against
the VMX configuration that L0 set for L1.
This last point is the messiest one.
To solve this, you proposed, as a middle ground,
disabling shadow VMCS entirely when the user-space settings conflict with
what KVM figured out.
You also said, "This is quite the complicated mess for
something I'm guessing no one actually cares about. At what point do
we chalk this up as a virtualization hole and sweep it under the rug?"
-- I couldn't agree more.
We think disabling shadow VMCS entirely is not good under any
circumstances, for the sake of nested performance, etc.
We think there are 2 ways ahead:
1) Leave things as they are today, i.e. discard this patch set.
Perhaps we can add a build-time check that forces the hard-coded
assignment to vmcs-enum-max to be updated when necessary.
2) Make the shadow VMCS bitmaps per-VM. This would require allocating 2
(or 3?) more pages (shadow read/write bitmaps) per VM. Then we can
configure the shadow VMCS bitmaps according to the user-space configuration,
i.e. not shadow the conditional VMCS fields, and force their reads/writes to
go through the handle_vm{read,write} gate.
So, Sean, can you help converge our discussion and settle next step?
Thanks.:-)
On Tue, Sep 28, 2021, Robert Hoo wrote: > On Fri, 2021-09-03 at 15:11 +0000, Sean Christopherson wrote: > You also said, "This is quite the complicated mess for > something I'm guessing no one actually cares about. At what point do > we chalk this up as a virtualization hole and sweep it under the rug?" > -- I couldn't agree more. ... > So, Sean, can you help converge our discussion and settle next step? Any objection to simply keeping KVM's current behavior, i.e. sweeping this under the proverbial rug?
On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson <seanjc@google.com> wrote: > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > On Fri, 2021-09-03 at 15:11 +0000, Sean Christopherson wrote: > > You also said, "This is quite the complicated mess for > > something I'm guessing no one actually cares about. At what point do > > we chalk this up as a virtualization hole and sweep it under the rug?" > > -- I couldn't agree more. > > ... > > > So, Sean, can you help converge our discussion and settle next step? > > Any objection to simply keeping KVM's current behavior, i.e. sweeping this under > the proverbial rug? Adding 8 KiB per vCPU seems like no big deal to me, but, on the other hand, Paolo recently argued that slightly less than 1 KiB per vCPU was unreasonable for VM-exit statistics, so maybe I've got a warped perspective. I'm all for pedantic adherence to the specification, but I have to admit that no actual hypervisor is likely to care (or ever will).
On Tue, Oct 05, 2021, Jim Mattson wrote: > On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson <seanjc@google.com> wrote: > > > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > > On Fri, 2021-09-03 at 15:11 +0000, Sean Christopherson wrote: > > > You also said, "This is quite the complicated mess for > > > something I'm guessing no one actually cares about. At what point do > > > we chalk this up as a virtualization hole and sweep it under the rug?" > > > -- I couldn't agree more. > > > > ... > > > > > So, Sean, can you help converge our discussion and settle next step? > > > > Any objection to simply keeping KVM's current behavior, i.e. sweeping this under > > the proverbial rug? > > Adding 8 KiB per vCPU seems like no big deal to me, but, on the other > hand, Paolo recently argued that slightly less than 1 KiB per vCPU was > unreasonable for VM-exit statistics, so maybe I've got a warped > perspective. I'm all for pedantic adherence to the specification, but > I have to admit that no actual hypervisor is likely to care (or ever > will). It's not just the memory, it's also the complexity, e.g. to get VMCS shadowing working correctly, both now and in the future.
On Tue, Oct 5, 2021 at 10:59 AM Sean Christopherson <seanjc@google.com> wrote: > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson <seanjc@google.com> wrote: > > > > > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > > > On Fri, 2021-09-03 at 15:11 +0000, Sean Christopherson wrote: > > > > You also said, "This is quite the complicated mess for > > > > something I'm guessing no one actually cares about. At what point do > > > > we chalk this up as a virtualization hole and sweep it under the rug?" > > > > -- I couldn't agree more. > > > > > > ... > > > > > > > So, Sean, can you help converge our discussion and settle next step? > > > > > > Any objection to simply keeping KVM's current behavior, i.e. sweeping this under > > > the proverbial rug? > > > > Adding 8 KiB per vCPU seems like no big deal to me, but, on the other > > hand, Paolo recently argued that slightly less than 1 KiB per vCPU was > > unreasonable for VM-exit statistics, so maybe I've got a warped > > perspective. I'm all for pedantic adherence to the specification, but > > I have to admit that no actual hypervisor is likely to care (or ever > > will). > > It's not just the memory, it's also the complexity, e.g. to get VMCS shadowing > working correctly, both now and in the future. As far as CPU feature virtualization goes, this one doesn't seem that complex to me. It's not anywhere near as complex as virtualizing MTF, for instance, and KVM *claims* to do that! :-)
On Tue, Oct 05, 2021, Jim Mattson wrote: > On Tue, Oct 5, 2021 at 10:59 AM Sean Christopherson <seanjc@google.com> wrote: > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson <seanjc@google.com> wrote: > > > > > > > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > > > > On Fri, 2021-09-03 at 15:11 +0000, Sean Christopherson wrote: > > > > > You also said, "This is quite the complicated mess for > > > > > something I'm guessing no one actually cares about. At what point do > > > > > we chalk this up as a virtualization hole and sweep it under the rug?" > > > > > -- I couldn't agree more. > > > > > > > > ... > > > > > > > > > So, Sean, can you help converge our discussion and settle next step? > > > > > > > > Any objection to simply keeping KVM's current behavior, i.e. sweeping this under > > > > the proverbial rug? > > > > > > Adding 8 KiB per vCPU seems like no big deal to me, but, on the other > > > hand, Paolo recently argued that slightly less than 1 KiB per vCPU was > > > unreasonable for VM-exit statistics, so maybe I've got a warped > > > perspective. I'm all for pedantic adherence to the specification, but > > > I have to admit that no actual hypervisor is likely to care (or ever > > > will). > > > > It's not just the memory, it's also the complexity, e.g. to get VMCS shadowing > > working correctly, both now and in the future. > > As far as CPU feature virtualization goes, this one doesn't seem that > complex to me. It's not anywhere near as complex as virtualizing MTF, > for instance, and KVM *claims* to do that! :-) There aren't many things as complex as MTF. But unlike MTF, this behavior doesn't have a concrete use case to justify the risk vs. reward. IMO the odds of us breaking something in KVM for "normal" use cases are higher than the odds of an L1 VMM breaking because a VMREAD/VMWRITE didn't fail when it technically should have failed.
On Tue, Oct 5, 2021 at 1:50 PM Sean Christopherson <seanjc@google.com> wrote: > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > On Tue, Oct 5, 2021 at 10:59 AM Sean Christopherson <seanjc@google.com> wrote: > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson <seanjc@google.com> wrote: > > > > > > > > > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > > > > > On Fri, 2021-09-03 at 15:11 +0000, Sean Christopherson wrote: > > > > > > You also said, "This is quite the complicated mess for > > > > > > something I'm guessing no one actually cares about. At what point do > > > > > > we chalk this up as a virtualization hole and sweep it under the rug?" > > > > > > -- I couldn't agree more. > > > > > > > > > > ... > > > > > > > > > > > So, Sean, can you help converge our discussion and settle next step? > > > > > > > > > > Any objection to simply keeping KVM's current behavior, i.e. sweeping this under > > > > > the proverbial rug? > > > > > > > > Adding 8 KiB per vCPU seems like no big deal to me, but, on the other > > > > hand, Paolo recently argued that slightly less than 1 KiB per vCPU was > > > > unreasonable for VM-exit statistics, so maybe I've got a warped > > > > perspective. I'm all for pedantic adherence to the specification, but > > > > I have to admit that no actual hypervisor is likely to care (or ever > > > > will). > > > > > > It's not just the memory, it's also the complexity, e.g. to get VMCS shadowing > > > working correctly, both now and in the future. > > > > As far as CPU feature virtualization goes, this one doesn't seem that > > complex to me. It's not anywhere near as complex as virtualizing MTF, > > for instance, and KVM *claims* to do that! :-) > > There aren't many things as complex as MTF. But unlike MTF, this behavior doesn't > have a concrete use case to justify the risk vs. reward. 
IMO the odds of us breaking > something in KVM for "normal" use cases are higher than the odds of an L1 VMM breaking > because a VMREAD/VMWRITE didn't fail when it technically should have failed. Playing devil's advocate here, because I totally agree with you... Who's to say what's "normal"? It's a slippery slope when we start making personal value judgments about which parts of the architectural specification are important and which aren't.
On Tue, Oct 05, 2021, Jim Mattson wrote: > On Tue, Oct 5, 2021 at 1:50 PM Sean Christopherson <seanjc@google.com> wrote: > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > On Tue, Oct 5, 2021 at 10:59 AM Sean Christopherson <seanjc@google.com> wrote: > > > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > > On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson <seanjc@google.com> wrote: > > > > > > > > > > > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > > > > > > On Fri, 2021-09-03 at 15:11 +0000, Sean Christopherson wrote: > > > > > > > You also said, "This is quite the complicated mess for > > > > > > > something I'm guessing no one actually cares about. At what point do > > > > > > > we chalk this up as a virtualization hole and sweep it under the rug?" > > > > > > > -- I couldn't agree more. > > > > > > > > > > > > ... > > > > > > > > > > > > > So, Sean, can you help converge our discussion and settle next step? > > > > > > > > > > > > Any objection to simply keeping KVM's current behavior, i.e. sweeping this under > > > > > > the proverbial rug? > > > > > > > > > > Adding 8 KiB per vCPU seems like no big deal to me, but, on the other > > > > > hand, Paolo recently argued that slightly less than 1 KiB per vCPU was > > > > > unreasonable for VM-exit statistics, so maybe I've got a warped > > > > > perspective. I'm all for pedantic adherence to the specification, but > > > > > I have to admit that no actual hypervisor is likely to care (or ever > > > > > will). > > > > > > > > It's not just the memory, it's also the complexity, e.g. to get VMCS shadowing > > > > working correctly, both now and in the future. > > > > > > As far as CPU feature virtualization goes, this one doesn't seem that > > > complex to me. It's not anywhere near as complex as virtualizing MTF, > > > for instance, and KVM *claims* to do that! :-) > > > > There aren't many things as complex as MTF. 
But unlike MTF, this behavior doesn't > > have a concrete use case to justify the risk vs. reward. IMO the odds of us breaking > > something in KVM for "normal" use cases are higher than the odds of an L1 VMM breaking > > because a VMREAD/VMWRITE didn't fail when it technically should have failed. > > Playing devil's advocate here, because I totally agree with you... > > Who's to say what's "normal"? It's a slippery slope when we start > making personal value judgments about which parts of the architectural > specification are important and which aren't. I agree, but in a very similar case Intel chose to take an erratum instead of fixing what was in all likelihood a microcode bug, i.e. could have been patched in the field. So it's not _just_ personal value judgment, though it's definitely that too :-) I'm not saying I'd actively oppose support for strict VMREAD/VMWRITE adherence to the vCPU model, but I'm also not going to advise anyone to go spend their time implementing a non-trivial fix for behavior that, AFAIK, doesn't adversely affect any real world use cases.
On Tue, Oct 05, 2021 at 11:22:15PM +0000, Sean Christopherson wrote: > On Tue, Oct 05, 2021, Jim Mattson wrote: > > On Tue, Oct 5, 2021 at 1:50 PM Sean Christopherson <seanjc@google.com> wrote: > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > On Tue, Oct 5, 2021 at 10:59 AM Sean Christopherson <seanjc@google.com> wrote: > > > > > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > > > On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson <seanjc@google.com> wrote: > > > > > > > > > > > > > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > > > > > > > On Fri, 2021-09-03 at 15:11 +0000, Sean Christopherson wrote: > > > > > > > > You also said, "This is quite the complicated mess for > > > > > > > > something I'm guessing no one actually cares about. At what point do > > > > > > > > we chalk this up as a virtualization hole and sweep it under the rug?" > > > > > > > > -- I couldn't agree more. > > > > > > > > > > > > > > ... > > > > > > > > > > > > > > > So, Sean, can you help converge our discussion and settle next step? > > > > > > > > > > > > > > Any objection to simply keeping KVM's current behavior, i.e. sweeping this under > > > > > > > the proverbial rug? > > > > > > > > > > > > Adding 8 KiB per vCPU seems like no big deal to me, but, on the other > > > > > > hand, Paolo recently argued that slightly less than 1 KiB per vCPU was > > > > > > unreasonable for VM-exit statistics, so maybe I've got a warped > > > > > > perspective. I'm all for pedantic adherence to the specification, but > > > > > > I have to admit that no actual hypervisor is likely to care (or ever > > > > > > will). > > > > > > > > > > It's not just the memory, it's also the complexity, e.g. to get VMCS shadowing > > > > > working correctly, both now and in the future. > > > > > > > > As far as CPU feature virtualization goes, this one doesn't seem that > > > > complex to me. It's not anywhere near as complex as virtualizing MTF, > > > > for instance, and KVM *claims* to do that! 
:-) > > > > > > There aren't many things as complex as MTF. But unlike MTF, this behavior doesn't > > > have a concrete use case to justify the risk vs. reward. IMO the odds of us breaking > > > something in KVM for "normal" use cases are higher than the odds of an L1 VMM breaking > > > because a VMREAD/VMWRITE didn't fail when it technically should have failed. > > > > Playing devil's advocate here, because I totally agree with you... > > > > Who's to say what's "normal"? It's a slippery slope when we start > > making personal value judgments about which parts of the architectural > > specification are important and which aren't. > > I agree, but in a very similar case Intel chose to take an erratum instead of > fixing what was in all likelihood a microcode bug, i.e. could have been patched > in the field. So it's not _just_ personal value judgment, though it's definitely > that too :-) > > I'm not saying I'd actively oppose support for strict VMREAD/VMWRITE adherence > to the vCPU model, but I'm also not going to advise anyone to go spend their time > implementing a non-trivial fix for behavior that, AFAIK, doesn't adversely affect > any real world use cases. > Thank you all for the discussion, Sean & Jim. Can we conclude that KVM should simply be kept as it is now? If yes, how about we deprecate the check against the max index value from MSR_IA32_VMX_VMCS_ENUM in vmx.c of kvm-unit-tests? After all, we have not witnessed any real system doing so. E.g., diff --git a/x86/vmx.c b/x86/vmx.c index f0b853a..63623e5 100644 --- a/x86/vmx.c +++ b/x86/vmx.c @@ -380,8 +380,7 @@ static void test_vmwrite_vmread(void) vmcs_enum_max = (rdmsr(MSR_IA32_VMX_VMCS_ENUM) & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT; max_index = find_vmcs_max_index(); - report(vmcs_enum_max == max_index, - "VMX_VMCS_ENUM.MAX_INDEX expected: %x, actual: %x", + printf("VMX_VMCS_ENUM.MAX_INDEX expected: %x, actual: %x", max_index, vmcs_enum_max); assert(!vmcs_clear(vmcs)); B.R. Yu
On Fri, 2021-10-08 at 16:23 +0800, Yu Zhang wrote: > On Tue, Oct 05, 2021 at 11:22:15PM +0000, Sean Christopherson wrote: > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > On Tue, Oct 5, 2021 at 1:50 PM Sean Christopherson < > > > seanjc@google.com> wrote: > > > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > > On Tue, Oct 5, 2021 at 10:59 AM Sean Christopherson < > > > > > seanjc@google.com> wrote: > > > > > > > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > > > > On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson < > > > > > > > seanjc@google.com> wrote: > > > > > > > > > > > > > > > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > > > > > > > > On Fri, 2021-09-03 at 15:11 +0000, Sean > > > > > > > > > Christopherson wrote: > > > > > > > > > You also said, "This is quite the complicated > > > > > > > > > mess for > > > > > > > > > something I'm guessing no one actually cares > > > > > > > > > about. At what point do > > > > > > > > > we chalk this up as a virtualization hole and sweep > > > > > > > > > it under the rug?" > > > > > > > > > -- I couldn't agree more. > > > > > > > > > > > > > > > > ... > > > > > > > > > > > > > > > > > So, Sean, can you help converge our discussion and > > > > > > > > > settle next step? > > > > > > > > > > > > > > > > Any objection to simply keeping KVM's current behavior, > > > > > > > > i.e. sweeping this under > > > > > > > > the proverbial rug? > > > > > > > > > > > > > > Adding 8 KiB per vCPU seems like no big deal to me, but, > > > > > > > on the other > > > > > > > hand, Paolo recently argued that slightly less than 1 KiB > > > > > > > per vCPU was > > > > > > > unreasonable for VM-exit statistics, so maybe I've got a > > > > > > > warped > > > > > > > perspective. I'm all for pedantic adherence to the > > > > > > > specification, but > > > > > > > I have to admit that no actual hypervisor is likely to > > > > > > > care (or ever > > > > > > > will). 
> > > > > > > > > > > > It's not just the memory, it's also the complexity, e.g. to > > > > > > get VMCS shadowing > > > > > > working correctly, both now and in the future. > > > > > > > > > > As far as CPU feature virtualization goes, this one doesn't > > > > > seem that > > > > > complex to me. It's not anywhere near as complex as > > > > > virtualizing MTF, > > > > > for instance, and KVM *claims* to do that! :-) > > > > > > > > There aren't many things as complex as MTF. But unlike MTF, > > > > this behavior doesn't > > > > have a concrete use case to justify the risk vs. reward. IMO > > > > the odds of us breaking > > > > something in KVM for "normal" use cases are higher than the > > > > odds of an L1 VMM breaking > > > > because a VMREAD/VMWRITE didn't fail when it technically should > > > > have failed. > > > > > > Playing devil's advocate here, because I totally agree with > > > you... > > > > > > Who's to say what's "normal"? It's a slippery slope when we start > > > making personal value judgments about which parts of the > > > architectural > > > specification are important and which aren't. > > > > I agree, but in a very similar case Intel chose to take an erratum > > instead of > > fixing what was in all likelihood a microcode bug, i.e. could have > > been patched > > in the field. So it's not _just_ personal value judgment, though > > it's definitely > > that too :-) > > > > I'm not saying I'd actively oppose support for strict > > VMREAD/VMWRITE adherence > > to the vCPU model, but I'm also not going to advise anyone to go > > spend their time > > implementing a non-trivial fix for behavior that, AFAIK, doesn't > > adversely affect > > any real world use cases. > > > > Thank you all for the discussion, Sean & Jim. > > Could we draw a conclusion to just keep KVM as it is now? If yes, how > about we > depricate the check against max index value from > MSR_IA32_VMX_VMCS_ENUM in vmx.c > of the kvm-unit-test? 
> > After all, we have not witnessed any real system doing so. > > E.g., > > diff --git a/x86/vmx.c b/x86/vmx.c > index f0b853a..63623e5 100644 > --- a/x86/vmx.c > +++ b/x86/vmx.c > @@ -380,8 +380,7 @@ static void test_vmwrite_vmread(void) > vmcs_enum_max = (rdmsr(MSR_IA32_VMX_VMCS_ENUM) & > VMCS_FIELD_INDEX_MASK) > >> VMCS_FIELD_INDEX_SHIFT; > max_index = find_vmcs_max_index(); > - report(vmcs_enum_max == max_index, > - "VMX_VMCS_ENUM.MAX_INDEX expected: %x, actual: %x", > + printf("VMX_VMCS_ENUM.MAX_INDEX expected: %x, actual: %x", > max_index, vmcs_enum_max); > > assert(!vmcs_clear(vmcs)); > > B.R. > Yu I think this patch series has its value of fixing the be-forced hard- code VMX_VMCS_ENUM. My understanding of Sean's "simply keeping KVM's current behavior, i.e. sweeping this under the proverbial rug", is about vmcs shadowing will fail some VMCS field validation. Of course, this in turn will fail some case of this KVM unit test case (theoretically), though we haven't met yet.
We have some internal patches for virtualizing VMCS shadowing which may break if there is a guest VMCS field with index greater than VMX_VMCS_ENUM.MAX_INDEX. I plan to upstream them soon. On Fri, Oct 8, 2021 at 8:09 AM Robert Hoo <robert.hu@linux.intel.com> wrote: > > On Fri, 2021-10-08 at 16:23 +0800, Yu Zhang wrote: > > On Tue, Oct 05, 2021 at 11:22:15PM +0000, Sean Christopherson wrote: > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > On Tue, Oct 5, 2021 at 1:50 PM Sean Christopherson < > > > > seanjc@google.com> wrote: > > > > > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > > > On Tue, Oct 5, 2021 at 10:59 AM Sean Christopherson < > > > > > > seanjc@google.com> wrote: > > > > > > > > > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > > > > > On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson < > > > > > > > > seanjc@google.com> wrote: > > > > > > > > > > > > > > > > > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > > > > > > > > > On Fri, 2021-09-03 at 15:11 +0000, Sean > > > > > > > > > > Christopherson wrote: > > > > > > > > > > You also said, "This is quite the complicated > > > > > > > > > > mess for > > > > > > > > > > something I'm guessing no one actually cares > > > > > > > > > > about. At what point do > > > > > > > > > > we chalk this up as a virtualization hole and sweep > > > > > > > > > > it under the rug?" > > > > > > > > > > -- I couldn't agree more. > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > > > > > > > > > > So, Sean, can you help converge our discussion and > > > > > > > > > > settle next step? > > > > > > > > > > > > > > > > > > Any objection to simply keeping KVM's current behavior, > > > > > > > > > i.e. sweeping this under > > > > > > > > > the proverbial rug? 
> > > > > > > > > > > > > > > > Adding 8 KiB per vCPU seems like no big deal to me, but, > > > > > > > > on the other > > > > > > > > hand, Paolo recently argued that slightly less than 1 KiB > > > > > > > > per vCPU was > > > > > > > > unreasonable for VM-exit statistics, so maybe I've got a > > > > > > > > warped > > > > > > > > perspective. I'm all for pedantic adherence to the > > > > > > > > specification, but > > > > > > > > I have to admit that no actual hypervisor is likely to > > > > > > > > care (or ever > > > > > > > > will). > > > > > > > > > > > > > > It's not just the memory, it's also the complexity, e.g. to > > > > > > > get VMCS shadowing > > > > > > > working correctly, both now and in the future. > > > > > > > > > > > > As far as CPU feature virtualization goes, this one doesn't > > > > > > seem that > > > > > > complex to me. It's not anywhere near as complex as > > > > > > virtualizing MTF, > > > > > > for instance, and KVM *claims* to do that! :-) > > > > > > > > > > There aren't many things as complex as MTF. But unlike MTF, > > > > > this behavior doesn't > > > > > have a concrete use case to justify the risk vs. reward. IMO > > > > > the odds of us breaking > > > > > something in KVM for "normal" use cases are higher than the > > > > > odds of an L1 VMM breaking > > > > > because a VMREAD/VMWRITE didn't fail when it technically should > > > > > have failed. > > > > > > > > Playing devil's advocate here, because I totally agree with > > > > you... > > > > > > > > Who's to say what's "normal"? It's a slippery slope when we start > > > > making personal value judgments about which parts of the > > > > architectural > > > > specification are important and which aren't. > > > > > > I agree, but in a very similar case Intel chose to take an erratum > > > instead of > > > fixing what was in all likelihood a microcode bug, i.e. could have > > > been patched > > > in the field. 
So it's not _just_ personal value judgment, though > > > it's definitely > > > that too :-) > > > > > > I'm not saying I'd actively oppose support for strict > > > VMREAD/VMWRITE adherence > > > to the vCPU model, but I'm also not going to advise anyone to go > > > spend their time > > > implementing a non-trivial fix for behavior that, AFAIK, doesn't > > > adversely affect > > > any real world use cases. > > > > > > > Thank you all for the discussion, Sean & Jim. > > > > Could we draw a conclusion to just keep KVM as it is now? If yes, how > > about we > > depricate the check against max index value from > > MSR_IA32_VMX_VMCS_ENUM in vmx.c > > of the kvm-unit-test? > > > > After all, we have not witnessed any real system doing so. > > > > E.g., > > > > diff --git a/x86/vmx.c b/x86/vmx.c > > index f0b853a..63623e5 100644 > > --- a/x86/vmx.c > > +++ b/x86/vmx.c > > @@ -380,8 +380,7 @@ static void test_vmwrite_vmread(void) > > vmcs_enum_max = (rdmsr(MSR_IA32_VMX_VMCS_ENUM) & > > VMCS_FIELD_INDEX_MASK) > > >> VMCS_FIELD_INDEX_SHIFT; > > max_index = find_vmcs_max_index(); > > - report(vmcs_enum_max == max_index, > > - "VMX_VMCS_ENUM.MAX_INDEX expected: %x, actual: %x", > > + printf("VMX_VMCS_ENUM.MAX_INDEX expected: %x, actual: %x", > > max_index, vmcs_enum_max); > > > > assert(!vmcs_clear(vmcs)); > > > > B.R. > > Yu > > I think this patch series has its value of fixing the be-forced hard- > code VMX_VMCS_ENUM. > My understanding of Sean's "simply keeping KVM's current behavior, i.e. > sweeping this under the proverbial rug", is about vmcs shadowing will > fail some VMCS field validation. Of course, this in turn will fail some > case of this KVM unit test case (theoretically), though we haven't met > yet. > >
On Fri, 2021-10-08 at 16:49 -0700, Jim Mattson wrote: > We have some internal patches for virtualizing VMCS shadowing which > may break if there is a guest VMCS field with index greater than > VMX_VMCS_ENUM.MAX_INDEX. I plan to upstream them soon. OK, thanks for letting us know.:-) > > On Fri, Oct 8, 2021 at 8:09 AM Robert Hoo <robert.hu@linux.intel.com> > wrote: > > > > On Fri, 2021-10-08 at 16:23 +0800, Yu Zhang wrote: > > > On Tue, Oct 05, 2021 at 11:22:15PM +0000, Sean Christopherson > > > wrote: > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > > On Tue, Oct 5, 2021 at 1:50 PM Sean Christopherson < > > > > > seanjc@google.com> wrote: > > > > > > > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > > > > On Tue, Oct 5, 2021 at 10:59 AM Sean Christopherson < > > > > > > > seanjc@google.com> wrote: > > > > > > > > > > > > > > > > On Tue, Oct 05, 2021, Jim Mattson wrote: > > > > > > > > > On Tue, Oct 5, 2021 at 9:16 AM Sean Christopherson < > > > > > > > > > seanjc@google.com> wrote: > > > > > > > > > > > > > > > > > > > > On Tue, Sep 28, 2021, Robert Hoo wrote: > > > > > > > > > > > On Fri, 2021-09-03 at 15:11 +0000, Sean > > > > > > > > > > > Christopherson wrote: > > > > > > > > > > > You also said, "This is quite the > > > > > > > > > > > complicated > > > > > > > > > > > mess for > > > > > > > > > > > something I'm guessing no one actually cares > > > > > > > > > > > about. At what point do > > > > > > > > > > > we chalk this up as a virtualization hole and > > > > > > > > > > > sweep > > > > > > > > > > > it under the rug?" > > > > > > > > > > > -- I couldn't agree more. > > > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > > > > > > > > > > > > So, Sean, can you help converge our discussion > > > > > > > > > > > and > > > > > > > > > > > settle next step? > > > > > > > > > > > > > > > > > > > > Any objection to simply keeping KVM's current > > > > > > > > > > behavior, > > > > > > > > > > i.e. 
sweeping this under > > > > > > > > > > the proverbial rug? > > > > > > > > > > > > > > > > > > Adding 8 KiB per vCPU seems like no big deal to me, > > > > > > > > > but, > > > > > > > > > on the other > > > > > > > > > hand, Paolo recently argued that slightly less than 1 > > > > > > > > > KiB > > > > > > > > > per vCPU was > > > > > > > > > unreasonable for VM-exit statistics, so maybe I've > > > > > > > > > got a > > > > > > > > > warped > > > > > > > > > perspective. I'm all for pedantic adherence to the > > > > > > > > > specification, but > > > > > > > > > I have to admit that no actual hypervisor is likely > > > > > > > > > to > > > > > > > > > care (or ever > > > > > > > > > will). > > > > > > > > > > > > > > > > It's not just the memory, it's also the complexity, > > > > > > > > e.g. to > > > > > > > > get VMCS shadowing > > > > > > > > working correctly, both now and in the future. > > > > > > > > > > > > > > As far as CPU feature virtualization goes, this one > > > > > > > doesn't > > > > > > > seem that > > > > > > > complex to me. It's not anywhere near as complex as > > > > > > > virtualizing MTF, > > > > > > > for instance, and KVM *claims* to do that! :-) > > > > > > > > > > > > There aren't many things as complex as MTF. But unlike > > > > > > MTF, > > > > > > this behavior doesn't > > > > > > have a concrete use case to justify the risk vs. > > > > > > reward. IMO > > > > > > the odds of us breaking > > > > > > something in KVM for "normal" use cases are higher than the > > > > > > odds of an L1 VMM breaking > > > > > > because a VMREAD/VMWRITE didn't fail when it technically > > > > > > should > > > > > > have failed. > > > > > > > > > > Playing devil's advocate here, because I totally agree with > > > > > you... > > > > > > > > > > Who's to say what's "normal"? 
It's a slippery slope when we > > > > > start > > > > > making personal value judgments about which parts of the > > > > > architectural > > > > > specification are important and which aren't. > > > > > > > > I agree, but in a very similar case Intel chose to take an > > > > erratum > > > > instead of > > > > fixing what was in all likelihood a microcode bug, i.e. could > > > > have > > > > been patched > > > > in the field. So it's not _just_ personal value judgment, > > > > though > > > > it's definitely > > > > that too :-) > > > > > > > > I'm not saying I'd actively oppose support for strict > > > > VMREAD/VMWRITE adherence > > > > to the vCPU model, but I'm also not going to advise anyone to > > > > go > > > > spend their time > > > > implementing a non-trivial fix for behavior that, AFAIK, > > > > doesn't > > > > adversely affect > > > > any real world use cases. > > > > > > > > > > Thank you all for the discussion, Sean & Jim. > > > > > > Could we draw a conclusion to just keep KVM as it is now? If yes, > > > how > > > about we > > > depricate the check against max index value from > > > MSR_IA32_VMX_VMCS_ENUM in vmx.c > > > of the kvm-unit-test? > > > > > > After all, we have not witnessed any real system doing so. > > > > > > E.g., > > > > > > diff --git a/x86/vmx.c b/x86/vmx.c > > > index f0b853a..63623e5 100644 > > > --- a/x86/vmx.c > > > +++ b/x86/vmx.c > > > @@ -380,8 +380,7 @@ static void test_vmwrite_vmread(void) > > > vmcs_enum_max = (rdmsr(MSR_IA32_VMX_VMCS_ENUM) & > > > VMCS_FIELD_INDEX_MASK) > > > >> VMCS_FIELD_INDEX_SHIFT; > > > max_index = find_vmcs_max_index(); > > > - report(vmcs_enum_max == max_index, > > > - "VMX_VMCS_ENUM.MAX_INDEX expected: %x, actual: > > > %x", > > > + printf("VMX_VMCS_ENUM.MAX_INDEX expected: %x, actual: > > > %x", > > > max_index, vmcs_enum_max); > > > > > > assert(!vmcs_clear(vmcs)); > > > > > > B.R. > > > Yu > > > > I think this patch series has its value of fixing the be-forced > > hard- > > code VMX_VMCS_ENUM. 
> > My understanding of Sean's "simply keeping KVM's current behavior, > > i.e. > > sweeping this under the proverbial rug", is about vmcs shadowing > > will > > fail some VMCS field validation. Of course, this in turn will fail > > some > > case of this KVM unit test case (theoretically), though we haven't > > met > > yet. > > > >
On Fri, Oct 8, 2021 at 5:05 PM Robert Hoo <robert.hu@linux.intel.com> wrote: > > On Fri, 2021-10-08 at 16:49 -0700, Jim Mattson wrote: > > We have some internal patches for virtualizing VMCS shadowing which > > may break if there is a guest VMCS field with index greater than > > VMX_VMCS_ENUM.MAX_INDEX. I plan to upstream them soon. > > OK, thanks for letting us know.:-) After careful consideration, we're actually going to drop these patches rather than sending them upstream.
On Fri, 2021-10-29 at 12:53 -0700, Jim Mattson wrote: > On Fri, Oct 8, 2021 at 5:05 PM Robert Hoo <robert.hu@linux.intel.com> > wrote: > > > > On Fri, 2021-10-08 at 16:49 -0700, Jim Mattson wrote: > > > We have some internal patches for virtualizing VMCS shadowing > > > which > > > may break if there is a guest VMCS field with index greater than > > > VMX_VMCS_ENUM.MAX_INDEX. I plan to upstream them soon. > > > > OK, thanks for letting us know.:-) > > After careful consideration, we're actually going to drop these > patches rather than sending them upstream. OK. Hi, Paolo, Sean and Jim, Do you think this patch series of ours is still needed, or can it be dropped as well? Thanks.
On Wed, Nov 03, 2021, Robert Hoo wrote: > On Fri, 2021-10-29 at 12:53 -0700, Jim Mattson wrote: > > On Fri, Oct 8, 2021 at 5:05 PM Robert Hoo <robert.hu@linux.intel.com> > > wrote: > > > > > > On Fri, 2021-10-08 at 16:49 -0700, Jim Mattson wrote: > > > > We have some internal patches for virtualizing VMCS shadowing > > > > which > > > > may break if there is a guest VMCS field with index greater than > > > > VMX_VMCS_ENUM.MAX_INDEX. I plan to upstream them soon. > > > > > > OK, thanks for letting us know.:-) > > > > After careful consideration, we're actually going to drop these > > patches rather than sending them upstream. > > OK. > > Hi, Paolo, Sean and Jim, > > Do you think our this series patch are still needed or can be dropped > as well? IMO we should drop this series and take our own erratum.
On Tue, Nov 09, 2021 at 10:33:43PM +0000, Sean Christopherson wrote: > On Wed, Nov 03, 2021, Robert Hoo wrote: > > On Fri, 2021-10-29 at 12:53 -0700, Jim Mattson wrote: > > > On Fri, Oct 8, 2021 at 5:05 PM Robert Hoo <robert.hu@linux.intel.com> > > > wrote: > > > > > > > > On Fri, 2021-10-08 at 16:49 -0700, Jim Mattson wrote: > > > > > We have some internal patches for virtualizing VMCS shadowing > > > > > which > > > > > may break if there is a guest VMCS field with index greater than > > > > > VMX_VMCS_ENUM.MAX_INDEX. I plan to upstream them soon. > > > > > > > > OK, thanks for letting us know.:-) > > > > > > After careful consideration, we're actually going to drop these > > > patches rather than sending them upstream. > > > > OK. > > > > Hi, Paolo, Sean and Jim, > > > > Do you think our this series patch are still needed or can be dropped > > as well? > > IMO we should drop this series and take our own erratum. > Thanks, Sean. Do we need a patch in kvm-unit-tests to deprecate the check against the max index from MSR_IA32_VMX_VMCS_ENUM? B.R. Yu
On Wed, Nov 10, 2021, Yu Zhang wrote: > On Tue, Nov 09, 2021 at 10:33:43PM +0000, Sean Christopherson wrote: > > On Wed, Nov 03, 2021, Robert Hoo wrote: > > > On Fri, 2021-10-29 at 12:53 -0700, Jim Mattson wrote: > > > > On Fri, Oct 8, 2021 at 5:05 PM Robert Hoo <robert.hu@linux.intel.com> > > > > wrote: > > > > > > > > > > On Fri, 2021-10-08 at 16:49 -0700, Jim Mattson wrote: > > > > > > We have some internal patches for virtualizing VMCS shadowing > > > > > > which > > > > > > may break if there is a guest VMCS field with index greater than > > > > > > VMX_VMCS_ENUM.MAX_INDEX. I plan to upstream them soon. > > > > > > > > > > OK, thanks for letting us know.:-) > > > > > > > > After careful consideration, we're actually going to drop these > > > > patches rather than sending them upstream. > > > > > > OK. > > > > > > Hi, Paolo, Sean and Jim, > > > > > > Do you think our this series patch are still needed or can be dropped > > > as well? > > > > IMO we should drop this series and take our own erratum. > > > > Thanks, Sean. > > Do we need a patch in kvm-unit-test to depricate the check against > the max index from MSR_IA32_VMX_VMCS_ENUM? Hmm, yes, unless there's an easy way to tell QEMU to not override the VMX MSRs. I don't see any point in fighting too hard with QEMU.
On Thu, 2021-11-18 at 01:19 +0000, Sean Christopherson wrote: > On Wed, Nov 10, 2021, Yu Zhang wrote: > > On Tue, Nov 09, 2021 at 10:33:43PM +0000, Sean Christopherson > > wrote: > > > On Wed, Nov 03, 2021, Robert Hoo wrote: > > > > On Fri, 2021-10-29 at 12:53 -0700, Jim Mattson wrote: > > > > > On Fri, Oct 8, 2021 at 5:05 PM Robert Hoo < > > > > > robert.hu@linux.intel.com> > > > > > wrote: > > > > > > > > > > > > On Fri, 2021-10-08 at 16:49 -0700, Jim Mattson wrote: > > > > > > > We have some internal patches for virtualizing VMCS > > > > > > > shadowing > > > > > > > which > > > > > > > may break if there is a guest VMCS field with index > > > > > > > greater than > > > > > > > VMX_VMCS_ENUM.MAX_INDEX. I plan to upstream them soon. > > > > > > > > > > > > OK, thanks for letting us know.:-) > > > > > > > > > > After careful consideration, we're actually going to drop > > > > > these > > > > > patches rather than sending them upstream. > > > > > > > > OK. > > > > > > > > Hi, Paolo, Sean and Jim, > > > > > > > > Do you think our this series patch are still needed or can be > > > > dropped > > > > as well? > > > > > > IMO we should drop this series and take our own erratum. > > > > > > > Thanks, Sean. > > > > Do we need a patch in kvm-unit-test to depricate the check against > > the max index from MSR_IA32_VMX_VMCS_ENUM? > > Hmm, yes, unless there's an easy way to tell QEMU to not override the > VMX MSRs. > I don't see any point in fighting too hard with QEMU. OK. I just sent out the kvm-unit-tests patch. Copied from last mail from Yu. https://lore.kernel.org/kvm/1637306107-92967-1-git-send-email-robert.hu@linux.intel.com/T/#u
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b8121f8f6d96..9a35953ede22 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1547,7 +1547,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < max_shadow_read_write_fields; i++) { field = shadow_read_write_fields[i]; val = __vmcs_readl(field.encoding); - vmcs12_write_any(vmcs12, field.encoding, field.offset, val); + vmcs12_write_any(vmcs12, field.encoding, field.offset, val, + vmx->nested.vmcs12_field_existence_bitmap); } vmcs_clear(shadow_vmcs); @@ -1580,8 +1581,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; - val = vmcs12_read_any(vmcs12, field.encoding, - field.offset); + vmcs12_read_any(vmcs12, field.encoding, + field.offset, &val, + vmx->nested.vmcs12_field_existence_bitmap); __vmcs_writel(field.encoding, val); } } @@ -5070,7 +5072,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; unsigned long field; - u64 value; + unsigned long value; gva_t gva = 0; short offset; int len, r; @@ -5098,7 +5100,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); /* Read the field, zero-extended to a u64 value */ - value = vmcs12_read_any(vmcs12, field, offset); + r = vmcs12_read_any(vmcs12, field, offset, &value, + vmx->nested.vmcs12_field_existence_bitmap); + if (r < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* * Now copy part of this value to register or memory, as requested. 
@@ -5223,7 +5228,10 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) value &= 0x1f0ff; - vmcs12_write_any(vmcs12, field, offset, value); + r = vmcs12_write_any(vmcs12, field, offset, value, + vmx->nested.vmcs12_field_existence_bitmap); + if (r < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* * Do not track vmcs12 dirty-state if in guest-mode as we actually diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 5c39370dff3c..9ac3d6ac1b6b 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -413,31 +413,51 @@ static inline short vmcs_field_to_offset(unsigned long field) #undef ROL16 -static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, - u16 offset) +static inline int vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, + u16 offset, unsigned long *value, unsigned long *bitmap) { char *p = (char *)vmcs12 + offset; + if (unlikely(bitmap == NULL)) { + pr_err_once("vmcs12 read: NULL bitmap"); + return -EINVAL; + } + if (!test_bit(offset / sizeof(u16), bitmap)) + return -ENOENT; + switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: - return *((natural_width *)p); + *value = *((natural_width *)p); + break; case VMCS_FIELD_WIDTH_U16: - return *((u16 *)p); + *value = *((u16 *)p); + break; case VMCS_FIELD_WIDTH_U32: - return *((u32 *)p); + *value = *((u32 *)p); + break; case VMCS_FIELD_WIDTH_U64: - return *((u64 *)p); + *value = *((u64 *)p); + break; default: WARN_ON_ONCE(1); - return -1; + return -ENOENT; } + + return 0; } -static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, - u16 offset, u64 field_value) +static inline int vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, + u16 offset, u64 field_value, unsigned long *bitmap) { char *p = (char *)vmcs12 + offset; + if (unlikely(bitmap == NULL)) { + pr_err_once("%s: NULL bitmap", __func__); + return -EINVAL; 
+ } + if (!test_bit(offset / sizeof(u16), bitmap)) + return -ENOENT; + switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_U16: *(u16 *)p = field_value; @@ -453,8 +473,11 @@ static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, break; default: WARN_ON_ONCE(1); - break; + return -ENOENT; } + + return 0; } + #endif /* __KVM_X86_VMX_VMCS12_H */