
[v19,110/130] KVM: TDX: Handle TDX PV MMIO hypercall

Message ID a4421e0f2eafc17b4703c920936e32489d2382a3.1708933498.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Series [v19,001/130] x86/virt/tdx: Rename _offset to _member for TD_SYSINFO_MAP() macro

Commit Message

Isaku Yamahata Feb. 26, 2024, 8:26 a.m. UTC
From: Sean Christopherson <sean.j.christopherson@intel.com>

Export kvm_io_bus_read() and the kvm_mmio tracepoint, and wire up the TDX
PV MMIO hypercall to the KVM backend functions.

kvm_io_bus_read/write() searches the in-kernel emulated devices for the
given MMIO address and emulates the access.  As TDX PV MMIO also needs
this, export kvm_io_bus_read(); kvm_io_bus_write() is already exported.
TDX PV MMIO emulates some MMIO itself, so export the kvm_mmio tracepoint
to keep tracing consistent with the rest of x86 KVM.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/tdx.c | 114 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c     |   1 +
 virt/kvm/kvm_main.c    |   2 +
 3 files changed, 117 insertions(+)
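
For context, a minimal sketch of the guest side of this hypercall,
modeled on the Linux TDX guest's #VE MMIO path.  The helpers
(struct tdx_hypercall_args, __tdx_hypercall(), hcall_func(),
TDX_HYPERCALL_STANDARD) follow the Linux guest convention and are
assumptions here, not part of this patch; the register mapping mirrors
what tdx_emulate_mmio() reads.

/*
 * Guest-side sketch (illustrative only): issue TDG.VP.VMCALL<MMIO> for
 * a write.  The argument registers map to what tdx_emulate_mmio()
 * reads: a0 (r12) = size, a1 (r13) = direction, a2 (r14) = GPA with
 * the shared bit set, a3 (r15) = value.
 */
static u64 tdvmcall_mmio_write(phys_addr_t gpa, int size, u64 val)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
		.r12 = size,	/* 1, 2, 4 or 8 bytes */
		.r13 = 1,	/* 0 = read, 1 = write */
		.r14 = gpa,	/* shared GPA of the MMIO register */
		.r15 = val,	/* value to store */
	};

	return __tdx_hypercall(&args);
}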

Comments

Binbin Wu April 18, 2024, 9:29 a.m. UTC | #1
On 2/26/2024 4:26 PM, isaku.yamahata@intel.com wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
>
> Export kvm_io_bus_read() and the kvm_mmio tracepoint, and wire up the TDX
> PV MMIO hypercall to the KVM backend functions.
>
> kvm_io_bus_read/write() searches the in-kernel emulated devices for the
> given MMIO address and emulates the access.  As TDX PV MMIO also needs
> this, export kvm_io_bus_read(); kvm_io_bus_write() is already exported.
> TDX PV MMIO emulates some MMIO itself, so export the kvm_mmio tracepoint
> to keep tracing consistent with the rest of x86 KVM.
>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   arch/x86/kvm/vmx/tdx.c | 114 +++++++++++++++++++++++++++++++++++++++++
>   arch/x86/kvm/x86.c     |   1 +
>   virt/kvm/kvm_main.c    |   2 +
>   3 files changed, 117 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 55fc6cc6c816..389bb95d2af0 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -1217,6 +1217,118 @@ static int tdx_emulate_io(struct kvm_vcpu *vcpu)
>   	return ret;
>   }
>   
> +static int tdx_complete_mmio(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long val = 0;
> +	gpa_t gpa;
> +	int size;
> +
> +	KVM_BUG_ON(vcpu->mmio_needed != 1, vcpu->kvm);
> +	vcpu->mmio_needed = 0;
> +
> +	if (!vcpu->mmio_is_write) {
> +		gpa = vcpu->mmio_fragments[0].gpa;
> +		size = vcpu->mmio_fragments[0].len;
> +
> +		memcpy(&val, vcpu->run->mmio.data, size);
> +		tdvmcall_set_return_val(vcpu, val);
> +		trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
> +	}
> +	return 1;
> +}
> +
> +static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
> +				 unsigned long val)
> +{
> +	if (kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, gpa, size, &val) &&
> +	    kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
> +		return -EOPNOTSUPP;
> +
> +	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
> +	return 0;
> +}
> +
> +static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
> +{
> +	unsigned long val;
> +
> +	if (kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, gpa, size, &val) &&
> +	    kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
> +		return -EOPNOTSUPP;
> +
> +	tdvmcall_set_return_val(vcpu, val);
> +	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
> +	return 0;
> +}
> +
> +static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_memory_slot *slot;
> +	int size, write, r;
> +	unsigned long val;
> +	gpa_t gpa;
> +
> +	KVM_BUG_ON(vcpu->mmio_needed, vcpu->kvm);
> +
> +	size = tdvmcall_a0_read(vcpu);
> +	write = tdvmcall_a1_read(vcpu);
> +	gpa = tdvmcall_a2_read(vcpu);
> +	val = write ? tdvmcall_a3_read(vcpu) : 0;
> +
> +	if (size != 1 && size != 2 && size != 4 && size != 8)
> +		goto error;
> +	if (write != 0 && write != 1)
> +		goto error;
> +
> +	/* Strip the shared bit, allow MMIO with and without it set. */
Based on the discussion at
https://lore.kernel.org/all/ZcUO5sFEAIH68JIA@google.com/
do we still allow MMIO without the shared bit set?

> +	gpa = gpa & ~gfn_to_gpa(kvm_gfn_shared_mask(vcpu->kvm));
> +
> +	if (size > 8u || ((gpa + size - 1) ^ gpa) & PAGE_MASK)
"size > 8u" can be removed, since based on the check of size above, it 
can't be greater than 8.
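
As an aside, the other half of that condition rejects accesses that
straddle a page boundary.  A worked example (editor's sketch, not from
the patch) of how the XOR trick detects the crossing:

/* Not from the patch: worked example of the page-crossing test. */
static bool mmio_crosses_page(gpa_t gpa, int size)
{
	/*
	 * XOR of the addresses of the first and last byte has a bit set
	 * above PAGE_SHIFT exactly when the access spans two pages.
	 */
	return ((gpa + size - 1) ^ gpa) & PAGE_MASK;
}

/*
 * mmio_crosses_page(0xffd, 8): the last byte is at 0x1004, so
 * 0x1004 ^ 0xffd = 0x1ff9, and 0x1ff9 & PAGE_MASK = 0x1000 != 0,
 * i.e. the access is rejected.
 */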


> +		goto error;
> +
> +	slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
> +	if (slot && !(slot->flags & KVM_MEMSLOT_INVALID))
> +		goto error;
> +
> +	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
Should this be checked for write first?

I checked handle_ept_misconfig() in VMX; it doesn't check for write
first either.

Functionally, it should be OK since the guest will not read the address
range of fast MMIO, so the read case will be filtered out by
ioeventfd_write().  But it takes a long path to get to ioeventfd_write().
Wouldn't it be more efficient to check for write first?


> +		trace_kvm_fast_mmio(gpa);
> +		return 1;
> +	}
> +
> +	if (write)
> +		r = tdx_mmio_write(vcpu, gpa, size, val);
> +	else
> +		r = tdx_mmio_read(vcpu, gpa, size);
> +	if (!r) {
> +		/* Kernel completed device emulation. */
> +		tdvmcall_set_return_code(vcpu, TDVMCALL_SUCCESS);
> +		return 1;
> +	}
> +
> +	/* Request the device emulation to userspace device model. */
> +	vcpu->mmio_needed = 1;
> +	vcpu->mmio_is_write = write;
> +	vcpu->arch.complete_userspace_io = tdx_complete_mmio;
> +
> +	vcpu->run->mmio.phys_addr = gpa;
> +	vcpu->run->mmio.len = size;
> +	vcpu->run->mmio.is_write = write;
> +	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> +
> +	if (write) {
> +		memcpy(vcpu->run->mmio.data, &val, size);
> +	} else {
> +		vcpu->mmio_fragments[0].gpa = gpa;
> +		vcpu->mmio_fragments[0].len = size;
> +		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
> +	}
> +	return 0;
> +
> +error:
> +	tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
> +	return 1;
> +}
> +
>   static int handle_tdvmcall(struct kvm_vcpu *vcpu)
>   {
>   	if (tdvmcall_exit_type(vcpu))
> @@ -1229,6 +1341,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
>   		return tdx_emulate_hlt(vcpu);
>   	case EXIT_REASON_IO_INSTRUCTION:
>   		return tdx_emulate_io(vcpu);
> +	case EXIT_REASON_EPT_VIOLATION:
> +		return tdx_emulate_mmio(vcpu);
>   	default:
>   		break;
>   	}
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 03950368d8db..d5b18cad9dcd 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -13975,6 +13975,7 @@ EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
>   
>   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
>   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio);
>   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
>   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
>   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e27c22449d85..bc14e1f2610c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2689,6 +2689,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
>   
>   	return NULL;
>   }
> +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
>   
>   bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
>   {
> @@ -5992,6 +5993,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
>   	r = __kvm_io_bus_read(vcpu, bus, &range, val);
>   	return r < 0 ? r : 0;
>   }
> +EXPORT_SYMBOL_GPL(kvm_io_bus_read);
>   
>   int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>   			    int len, struct kvm_io_device *dev)
Binbin Wu April 18, 2024, 11:04 a.m. UTC | #2
On 4/18/2024 5:29 PM, Binbin Wu wrote:
>
>> +
>> +static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
>> +{
>> +    struct kvm_memory_slot *slot;
>> +    int size, write, r;
>> +    unsigned long val;
>> +    gpa_t gpa;
>> +
>> +    KVM_BUG_ON(vcpu->mmio_needed, vcpu->kvm);
>> +
>> +    size = tdvmcall_a0_read(vcpu);
>> +    write = tdvmcall_a1_read(vcpu);
>> +    gpa = tdvmcall_a2_read(vcpu);
>> +    val = write ? tdvmcall_a3_read(vcpu) : 0;
>> +
>> +    if (size != 1 && size != 2 && size != 4 && size != 8)
>> +        goto error;
>> +    if (write != 0 && write != 1)
>> +        goto error;
>> +
>> +    /* Strip the shared bit, allow MMIO with and without it set. */
> Based on the discussion at
> https://lore.kernel.org/all/ZcUO5sFEAIH68JIA@google.com/
> do we still allow MMIO without the shared bit set?
>
>> +    gpa = gpa & ~gfn_to_gpa(kvm_gfn_shared_mask(vcpu->kvm));
>> +
>> +    if (size > 8u || ((gpa + size - 1) ^ gpa) & PAGE_MASK)
> "size > 8u" can be removed, since based on the check of size above, it 
> can't be greater than 8.
>
>
>> +        goto error;
>> +
>> +    slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
>> +    if (slot && !(slot->flags & KVM_MEMSLOT_INVALID))
>> +        goto error;
>> +
>> +    if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
> Should this be checked for write first?
>
> I checked handle_ept_misconfig() in VMX; it doesn't check for write
> first either.
>
> Functionally, it should be OK since the guest will not read the address
> range of fast MMIO, so the read case will be filtered out by
> ioeventfd_write().  But it takes a long path to get to ioeventfd_write().
> Wouldn't it be more efficient to check for write first?

I see now why handle_ept_misconfig() tries the fast MMIO write without
checking: it was intended to make fast MMIO faster, and for the EPT
misconfig case it's not easy to get the read/write information.

But in this patch we already have the read/write information, so maybe
we can add the check for write before the fast MMIO probe?
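
A rough sketch of that reordering (illustrative only; the follow-up
diff later in the thread structures it slightly differently):

	/* Only writes can hit the fast-MMIO (ioeventfd) path, so probe
	 * KVM_FAST_MMIO_BUS only for writes. */
	if (write && !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
		trace_kvm_fast_mmio(gpa);
		return 1;
	}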


>
>
>> +        trace_kvm_fast_mmio(gpa);
>> +        return 1;
>> +    }
>> +
>> +    if (write)
>> +        r = tdx_mmio_write(vcpu, gpa, size, val);
>> +    else
>> +        r = tdx_mmio_read(vcpu, gpa, size);
>> +    if (!r) {
>> +        /* Kernel completed device emulation. */
>> +        tdvmcall_set_return_code(vcpu, TDVMCALL_SUCCESS);
>> +        return 1;
>> +    }
>> +
>> +    /* Request the device emulation to userspace device model. */
>> +    vcpu->mmio_needed = 1;
>> +    vcpu->mmio_is_write = write;
>> +    vcpu->arch.complete_userspace_io = tdx_complete_mmio;
>> +
>> +    vcpu->run->mmio.phys_addr = gpa;
>> +    vcpu->run->mmio.len = size;
>> +    vcpu->run->mmio.is_write = write;
>> +    vcpu->run->exit_reason = KVM_EXIT_MMIO;
>> +
>> +    if (write) {
>> +        memcpy(vcpu->run->mmio.data, &val, size);
>> +    } else {
>> +        vcpu->mmio_fragments[0].gpa = gpa;
>> +        vcpu->mmio_fragments[0].len = size;
>> +        trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
>> +    }
>> +    return 0;
>> +
>> +error:
>> +    tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
>> +    return 1;
>> +}
>> +
>>   static int handle_tdvmcall(struct kvm_vcpu *vcpu)
>>   {
>>       if (tdvmcall_exit_type(vcpu))
>> @@ -1229,6 +1341,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
>>           return tdx_emulate_hlt(vcpu);
>>       case EXIT_REASON_IO_INSTRUCTION:
>>           return tdx_emulate_io(vcpu);
>> +    case EXIT_REASON_EPT_VIOLATION:
>> +        return tdx_emulate_mmio(vcpu);
>>       default:
>>           break;
>>       }
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 03950368d8db..d5b18cad9dcd 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -13975,6 +13975,7 @@ EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
>>     EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
>>   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
>> +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio);
>>   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
>>   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
>>   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index e27c22449d85..bc14e1f2610c 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -2689,6 +2689,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
>>         return NULL;
>>   }
>> +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
>>     bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
>>   {
>> @@ -5992,6 +5993,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
>>       r = __kvm_io_bus_read(vcpu, bus, &range, val);
>>       return r < 0 ? r : 0;
>>   }
>> +EXPORT_SYMBOL_GPL(kvm_io_bus_read);
>>     int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>>                   int len, struct kvm_io_device *dev)
>
>
Isaku Yamahata April 18, 2024, 9:22 p.m. UTC | #3
On Thu, Apr 18, 2024 at 07:04:11PM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> 
> 
> On 4/18/2024 5:29 PM, Binbin Wu wrote:
> > 
> > > +
> > > +static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
> > > +{
> > > +    struct kvm_memory_slot *slot;
> > > +    int size, write, r;
> > > +    unsigned long val;
> > > +    gpa_t gpa;
> > > +
> > > +    KVM_BUG_ON(vcpu->mmio_needed, vcpu->kvm);
> > > +
> > > +    size = tdvmcall_a0_read(vcpu);
> > > +    write = tdvmcall_a1_read(vcpu);
> > > +    gpa = tdvmcall_a2_read(vcpu);
> > > +    val = write ? tdvmcall_a3_read(vcpu) : 0;
> > > +
> > > +    if (size != 1 && size != 2 && size != 4 && size != 8)
> > > +        goto error;
> > > +    if (write != 0 && write != 1)
> > > +        goto error;
> > > +
> > > +    /* Strip the shared bit, allow MMIO with and without it set. */
> > Based on the discussion at
> > https://lore.kernel.org/all/ZcUO5sFEAIH68JIA@google.com/
> > do we still allow MMIO without the shared bit set?

That's independent.  That discussion is about how to work around the
guest accessing the MMIO region with a private GPA.  This part is about
the guest issuing TDG.VP.VMCALL<MMIO>, where KVM masks out the shared
bit to make the GPA friendly to the user space VMM.
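
For illustration, a sketch of the stripping (assuming a TD with 52-bit
GPAW, where the shared bit is GPA bit 51; the constants are examples,
not from the patch):

/*
 * kvm_gfn_shared_mask() returns the shared bit as a gfn mask
 * (BIT(51 - PAGE_SHIFT) for 52-bit GPAW), and gfn_to_gpa() shifts it
 * back up to BIT(51).
 */
gpa_t gpa = BIT_ULL(51) | 0xfeb00000;	/* MMIO access via shared GPA */

gpa &= ~gfn_to_gpa(kvm_gfn_shared_mask(vcpu->kvm));
/* gpa == 0xfeb00000, the GPA the device model was registered at */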



> > > +    gpa = gpa & ~gfn_to_gpa(kvm_gfn_shared_mask(vcpu->kvm));
> > > +
> > > +    if (size > 8u || ((gpa + size - 1) ^ gpa) & PAGE_MASK)
> > "size > 8u" can be removed, since based on the check of size above, it
> > can't be greater than 8.

Yes, will remove the check.


> > > +        goto error;
> > > +
> > > +    slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
> > > +    if (slot && !(slot->flags & KVM_MEMSLOT_INVALID))
> > > +        goto error;
> > > +
> > > +    if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
> > Should this be checked for write first?
> > 
> > I checked handle_ept_misconfig() in VMX; it doesn't check for write
> > first either.
> > 
> > Functionally, it should be OK since the guest will not read the address
> > range of fast MMIO, so the read case will be filtered out by
> > ioeventfd_write().  But it takes a long path to get to ioeventfd_write().
> > Wouldn't it be more efficient to check for write first?
> 
> I see now why handle_ept_misconfig() tries the fast MMIO write without
> checking: it was intended to make fast MMIO faster, and for the EPT
> misconfig case it's not easy to get the read/write information.
> 
> But in this patch we already have the read/write information, so maybe
> we can add the check for write before the fast MMIO probe?

Yes, let's add it.
Binbin Wu April 19, 2024, 1:42 a.m. UTC | #4
On 4/19/2024 5:22 AM, Isaku Yamahata wrote:
> On Thu, Apr 18, 2024 at 07:04:11PM +0800,
> Binbin Wu <binbin.wu@linux.intel.com> wrote:
>
>>
>> On 4/18/2024 5:29 PM, Binbin Wu wrote:
>>>> +
>>>> +static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +    struct kvm_memory_slot *slot;
>>>> +    int size, write, r;
>>>> +    unsigned long val;
>>>> +    gpa_t gpa;
>>>> +
>>>> +    KVM_BUG_ON(vcpu->mmio_needed, vcpu->kvm);
>>>> +
>>>> +    size = tdvmcall_a0_read(vcpu);
>>>> +    write = tdvmcall_a1_read(vcpu);
>>>> +    gpa = tdvmcall_a2_read(vcpu);
>>>> +    val = write ? tdvmcall_a3_read(vcpu) : 0;
>>>> +
>>>> +    if (size != 1 && size != 2 && size != 4 && size != 8)
>>>> +        goto error;
>>>> +    if (write != 0 && write != 1)
>>>> +        goto error;
>>>> +
>>>> +    /* Strip the shared bit, allow MMIO with and without it set. */
> > > Based on the discussion at
> > > https://lore.kernel.org/all/ZcUO5sFEAIH68JIA@google.com/
> > > do we still allow MMIO without the shared bit set?
> That's independent.  That discussion is about how to work around the
> guest accessing the MMIO region with a private GPA.  This part is about
> the guest issuing TDG.VP.VMCALL<MMIO>, where KVM masks out the shared
> bit to make the GPA friendly to the user space VMM.
It's similar.
The TDVMCALL from the guest for MMIO can also carry a private GPA, which
is not reasonable, right?
According to the comment, KVM doesn't care whether the TD guest issues
the TDVMCALL with a private GPA or a shared GPA.
Isaku Yamahata April 19, 2024, 5:34 p.m. UTC | #5
On Fri, Apr 19, 2024 at 09:42:48AM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> 
> 
> On 4/19/2024 5:22 AM, Isaku Yamahata wrote:
> > On Thu, Apr 18, 2024 at 07:04:11PM +0800,
> > Binbin Wu <binbin.wu@linux.intel.com> wrote:
> > 
> > > 
> > > On 4/18/2024 5:29 PM, Binbin Wu wrote:
> > > > > +
> > > > > +static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
> > > > > +{
> > > > > +    struct kvm_memory_slot *slot;
> > > > > +    int size, write, r;
> > > > > +    unsigned long val;
> > > > > +    gpa_t gpa;
> > > > > +
> > > > > +    KVM_BUG_ON(vcpu->mmio_needed, vcpu->kvm);
> > > > > +
> > > > > +    size = tdvmcall_a0_read(vcpu);
> > > > > +    write = tdvmcall_a1_read(vcpu);
> > > > > +    gpa = tdvmcall_a2_read(vcpu);
> > > > > +    val = write ? tdvmcall_a3_read(vcpu) : 0;
> > > > > +
> > > > > +    if (size != 1 && size != 2 && size != 4 && size != 8)
> > > > > +        goto error;
> > > > > +    if (write != 0 && write != 1)
> > > > > +        goto error;
> > > > > +
> > > > > +    /* Strip the shared bit, allow MMIO with and without it set. */
> > > > Based on the discussion at
> > > > https://lore.kernel.org/all/ZcUO5sFEAIH68JIA@google.com/
> > > > do we still allow MMIO without the shared bit set?
> > That's independent.  That discussion is about how to work around the
> > guest accessing the MMIO region with a private GPA.  This part is about
> > the guest issuing TDG.VP.VMCALL<MMIO>, where KVM masks out the shared
> > bit to make the GPA friendly to the user space VMM.
> It's similar.
> The TDVMCALL from the guest for MMIO can also carry a private GPA, which
> is not reasonable, right?
> According to the comment, KVM doesn't care whether the TD guest issues
> the TDVMCALL with a private GPA or a shared GPA.

I checked the GHCI spec.  It clearly states this hypercall is for shared
GPAs; we should return an error for a private GPA.

  This TDG.VP.VMCALL is used to help request the VMM perform
  emulated-MMIO-access operation. The VMM may emulate MMIO space in shared-GPA
  space. The VMM can induce a #VE on these shared-GPA accesses by mapping shared
  GPAs with the suppress-VE bit cleared in the EPT Entries corresponding to
  these mappings

So we'll have something like the following.  Compile-tested only.

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 3bf0d6e3cd21..0f696f3fbd86 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1281,24 +1281,35 @@ static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
        if (write != 0 && write != 1)
                goto error;
 
-       /* Strip the shared bit, allow MMIO with and without it set. */
+       /*
+        * MMIO with TDG.VP.VMCALL<MMIO> allows only shared GPA because
+        * private GPA is for device assignment.
+        */
+       if (kvm_is_private_gpa(gpa))
+               goto error;
+
+       /*
+        * Strip the shared bit because device emulator is assigned to GPA
+        * without shared bit.  We'd like the existing code untouched.
+        */
        gpa = gpa & ~gfn_to_gpa(kvm_gfn_shared_mask(vcpu->kvm));
 
-       if (size > 8u || ((gpa + size - 1) ^ gpa) & PAGE_MASK)
+       /* Disallow MMIO crossing page boundary for simplicity. */
+       if (((gpa + size - 1) ^ gpa) & PAGE_MASK)
                goto error;
 
        slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
        if (slot && !(slot->flags & KVM_MEMSLOT_INVALID))
                goto error;
 
-       if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
-               trace_kvm_fast_mmio(gpa);
-               return 1;
-       }
-
-       if (write)
+       if (write) {
+               if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+                       trace_kvm_fast_mmio(gpa);
+                       return 1;
+               }
                r = tdx_mmio_write(vcpu, gpa, size, val);
-       else
+       } else {
                r = tdx_mmio_read(vcpu, gpa, size);
+       }
        if (!r) {
                /* Kernel completed device emulation. */
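
To round out the flow the patch sets up: when tdx_emulate_mmio()
returns 0 with KVM_EXIT_MMIO, a userspace VMM completes the access and
the next KVM_RUN invokes tdx_complete_mmio().  A minimal sketch of that
run loop (generic KVM userspace API; "run" is the mmap of the vcpu fd,
and device_mmio_read/write are hypothetical device-model hooks, not
part of this patch):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical device-model hooks, for illustration only. */
void device_mmio_read(__u64 gpa, void *data, __u32 len);
void device_mmio_write(__u64 gpa, const void *data, __u32 len);

static void run_vcpu_once(int vcpu_fd, struct kvm_run *run)
{
	ioctl(vcpu_fd, KVM_RUN, 0);

	if (run->exit_reason == KVM_EXIT_MMIO) {
		if (run->mmio.is_write)
			device_mmio_write(run->mmio.phys_addr,
					  run->mmio.data, run->mmio.len);
		else
			device_mmio_read(run->mmio.phys_addr,
					 run->mmio.data, run->mmio.len);
		/*
		 * On the next KVM_RUN, KVM calls
		 * vcpu->arch.complete_userspace_io, i.e.
		 * tdx_complete_mmio(), which for reads feeds
		 * run->mmio.data back as the TDVMCALL return value via
		 * tdvmcall_set_return_val().
		 */
	}
}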

Patch

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 55fc6cc6c816..389bb95d2af0 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1217,6 +1217,118 @@  static int tdx_emulate_io(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
+static int tdx_complete_mmio(struct kvm_vcpu *vcpu)
+{
+	unsigned long val = 0;
+	gpa_t gpa;
+	int size;
+
+	KVM_BUG_ON(vcpu->mmio_needed != 1, vcpu->kvm);
+	vcpu->mmio_needed = 0;
+
+	if (!vcpu->mmio_is_write) {
+		gpa = vcpu->mmio_fragments[0].gpa;
+		size = vcpu->mmio_fragments[0].len;
+
+		memcpy(&val, vcpu->run->mmio.data, size);
+		tdvmcall_set_return_val(vcpu, val);
+		trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+	}
+	return 1;
+}
+
+static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
+				 unsigned long val)
+{
+	if (kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, gpa, size, &val) &&
+	    kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+		return -EOPNOTSUPP;
+
+	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
+	return 0;
+}
+
+static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
+{
+	unsigned long val;
+
+	if (kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, gpa, size, &val) &&
+	    kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+		return -EOPNOTSUPP;
+
+	tdvmcall_set_return_val(vcpu, val);
+	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+	return 0;
+}
+
+static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
+{
+	struct kvm_memory_slot *slot;
+	int size, write, r;
+	unsigned long val;
+	gpa_t gpa;
+
+	KVM_BUG_ON(vcpu->mmio_needed, vcpu->kvm);
+
+	size = tdvmcall_a0_read(vcpu);
+	write = tdvmcall_a1_read(vcpu);
+	gpa = tdvmcall_a2_read(vcpu);
+	val = write ? tdvmcall_a3_read(vcpu) : 0;
+
+	if (size != 1 && size != 2 && size != 4 && size != 8)
+		goto error;
+	if (write != 0 && write != 1)
+		goto error;
+
+	/* Strip the shared bit, allow MMIO with and without it set. */
+	gpa = gpa & ~gfn_to_gpa(kvm_gfn_shared_mask(vcpu->kvm));
+
+	if (size > 8u || ((gpa + size - 1) ^ gpa) & PAGE_MASK)
+		goto error;
+
+	slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
+	if (slot && !(slot->flags & KVM_MEMSLOT_INVALID))
+		goto error;
+
+	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+		trace_kvm_fast_mmio(gpa);
+		return 1;
+	}
+
+	if (write)
+		r = tdx_mmio_write(vcpu, gpa, size, val);
+	else
+		r = tdx_mmio_read(vcpu, gpa, size);
+	if (!r) {
+		/* Kernel completed device emulation. */
+		tdvmcall_set_return_code(vcpu, TDVMCALL_SUCCESS);
+		return 1;
+	}
+
+	/* Request the device emulation to userspace device model. */
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_is_write = write;
+	vcpu->arch.complete_userspace_io = tdx_complete_mmio;
+
+	vcpu->run->mmio.phys_addr = gpa;
+	vcpu->run->mmio.len = size;
+	vcpu->run->mmio.is_write = write;
+	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+	if (write) {
+		memcpy(vcpu->run->mmio.data, &val, size);
+	} else {
+		vcpu->mmio_fragments[0].gpa = gpa;
+		vcpu->mmio_fragments[0].len = size;
+		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
+	}
+	return 0;
+
+error:
+	tdvmcall_set_return_code(vcpu, TDVMCALL_INVALID_OPERAND);
+	return 1;
+}
+
 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 {
 	if (tdvmcall_exit_type(vcpu))
@@ -1229,6 +1341,8 @@  static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 		return tdx_emulate_hlt(vcpu);
 	case EXIT_REASON_IO_INSTRUCTION:
 		return tdx_emulate_io(vcpu);
+	case EXIT_REASON_EPT_VIOLATION:
+		return tdx_emulate_mmio(vcpu);
 	default:
 		break;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03950368d8db..d5b18cad9dcd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13975,6 +13975,7 @@  EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e27c22449d85..bc14e1f2610c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2689,6 +2689,7 @@  struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
 
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
@@ -5992,6 +5993,7 @@  int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 	r = __kvm_io_bus_read(vcpu, bus, &range, val);
 	return r < 0 ? r : 0;
 }
+EXPORT_SYMBOL_GPL(kvm_io_bus_read);
 
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev)