
[5/25] Xen/doc: Add Xen virtual IOMMU doc

Message ID 1498715457-16565-6-git-send-email-tianyu.lan@intel.com (mailing list archive)
State New, archived

Commit Message

lan,Tianyu June 29, 2017, 5:50 a.m. UTC
This patch adds a Xen virtual IOMMU doc introducing the motivation,
framework, vIOMMU hypercall interface and xl configuration.

Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
---
 docs/misc/viommu.txt | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 docs/misc/viommu.txt

Comments

Julien Grall July 4, 2017, 10:39 a.m. UTC | #1
Hi,

On 06/29/2017 06:50 AM, Lan Tianyu wrote:
> This patch is to add Xen virtual IOMMU doc to introduce motivation,
> framework, vIOMMU hypercall and xl configuration.
> 
> Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
> ---
>   docs/misc/viommu.txt | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 129 insertions(+)
>   create mode 100644 docs/misc/viommu.txt
> 
> diff --git a/docs/misc/viommu.txt b/docs/misc/viommu.txt
> new file mode 100644
> index 0000000..76d4cee
> --- /dev/null
> +++ b/docs/misc/viommu.txt
> @@ -0,0 +1,129 @@
> +Xen virtual IOMMU
> +
> +Motivation
> +==========
> +*) Enable more than 255 vcpu support
> +HPC cloud service requires VM provides high performance parallel
> +computing and we hope to create a huge VM with >255 vcpu on one machine
> +to meet such requirement. Pin each vcpu to separate pcpus.
> +
> +To support >255 vcpus, X2APIC mode in guest is necessary because legacy
> +APIC(XAPIC) just supports 8-bit APIC ID and it only can support 255
> +vcpus at most. X2APIC mode supports 32-bit APIC ID and it requires
> +interrupt mapping function of vIOMMU.
> +
> +The reason for this is that there is no modification to existing PCI MSI
> +and IOAPIC with the introduction of X2APIC. PCI MSI/IOAPIC can only send
> +interrupt message containing 8-bit APIC ID, which cannot address >255
> +cpus. Interrupt remapping supports 32-bit APIC ID and so it's necessary
> +to enable >255 cpus with x2apic mode.
> +
> +
> +vIOMMU Architecture
> +===================
> +vIOMMU device model is inside Xen hypervisor for following factors
> +    1) Avoid round trips between Qemu and Xen hypervisor
> +    2) Ease of integration with the rest of hypervisor
> +    3) HVMlite/PVH doesn't use Qemu
> +
> +* Interrupt remapping overview.
> +Interrupts from virtual devices and physical devices are delivered
> +to vLAPIC from vIOAPIC and vMSI. vIOMMU needs to remap interrupt during
> +this procedure.
> +
> ++---------------------------------------------------+
> +|Qemu                       |VM                     |
> +|                           | +----------------+    |
> +|                           | |  Device driver |    |
> +|                           | +--------+-------+    |
> +|                           |          ^            |
> +|       +----------------+  | +--------+-------+    |
> +|       | Virtual device |  | |  IRQ subsystem |    |
> +|       +-------+--------+  | +--------+-------+    |
> +|               |           |          ^            |
> +|               |           |          |            |
> ++---------------------------+-----------------------+
> +|hyperviosr     |                      | VIRQ       |
> +|               |            +---------+--------+   |
> +|               |            |      vLAPIC      |   |
> +|               |VIRQ        +---------+--------+   |
> +|               |                      ^            |
> +|               |                      |            |
> +|               |            +---------+--------+   |
> +|               |            |      vIOMMU      |   |
> +|               |            +---------+--------+   |
> +|               |                      ^            |
> +|               |                      |            |
> +|               |            +---------+--------+   |
> +|               |            |   vIOAPIC/vMSI   |   |
> +|               |            +----+----+--------+   |
> +|               |                 ^    ^            |
> +|               +-----------------+    |            |
> +|                                      |            |
> ++---------------------------------------------------+
> +HW                                     |IRQ
> +                                +-------------------+
> +                                |   PCI Device      |
> +                                +-------------------+
> +
> +
> +vIOMMU hypercall
> +================
> +Introduce new domctl hypercall "xen_domctl_viommu_op" to create/destroy
> +vIOMMU and query vIOMMU capabilities that device model can support.
> +
> +* vIOMMU hypercall parameter structure
> +struct xen_domctl_viommu_op {
> +    uint32_t cmd;
> +#define XEN_DOMCTL_create_viommu          0
> +#define XEN_DOMCTL_destroy_viommu         1
> +#define XEN_DOMCTL_query_viommu_caps      2

I am a bit confused. This is only creating the vIOMMU. However, there 
might be multiple host IOMMUs, how do you link them together?

> +    union {
> +        struct {
> +            /* IN - vIOMMU type */
> +            uint64_t viommu_type;

This is a bit confusing, you don't define what should be the value of 
viommu_type, ...

> +            /* IN - MMIO base address of vIOMMU. */
> +            uint64_t base_address;
> +            /* IN - Length of MMIO region */
> +            uint64_t length;
> +            /* IN - Capabilities with which we want to create */
> +            uint64_t capabilities;

... capabilities ...

> +            /* OUT - vIOMMU identity */
> +            uint32_t viommu_id;
> +        } create_viommu;
> +
> +        struct {
> +            /* IN - vIOMMU identity */
> +            uint32_t viommu_id;
> +        } destroy_viommu;
> +
> +        struct {
> +            /* IN - vIOMMU type */
> +            uint64_t viommu_type;
> +            /* OUT - vIOMMU Capabilities */
> +            uint64_t caps;

... and caps. I see you have defined them in a separate header
(viommu.h). But there is no way for the developer to know that they
should be used.

> +        } query_caps;
> +    } u;
> +};
> +
> +- XEN_DOMCTL_query_viommu_caps
> +    Query capabilities of vIOMMU device model. vIOMMU_type specifies
> +which vendor vIOMMU device model(E,G Intel VTD) is targeted and hypervisor

"E,G" did you mean "e.g"?

> +returns capability bits(E,G interrupt remapping bit).

Ditto.

A given platform may have multiple IOMMUs with different features. Are 
we expecting

> +
> +- XEN_DOMCTL_create_viommu
> +    Create vIOMMU device with vIOMMU_type, capabilities, MMIO
> +base address and length. Hypervisor returns viommu_id. Capabilities should
> +be in range of value returned by query_viommu_caps hypercall.

Can you explain what mmio and length are here for? Do you expect to trap 
and emulate the MMIO region in Xen?

From just looking at the document, I am struggling to understand how
this is going to be useful.

> +
> +- XEN_DOMCTL_destroy_viommu
> +    Destroy vIOMMU in Xen hypervisor with viommu_id as parameters.
> +
> +xl vIOMMU configuration
> +=======================
> +viommu="type=vtd,intremap=1,x2apic=1"
> +
> +"type" - Specify vIOMMU device model type. Currently only supports Intel vtd
> +device model.
> +"intremap" - Enable vIOMMU interrupt remapping function.
> +"x2apic" - Support x2apic mode with interrupt remapping function.
> 

Cheers,
lan,Tianyu July 5, 2017, 3:15 a.m. UTC | #2
Hi Julien:
	Thanks for your review.

On 2017年07月04日 18:39, Julien Grall wrote:
>> +vIOMMU hypercall
>> +================
>> +Introduce new domctl hypercall "xen_domctl_viommu_op" to create/destroy
>> +vIOMMU and query vIOMMU capabilities that device model can support.
>> +
>> +* vIOMMU hypercall parameter structure
>> +struct xen_domctl_viommu_op {
>> +    uint32_t cmd;
>> +#define XEN_DOMCTL_create_viommu          0
>> +#define XEN_DOMCTL_destroy_viommu         1
>> +#define XEN_DOMCTL_query_viommu_caps      2
> 
> I am a bit confused. This is only creating the vIOMMU. However, there
> might be multiple host IOMMUs, how do you link them together?
> 
>> +    union {
>> +        struct {
>> +            /* IN - vIOMMU type */
>> +            uint64_t viommu_type;
> 
> This is a bit confusing, you don't define what should be the value of
> viommu_type, ...
> 
>> +            /* IN - MMIO base address of vIOMMU. */
>> +            uint64_t base_address;
>> +            /* IN - Length of MMIO region */
>> +            uint64_t length;
>> +            /* IN - Capabilities with which we want to create */
>> +            uint64_t capabilities;
> 
> ... capabilities ...
> 

Sorry, I missed the type and capability definitions here.

/* VIOMMU type */
#define VIOMMU_TYPE_INTEL_VTD     (1u << 0)

/* VIOMMU capabilities*/
#define VIOMMU_CAP_IRQ_REMAPPING  (1u << 0)

"viommu_type" means vendor vIOMMU device model. So far, we just support
virtual Intel VTD.

"capabilities" means the feature that vIOMMU supports. We just add
interrupt remapping for virtual VTD.
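
As an illustration only, a toolstack caller might combine those macros with
the hypercall structure roughly as below. do_domctl() is just a placeholder
for whatever helper actually issues the domctl and is not an interface
introduced by this series:

    static int create_vtd_viommu(uint32_t domid, uint64_t base, uint64_t len)
    {
        struct xen_domctl_viommu_op op = {
            .cmd = XEN_DOMCTL_query_viommu_caps,
            .u.query_caps.viommu_type = VIOMMU_TYPE_INTEL_VTD,
        };
        uint64_t wanted = VIOMMU_CAP_IRQ_REMAPPING;
        int rc = do_domctl(domid, &op);   /* hypervisor fills op.u.query_caps.caps */

        if ( rc || (wanted & ~op.u.query_caps.caps) )
            return -1;                    /* capability not offered by the device model */

        op.cmd = XEN_DOMCTL_create_viommu;
        op.u.create_viommu.viommu_type  = VIOMMU_TYPE_INTEL_VTD;
        op.u.create_viommu.base_address = base;  /* MMIO base the toolstack reports in ACPI */
        op.u.create_viommu.length       = len;   /* 4KB for the virtual VTD */
        op.u.create_viommu.capabilities = wanted;

        return do_domctl(domid, &op);     /* on success op.u.create_viommu.viommu_id is set */
    }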


>> +            /* OUT - vIOMMU identity */
>> +            uint32_t viommu_id;
>> +        } create_viommu;
>> +
>> +        struct {
>> +            /* IN - vIOMMU identity */
>> +            uint32_t viommu_id;
>> +        } destroy_viommu;
>> +
>> +        struct {
>> +            /* IN - vIOMMU type */
>> +            uint64_t viommu_type;
>> +            /* OUT - vIOMMU Capabilities */
>> +            uint64_t caps;
> 
> ... and caps. I see you have defined them in a separate header
> (viommu.h). But there are no way for the developer to know that they
> should be used.

Macros of "Capabilities" and "type" are defined under public directory
in order to tool stack also can use them to pass vIOMMU type and
capabilities.


> 
>> +        } query_caps;
>> +    } u;
>> +};
>> +
>> +- XEN_DOMCTL_query_viommu_caps
>> +    Query capabilities of vIOMMU device model. vIOMMU_type specifies
>> +which vendor vIOMMU device model(E,G Intel VTD) is targeted and
>> hypervisor
> 
> "E,G" did you mean "e.g"?

Yes. Will update.

> 
>> +returns capability bits(E,G interrupt remapping bit).
> 
> Ditto.
> 
> A given platform may have multiple IOMMUs with different features. Are
> we expecting

So far, our patchset just supports a VM with one vIOMMU as a starter.

Do you mean that emulation of some vIOMMU capabilities relies on the
physical IOMMU and that there are multiple IOMMUs with different features?

If yes, we need to emulate multiple vIOMMUs for the assigned devices
under different pIOMMUs. The vendor vIOMMU device model needs to check
the assigned devices and whether the given capabilities passed by the
tool stack can be supported.

> 
>> +
>> +- XEN_DOMCTL_create_viommu
>> +    Create vIOMMU device with vIOMMU_type, capabilities, MMIO
>> +base address and length. Hypervisor returns viommu_id. Capabilities
>> should
>> +be in range of value returned by query_viommu_caps hypercall.
> 
> Can you explain what mmio and length are here for? Do you expect to trap
> and emulate the MMIO region in Xen?

Yes, we need to emulate the VTD MMIO registers in the Xen hypervisor;
this was agreed during the design stage. The MMIO base address is passed
to the guest via the ACPI table built by the tool stack, so the tool
stack manages the vIOMMU MMIO region. When creating the vIOMMU, the base
address and length need to be passed.

For ARM, maybe the base address would be passed via device tree?

> 
> From just looking at the document. I am struggling to understand how
> this is going to be useful.
> 
>> +
>> +- XEN_DOMCTL_destroy_viommu
>> +    Destroy vIOMMU in Xen hypervisor with viommu_id as parameters.
>> +
>> +xl vIOMMU configuration
>> +=======================
>> +viommu="type=vtd,intremap=1,x2apic=1"
>> +
>> +"type" - Specify vIOMMU device model type. Currently only supports
>> Intel vtd
>> +device model.
>> +"intremap" - Enable vIOMMU interrupt remapping function.
>> +"x2apic" - Support x2apic mode with interrupt remapping function.
Julien Grall July 5, 2017, 1:25 p.m. UTC | #3
On 05/07/17 04:15, Lan Tianyu wrote:
> Hi Julien:

Hi Tianyu Lan,

> 	Thanks for your review.
>
> On 2017年07月04日 18:39, Julien Grall wrote:
>>> +vIOMMU hypercall
>>> +================
>>> +Introduce new domctl hypercall "xen_domctl_viommu_op" to create/destroy
>>> +vIOMMU and query vIOMMU capabilities that device model can support.
>>> +
>>> +* vIOMMU hypercall parameter structure
>>> +struct xen_domctl_viommu_op {
>>> +    uint32_t cmd;
>>> +#define XEN_DOMCTL_create_viommu          0
>>> +#define XEN_DOMCTL_destroy_viommu         1
>>> +#define XEN_DOMCTL_query_viommu_caps      2
>>
>> I am a bit confused. This is only creating the vIOMMU. However, there
>> might be multiple host IOMMUs, how do you link them together?
>>
>>> +    union {
>>> +        struct {
>>> +            /* IN - vIOMMU type */
>>> +            uint64_t viommu_type;
>>
>> This is a bit confusing, you don't define what should be the value of
>> viommu_type, ...
>>
>>> +            /* IN - MMIO base address of vIOMMU. */
>>> +            uint64_t base_address;
>>> +            /* IN - Length of MMIO region */
>>> +            uint64_t length;
>>> +            /* IN - Capabilities with which we want to create */
>>> +            uint64_t capabilities;
>>
>> ... capabilities ...
>>
>
> Sorry. miss the type and capability definition here.
>
> /* VIOMMU type */
> #define VIOMMU_TYPE_INTEL_VTD     (1u << 0)
>
> /* VIOMMU capabilities*/
> #define VIOMMU_CAP_IRQ_REMAPPING  (1u << 0)
>
> "viommu_type" means vendor vIOMMU device model. So far, we just support
> virtual Intel VTD.
>
> "capabilities" means the feature that vIOMMU supports. We just add
> interrupt remapping for virtual VTD.
>
>
>>> +            /* OUT - vIOMMU identity */
>>> +            uint32_t viommu_id;
>>> +        } create_viommu;
>>> +
>>> +        struct {
>>> +            /* IN - vIOMMU identity */
>>> +            uint32_t viommu_id;
>>> +        } destroy_viommu;
>>> +
>>> +        struct {
>>> +            /* IN - vIOMMU type */
>>> +            uint64_t viommu_type;
>>> +            /* OUT - vIOMMU Capabilities */
>>> +            uint64_t caps;
>>
>> ... and caps. I see you have defined them in a separate header
>> (viommu.h). But there are no way for the developer to know that they
>> should be used.
>
> Macros of "Capabilities" and "type" are defined under public directory
> in order to tool stack also can use them to pass vIOMMU type and
> capabilities.

My point was that if a developer reads domctl.h first, he cannot guess
that the values to be used in "capabilities" and "type" are defined in a
separate header (viommu.h). You should at least write down a comment in
the code explaining that.
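
For instance, something along these lines in domctl.h (the wording is only
a suggestion):

    /*
     * IN - vIOMMU type; one of the VIOMMU_TYPE_* values defined in the
     * public viommu.h header.
     */
    uint64_t viommu_type;
    /* IN - MMIO base address of vIOMMU. */
    uint64_t base_address;
    /* IN - Length of MMIO region */
    uint64_t length;
    /*
     * IN - capabilities with which we want to create; a mask of the
     * VIOMMU_CAP_* bits defined in the public viommu.h header.
     */
    uint64_t capabilities;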

>
>
>>
>>> +        } query_caps;
>>> +    } u;
>>> +};
>>> +
>>> +- XEN_DOMCTL_query_viommu_caps
>>> +    Query capabilities of vIOMMU device model. vIOMMU_type specifies
>>> +which vendor vIOMMU device model(E,G Intel VTD) is targeted and
>>> hypervisor
>>
>> "E,G" did you mean "e.g"?
>
> Yes. Will update.
>
>>
>>> +returns capability bits(E,G interrupt remapping bit).
>>
>> Ditto.
>>
>> A given platform may have multiple IOMMUs with different features. Are
>> we expecting
>
> So far, our patchset just supports VM with one vIOMMU as starter.
>
> Do you mean emulation of some vIOMMU capabilities rely on physical IOMMU
> and there are multiple IOMMUs with different feature?
>
> If yes, we need to emulate mult-vIOMMU for different assigned devices
> under different pIOMMU. Vendor vIOMMU device model needs to check
> whether the assigned device and support given capabilities passed by
> tool stack.

Hmmm, I think I was a bit confused with the domctl. You are querying the 
vIOMMU capabilities and they may be different from the physical IOMMU right?

>
>>
>>> +
>>> +- XEN_DOMCTL_create_viommu
>>> +    Create vIOMMU device with vIOMMU_type, capabilities, MMIO
>>> +base address and length. Hypervisor returns viommu_id. Capabilities
>>> should
>>> +be in range of value returned by query_viommu_caps hypercall.
>>
>> Can you explain what mmio and length are here for? Do you expect to trap
>> and emulate the MMIO region in Xen?
>
> Yes, we need to emulate VTD MMIO register in the Xen hypervisor and this
> is agreement under design stage. The MMIO base address is passed to
> guest via ACPI table which is built by tool stack and so tool stack
> manages vIOMMU MMIO region. When create vIOMMU, base address and length
> needs to be passed.

I am not yet sure we want to emulate an IOMMU for ARM. They are a bit
complex to emulate and we have multiple ones (SMMUv2, SMMUv3,
IPMMU-VMSA, ...). So PV might be the solution here. Though, it is too
early to decide.

If we wanted to use emulation, an IOMMU may have multiple MMIO ranges 
and multiple interrupts (either legacy or MSI). Here you are assuming 
only one MMIO and no interrupt. This new interface is a DOMCTL so it 
might be ok to extend it in the future?

Furthermore, on ARM we would be able to create the vIOMMU but it would
be unusable. Indeed, IOMMUs are only used to protect devices. But I
don't see any way to say "This device is protected by the IOMMU". Did I
miss anything?

>
> For arm, the base address maybe passed by device tree?

Either Device Tree or ACPI. I don't think it matters here.

>
>>
>> From just looking at the document. I am struggling to understand how
>> this is going to be useful.
>>
>>> +
>>> +- XEN_DOMCTL_destroy_viommu
>>> +    Destroy vIOMMU in Xen hypervisor with viommu_id as parameters.
>>> +
>>> +xl vIOMMU configuration
>>> +=======================
>>> +viommu="type=vtd,intremap=1,x2apic=1"
>>> +
>>> +"type" - Specify vIOMMU device model type. Currently only supports
>>> Intel vtd
>>> +device model.
>>> +"intremap" - Enable vIOMMU interrupt remapping function.
>>> +"x2apic" - Support x2apic mode with interrupt remapping function.
>

Cheers,
lan,Tianyu July 6, 2017, 3:10 a.m. UTC | #4
On 2017年07月05日 21:25, Julien Grall wrote:
> 
> 
> On 05/07/17 04:15, Lan Tianyu wrote:
>> Hi Julien:
> 
> Hi Tianyu Lan,
> 
>>     Thanks for your review.
>>
>> On 2017年07月04日 18:39, Julien Grall wrote:
>>>> +vIOMMU hypercall
>>>> +================
>>>> +Introduce new domctl hypercall "xen_domctl_viommu_op" to
>>>> create/destroy
>>>> +vIOMMU and query vIOMMU capabilities that device model can support.
>>>> +
>>>> +* vIOMMU hypercall parameter structure
>>>> +struct xen_domctl_viommu_op {
>>>> +    uint32_t cmd;
>>>> +#define XEN_DOMCTL_create_viommu          0
>>>> +#define XEN_DOMCTL_destroy_viommu         1
>>>> +#define XEN_DOMCTL_query_viommu_caps      2
>>>
>>> I am a bit confused. This is only creating the vIOMMU. However, there
>>> might be multiple host IOMMUs, how do you link them together?
>>>
>>>> +    union {
>>>> +        struct {
>>>> +            /* IN - vIOMMU type */
>>>> +            uint64_t viommu_type;
>>>
>>> This is a bit confusing, you don't define what should be the value of
>>> viommu_type, ...
>>>
>>>> +            /* IN - MMIO base address of vIOMMU. */
>>>> +            uint64_t base_address;
>>>> +            /* IN - Length of MMIO region */
>>>> +            uint64_t length;
>>>> +            /* IN - Capabilities with which we want to create */
>>>> +            uint64_t capabilities;
>>>
>>> ... capabilities ...
>>>
>>
>> Sorry. miss the type and capability definition here.
>>
>> /* VIOMMU type */
>> #define VIOMMU_TYPE_INTEL_VTD     (1u << 0)
>>
>> /* VIOMMU capabilities*/
>> #define VIOMMU_CAP_IRQ_REMAPPING  (1u << 0)
>>
>> "viommu_type" means vendor vIOMMU device model. So far, we just support
>> virtual Intel VTD.
>>
>> "capabilities" means the feature that vIOMMU supports. We just add
>> interrupt remapping for virtual VTD.
>>
>>
>>>> +            /* OUT - vIOMMU identity */
>>>> +            uint32_t viommu_id;
>>>> +        } create_viommu;
>>>> +
>>>> +        struct {
>>>> +            /* IN - vIOMMU identity */
>>>> +            uint32_t viommu_id;
>>>> +        } destroy_viommu;
>>>> +
>>>> +        struct {
>>>> +            /* IN - vIOMMU type */
>>>> +            uint64_t viommu_type;
>>>> +            /* OUT - vIOMMU Capabilities */
>>>> +            uint64_t caps;
>>>
>>> ... and caps. I see you have defined them in a separate header
>>> (viommu.h). But there are no way for the developer to know that they
>>> should be used.
>>
>> Macros of "Capabilities" and "type" are defined under public directory
>> in order to tool stack also can use them to pass vIOMMU type and
>> capabilities.
> 
> My point was that if a developer read domctl.h first, he cannot guess
> that the value to be used in "capabilities" and "type" are defined in a
> separate header (viommu.h). You should at least write down a comment in
> the code explaining that.

Yes, good suggestion; I will update it in the next version.

> 
>>
>>
>>>
>>>> +        } query_caps;
>>>> +    } u;
>>>> +};
>>>> +
>>>> +- XEN_DOMCTL_query_viommu_caps
>>>> +    Query capabilities of vIOMMU device model. vIOMMU_type specifies
>>>> +which vendor vIOMMU device model(E,G Intel VTD) is targeted and
>>>> hypervisor
>>>
>>> "E,G" did you mean "e.g"?
>>
>> Yes. Will update.
>>
>>>
>>>> +returns capability bits(E,G interrupt remapping bit).
>>>
>>> Ditto.
>>>
>>> A given platform may have multiple IOMMUs with different features. Are
>>> we expecting
>>
>> So far, our patchset just supports VM with one vIOMMU as starter.
>>
>> Do you mean emulation of some vIOMMU capabilities rely on physical IOMMU
>> and there are multiple IOMMUs with different feature?
>>
>> If yes, we need to emulate mult-vIOMMU for different assigned devices
>> under different pIOMMU. Vendor vIOMMU device model needs to check
>> whether the assigned device and support given capabilities passed by
>> tool stack.
> 
> Hmmm, I think I was a bit confused with the domctl. You are querying the
> vIOMMU capabilities and they may be different from the physical IOMMU
> right?

Yes, that's possible if we pass through two devices under different
physical IOMMUs.

> 
>>
>>>
>>>> +
>>>> +- XEN_DOMCTL_create_viommu
>>>> +    Create vIOMMU device with vIOMMU_type, capabilities, MMIO
>>>> +base address and length. Hypervisor returns viommu_id. Capabilities
>>>> should
>>>> +be in range of value returned by query_viommu_caps hypercall.
>>>
>>> Can you explain what mmio and length are here for? Do you expect to trap
>>> and emulate the MMIO region in Xen?
>>
>> Yes, we need to emulate VTD MMIO register in the Xen hypervisor and this
>> is agreement under design stage. The MMIO base address is passed to
>> guest via ACPI table which is built by tool stack and so tool stack
>> manages vIOMMU MMIO region. When create vIOMMU, base address and length
>> needs to be passed.
> 
> I am not yet sure we want to emulate an IOMMU for ARM. They are a bit
> complex to emulate and we have multiple one (SMMUv2, SMMUv3,
> IPMMU-VMSA,...). So PV might be the solution here. Though, it is too
> early to decide.

Yes. What I gathered about ARM vIOMMU from the KVM side is that ARM
engineers are pushing a PV IOMMU, and the reason for that is just as you
said about the multiple IOMMU versions.

https://www.spinics.net/lists/kvm/msg147990.html

> 
> If we wanted to use emulation, an IOMMU may have multiple MMIO ranges
> and multiple interrupts (either legacy or MSI). Here you are assuming
> only one MMIO and no interrupt. This new interface is a DOMCTL so it
> might be ok to extend it in the future?

For Intel VTD, one instance's MMIO registers live in a "4KB-aligned
memory-mapped location", so we just need to pass the base address and
length (4KB). If another vendor has multiple MMIO regions, the structure
can be extended.

Because we now have just one vIOMMU, all virtual interrupts will be bound
to it. If we need to support multiple vIOMMUs, we can add a device-scope
field (an sbdf array or something like that) to the structure and specify
which devices should be under each vIOMMU, as sketched below.
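
Purely as an illustration of that possible extension (nothing like this is
in the current series; the extra field names and the array bound are made
up), create_viommu could grow a device-scope list along these lines:

    struct {
        /* IN - vIOMMU type */
        uint64_t viommu_type;
        /* IN - MMIO base address of vIOMMU */
        uint64_t base_address;
        /* IN - Length of MMIO region */
        uint64_t length;
        /* IN - Capabilities with which we want to create */
        uint64_t capabilities;
        /* IN - Number of valid entries in vsbdf[] (hypothetical) */
        uint32_t nr_devices;
        /* IN - Virtual SBDFs of the devices placed under this vIOMMU */
        uint32_t vsbdf[16];
        /* OUT - vIOMMU identity */
        uint32_t viommu_id;
    } create_viommu;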
lan,Tianyu July 6, 2017, 6:20 a.m. UTC | #5
On 2017年07月05日 21:25, Julien Grall wrote:
> Furthermore, on ARM we would be able to create the vIOMMU but it would
> be unusable. Indeed, IOMMU are only used to protect devices. But you
> don't see any way to say "This device is protected by the IOMMU". Did I
> miss anything?

The "device protection" you mentioned is DMA protection, right?. It's
one of IOMMU capabilities. IOMMU also provides interrupt remapping and
SVM(Shared virtual memory). I see ARM side also is pushing SVM feature
in KVM maillist for native support. Finally, it needs to support SVM in
VM and so virtual IOMMU is necessary regardless of full-virtualized or
PV IOMMU

http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/491614.html
Julien Grall July 7, 2017, 4:08 p.m. UTC | #6
Hi,

On 06/07/17 04:10, Lan Tianyu wrote:
> On 2017年07月05日 21:25, Julien Grall wrote:
>>
>>>
>>>>
>>>>> +
>>>>> +- XEN_DOMCTL_create_viommu
>>>>> +    Create vIOMMU device with vIOMMU_type, capabilities, MMIO
>>>>> +base address and length. Hypervisor returns viommu_id. Capabilities
>>>>> should
>>>>> +be in range of value returned by query_viommu_caps hypercall.
>>>>
>>>> Can you explain what mmio and length are here for? Do you expect to trap
>>>> and emulate the MMIO region in Xen?
>>>
>>> Yes, we need to emulate VTD MMIO register in the Xen hypervisor and this
>>> is agreement under design stage. The MMIO base address is passed to
>>> guest via ACPI table which is built by tool stack and so tool stack
>>> manages vIOMMU MMIO region. When create vIOMMU, base address and length
>>> needs to be passed.
>>
>> I am not yet sure we want to emulate an IOMMU for ARM. They are a bit
>> complex to emulate and we have multiple one (SMMUv2, SMMUv3,
>> IPMMU-VMSA,...). So PV might be the solution here. Though, it is too
>> early to decide.
>
> Yes, What I got ARM vIOMMU from KVM side is that ARM engineer are
> pushing PV IOMMU and reason for that is just like you said about
> multiple IOMMU version.
>
> https://www.spinics.net/lists/kvm/msg147990.html
>
>>
>> If we wanted to use emulation, an IOMMU may have multiple MMIO ranges
>> and multiple interrupts (either legacy or MSI). Here you are assuming
>> only one MMIO and no interrupt. This new interface is a DOMCTL so it
>> might be ok to extend it in the future?
>
> For Intel VTD, one instance's MMIO registers will be in "4KB-aligned
> memorymapped location" and so just need to pass base address and
> length(4KB). If other vendor have multi-MMIO region, the structure can
> be extended.

It can be extended if the hypercall introduced is only part of a
non-stable ABI. I realise that it is a DOMCTL, so I guess it is fine for
it to be extended.

> Because we now just have onE vIOMMU, all virtual interrupt will be bound
> to it. If need to support mult-vIOMMU, we can add device-scope
> field(sbdf array or some thing like that) in the structure and specify
> what devices should be under one vIOMMU.

I am not sure I follow the argument here. Even if you have only one
vIOMMU, you need to be able to do the correspondence between the virtual
MasterID (for PCI it is based on the RID) and the host MasterID.

So how do you do that with your solution?

Cheers,
Julien Grall July 7, 2017, 4:16 p.m. UTC | #7
Hi,

On 06/07/17 07:20, Lan Tianyu wrote:
> On 2017年07月05日 21:25, Julien Grall wrote:
>> Furthermore, on ARM we would be able to create the vIOMMU but it would
>> be unusable. Indeed, IOMMU are only used to protect devices. But you
>> don't see any way to say "This device is protected by the IOMMU". Did I
>> miss anything?
>
> The "device protection" you mentioned is DMA protection, right?. It's
> one of IOMMU capabilities. IOMMU also provides interrupt remapping and
> SVM(Shared virtual memory). I see ARM side also is pushing SVM feature
> in KVM maillist for native support. Finally, it needs to support SVM in
> VM and so virtual IOMMU is necessary regardless of full-virtualized or
> PV IOMMU
>
> http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/491614.html

I don't think SVM is strictly necessary to do DMA protection in the
guest. Not all IOMMUs on ARM are able to use this feature, but you may
still want to allow the guest to use the IOMMU. Did I miss anything?

Cheers,
lan,Tianyu July 12, 2017, 3:09 a.m. UTC | #8
On 2017年07月08日 00:08, Julien Grall wrote:
>> Because we now just have onE vIOMMU, all virtual interrupt will be bound
>> to it. If need to support mult-vIOMMU, we can add device-scope
>> field(sbdf array or some thing like that) in the structure and specify
>> what devices should be under one vIOMMU.
> 
> I am not sure to follow the argument here. Even if you have only one
> vIOMMU you need to be able to do the correspondence between the virtual
> MasterID (for PCI it is based on the RID) and the host MasterID.

Hi Julien:
     Sorry for the late response.
     The MasterID you mentioned here is the sbdf, right? Binding between
the sbdf and the vsbdf (virtual sbdf) should be in the device pass-through
related interface (e.g. xc_domain_bind_pt_irq_int() already does something
similar, binding a vsbdf with a real interrupt in the hypervisor).
     The vIOMMU device model can get the vsbdf when the guest configures a
vIOMMU entry, and the hypervisor can do the conversion between sbdf and
vsbdf. For interrupt remapping on the virtual VTD, we have not found such
a requirement so far and get enough data from the IOAPIC/MSI entries and
the interrupt remapping entries of the virtual VTD. So we don't extend the
pass-through interface.
lan,Tianyu July 12, 2017, 5:34 a.m. UTC | #9
On 2017年07月08日 00:16, Julien Grall wrote:
> Hi,
> 
> On 06/07/17 07:20, Lan Tianyu wrote:
>> On 2017年07月05日 21:25, Julien Grall wrote:
>>> Furthermore, on ARM we would be able to create the vIOMMU but it would
>>> be unusable. Indeed, IOMMU are only used to protect devices. But you
>>> don't see any way to say "This device is protected by the IOMMU". Did I
>>> miss anything?
>>
>> The "device protection" you mentioned is DMA protection, right?. It's
>> one of IOMMU capabilities. IOMMU also provides interrupt remapping and
>> SVM(Shared virtual memory). I see ARM side also is pushing SVM feature
>> in KVM maillist for native support. Finally, it needs to support SVM in
>> VM and so virtual IOMMU is necessary regardless of full-virtualized or
>> PV IOMMU
>>
>> http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/491614.html
>>
> 
> I don't think SVM is strictly necessary to do DMA protection in the
> guest.

SVM and DMA protection are different features of the IOMMU. SVM shares
the same page table (VA->PA) between the CPU and the GPU or another
device, in order to remove the overhead of maintaining two page tables
on the cpu and device sides. Actually this is also a device feature, and
more devices will support SVM besides GPUs.

> Not all IOMMUs on ARM are able to use this feature but you may
> still want to allow the guest using the IOMMU. Did I miss anything?

If the physical IOMMU doesn't support SVM, the vIOMMU device model
should not return the SVM capability to the tool stack when it receives
the "query capabilities" cmd. There should be a capabilities field in a
vIOMMU register or in the ACPI table for the vIOMMU (not sure about the
ARM side) and the SVM capability bit won't be set. So the guest finally
won't enable the SVM feature, as sketched below.
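
A rough sketch of that idea (VIOMMU_CAP_SVM and pviommu_supports_svm() are
made-up names used only for illustration; the series so far only defines
VIOMMU_CAP_IRQ_REMAPPING):

    static uint64_t vvtd_query_caps(void)
    {
        /* Capabilities the virtual VTD device model is able to emulate. */
        uint64_t caps = VIOMMU_CAP_IRQ_REMAPPING | VIOMMU_CAP_SVM;

        /* Drop features the physical IOMMU cannot back, e.g. SVM. */
        if ( !pviommu_supports_svm() )
            caps &= ~VIOMMU_CAP_SVM;

        return caps;
    }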
Julien Grall July 12, 2017, 7:26 a.m. UTC | #10
Hi,

On 07/12/2017 04:09 AM, Lan Tianyu wrote:
> On 2017年07月08日 00:08, Julien Grall wrote:
>>> Because we now just have onE vIOMMU, all virtual interrupt will be bound
>>> to it. If need to support mult-vIOMMU, we can add device-scope
>>> field(sbdf array or some thing like that) in the structure and specify
>>> what devices should be under one vIOMMU.
>>
>> I am not sure to follow the argument here. Even if you have only one
>> vIOMMU you need to be able to do the correspondence between the virtual
>> MasterID (for PCI it is based on the RID) and the host MasterID.


>       Sorry for later response.
>       MasterID you mentioned here is sbdf, right? Binding between sbdf
> and vsbdf(virtual sbdf) should be in the device pass through related
> interface(e.g, xc_domain_bind_pt_irq_int() has already done such similar
> thing that bind vsbdf with real interrupt of hypervisor.).

The MasterID is not the sbdf. It is an identifier based on the tuple 
(Hostbridge, Requester ID). The RequesterID (RID), might be the bdf of 
the device or something different if there is DMA aliases.

The relation between MasterID and the tuple is defined by the hardware 
and will be reported by the firmware tables.

>       vIOMMU device model can get vsbdf when guest configure vIOMMU entry
> and hypervisor can do conversion between sbdf and vsbdf. For interrupt
> remapping on virtual VTD, we don't find such requirement so far and got
> enough data from IOAPIC/MSI entry and interrupt remapping entry of
> virtual VTD. So we don't extend pass through interface.

Well, you have to think about how this could be extended in the future.
This is quite important to plan ahead for a stable ABI. Thankfully, you
seem to use a DOMCTL, so I guess we don't have to worry too much...

Cheers,
lan,Tianyu July 12, 2017, 11:44 a.m. UTC | #11
On 2017年07月12日 15:26, Julien Grall wrote:
> Hi,
> 
> On 07/12/2017 04:09 AM, Lan Tianyu wrote:
>> On 2017年07月08日 00:08, Julien Grall wrote:
>>>> Because we now just have onE vIOMMU, all virtual interrupt will be
>>>> bound
>>>> to it. If need to support mult-vIOMMU, we can add device-scope
>>>> field(sbdf array or some thing like that) in the structure and specify
>>>> what devices should be under one vIOMMU.
>>>
>>> I am not sure to follow the argument here. Even if you have only one
>>> vIOMMU you need to be able to do the correspondence between the virtual
>>> MasterID (for PCI it is based on the RID) and the host MasterID.
> 
> 
>>       Sorry for later response.
>>       MasterID you mentioned here is sbdf, right? Binding between sbdf
>> and vsbdf(virtual sbdf) should be in the device pass through related
>> interface(e.g, xc_domain_bind_pt_irq_int() has already done such similar
>> thing that bind vsbdf with real interrupt of hypervisor.).
> 
> The MasterID is not the sbdf. It is an identifier based on the tuple
> (Hostbridge, Requester ID). The RequesterID (RID), might be the bdf of
> the device or something different if there is DMA aliases.
> 
> The relation between MasterID and the tuple is defined by the hardware
> and will be reported by the firmware tables.

OK. This seems ARM specific, right? From my view, binding the virtual
MasterID with the physical one should still be done in the pass-through
domctl, and we may store the relationship in the hypervisor at that
point, e.g. in a small lookup table like the sketch below.
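
As an illustration only (the structure and field names are invented), the
hypervisor could record that relationship in a small per-domain table:

    struct viommu_masterid_map {
        uint32_t virt_masterid;   /* MasterID as seen by the guest */
        uint32_t host_masterid;   /* MasterID reported by firmware for the device */
    };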

> 
>>       vIOMMU device model can get vsbdf when guest configure vIOMMU entry
>> and hypervisor can do conversion between sbdf and vsbdf. For interrupt
>> remapping on virtual VTD, we don't find such requirement so far and got
>> enough data from IOAPIC/MSI entry and interrupt remapping entry of
>> virtual VTD. So we don't extend pass through interface.
> 
> Well, you have to think how this could be extended in the future. This
> is quite important to plan head for stable ABI.

Sure.

> Thankfully, you seem to
> use DOMCTL, so I guess we don't have to worry too much...

I am trying to make the new vIOMMU DOMCTL general enough for all vendors.
The create/destroy and query-capabilities operations are necessary for
every vendor. I don't have knowledge of other vendors' IOMMUs, and any
suggestions are very much appreciated. Thanks.

Patch

diff --git a/docs/misc/viommu.txt b/docs/misc/viommu.txt
new file mode 100644
index 0000000..76d4cee
--- /dev/null
+++ b/docs/misc/viommu.txt
@@ -0,0 +1,129 @@ 
+Xen virtual IOMMU
+
+Motivation
+==========
+*) Enable more than 255 vcpu support
+HPC cloud services require VMs that provide high-performance parallel
+computing, and we hope to create a huge VM with >255 vcpus on one machine
+to meet such a requirement, pinning each vcpu to a separate pcpu.
+
+To support >255 vcpus, X2APIC mode in the guest is necessary because the
+legacy APIC (XAPIC) only supports an 8-bit APIC ID and thus at most 255
+vcpus. X2APIC mode supports a 32-bit APIC ID, and using it requires the
+interrupt remapping function of the vIOMMU.
+
+The reason for this is that the existing PCI MSI and IOAPIC were not
+modified with the introduction of X2APIC. PCI MSI/IOAPIC can only send
+interrupt messages containing an 8-bit APIC ID, which cannot address >255
+cpus. Interrupt remapping supports a 32-bit APIC ID, so it is necessary
+for enabling >255 cpus with x2apic mode.
+
+
+vIOMMU Architecture
+===================
+The vIOMMU device model is inside the Xen hypervisor for the following reasons:
+    1) Avoid round trips between Qemu and Xen hypervisor
+    2) Ease of integration with the rest of hypervisor
+    3) HVMlite/PVH doesn't use Qemu
+
+* Interrupt remapping overview.
+Interrupts from virtual devices and physical devices are delivered
+to the vLAPIC via the vIOAPIC and vMSI. The vIOMMU needs to remap
+interrupts during this procedure.
+
++---------------------------------------------------+
+|Qemu                       |VM                     |
+|                           | +----------------+    |
+|                           | |  Device driver |    |
+|                           | +--------+-------+    |
+|                           |          ^            |
+|       +----------------+  | +--------+-------+    |
+|       | Virtual device |  | |  IRQ subsystem |    |
+|       +-------+--------+  | +--------+-------+    |
+|               |           |          ^            |
+|               |           |          |            |
++---------------------------+-----------------------+
+|hypervisor     |                      | VIRQ       |
+|               |            +---------+--------+   |
+|               |            |      vLAPIC      |   |
+|               |VIRQ        +---------+--------+   |
+|               |                      ^            |
+|               |                      |            |
+|               |            +---------+--------+   |
+|               |            |      vIOMMU      |   |
+|               |            +---------+--------+   |
+|               |                      ^            |
+|               |                      |            |
+|               |            +---------+--------+   |
+|               |            |   vIOAPIC/vMSI   |   |
+|               |            +----+----+--------+   |
+|               |                 ^    ^            |
+|               +-----------------+    |            |
+|                                      |            |
++---------------------------------------------------+
+HW                                     |IRQ
+                                +-------------------+
+                                |   PCI Device      |
+                                +-------------------+
+
+
+vIOMMU hypercall
+================
+Introduce a new domctl hypercall "xen_domctl_viommu_op" to create/destroy a
+vIOMMU and query the vIOMMU capabilities that the device model can support.
+
+* vIOMMU hypercall parameter structure
+struct xen_domctl_viommu_op {
+    uint32_t cmd;
+#define XEN_DOMCTL_create_viommu          0
+#define XEN_DOMCTL_destroy_viommu         1
+#define XEN_DOMCTL_query_viommu_caps      2
+    union {
+        struct {
+            /* IN - vIOMMU type */
+            uint64_t viommu_type;
+            /* IN - MMIO base address of vIOMMU. */
+            uint64_t base_address;
+            /* IN - Length of MMIO region */
+            uint64_t length;
+            /* IN - Capabilities with which we want to create */
+            uint64_t capabilities;
+            /* OUT - vIOMMU identity */
+            uint32_t viommu_id;
+        } create_viommu;
+
+        struct {
+            /* IN - vIOMMU identity */
+            uint32_t viommu_id;
+        } destroy_viommu;
+
+        struct {
+            /* IN - vIOMMU type */
+            uint64_t viommu_type;
+            /* OUT - vIOMMU Capabilities */
+            uint64_t caps;
+        } query_caps;
+    } u;
+};
+
+- XEN_DOMCTL_query_viommu_caps
+    Query capabilities of vIOMMU device model. vIOMMU_type specifies
+which vendor vIOMMU device model(E,G Intel VTD) is targeted and hypervisor
+returns capability bits(E,G interrupt remapping bit).
+
+- XEN_DOMCTL_create_viommu
+    Create a vIOMMU device with the given vIOMMU_type, capabilities, MMIO
+base address and length. The hypervisor returns viommu_id. The capabilities
+must be a subset of the value returned by the query_viommu_caps hypercall.
+
+- XEN_DOMCTL_destroy_viommu
+    Destroy the vIOMMU in the Xen hypervisor identified by viommu_id.
+
+xl vIOMMU configuration
+=======================
+viommu="type=vtd,intremap=1,x2apic=1"
+
+"type" - Specify vIOMMU device model type. Currently only supports Intel vtd
+device model.
+"intremap" - Enable vIOMMU interrupt remapping function.
+"x2apic" - Support x2apic mode with interrupt remapping function.