
[v4,15/32] vfio: introduce KVM-owned IOMMU type

Message ID: 20220314194451.58266-16-mjrosato@linux.ibm.com
State: New, archived
Series: KVM: s390: enable zPCI for interpretive execution

Commit Message

Matthew Rosato March 14, 2022, 7:44 p.m. UTC
s390x will introduce a new IOMMU domain type where the mappings are
managed by KVM rather than in response to userspace mapping ioctls.  Allow
for specifying this type on the VFIO_SET_IOMMU ioctl and triggering the
appropriate iommu interface for overriding the default domain.

Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 drivers/vfio/vfio_iommu_type1.c | 12 +++++++++++-
 include/uapi/linux/vfio.h       |  6 ++++++
 2 files changed, 17 insertions(+), 1 deletion(-)
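
For context, a minimal userspace sketch of opting into the new type (illustrative only; VFIO_KVM_IOMMU is the constant added by this patch, the container/group ioctls are the standard VFIO ones, and error handling is omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Illustrative flow: bind a group to a container, then select the
 * KVM-owned IOMMU type instead of TYPE1.  With this type selected,
 * mappings come from KVM rather than VFIO_IOMMU_MAP_DMA.
 */
int set_kvm_owned_iommu(int container, int group)
{
	if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container))
		return -1;

	/* the patch advertises the new type via CHECK_EXTENSION */
	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_KVM_IOMMU) <= 0)
		return -1;

	return ioctl(container, VFIO_SET_IOMMU, VFIO_KVM_IOMMU);
}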

Comments

Jason Gunthorpe March 14, 2022, 9:38 p.m. UTC | #1
On Mon, Mar 14, 2022 at 03:44:34PM -0400, Matthew Rosato wrote:

> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 9394aa9444c1..0bec97077d61 100644
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -77,6 +77,7 @@ struct vfio_iommu {
>  	bool			nesting;
>  	bool			dirty_page_tracking;
>  	bool			container_open;
> +	bool			kvm;
>  	struct list_head	emulated_iommu_groups;
>  };
>  
> @@ -2203,7 +2204,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		goto out_free_group;
>  
>  	ret = -EIO;
> -	domain->domain = iommu_domain_alloc(bus);
> +
> +	if (iommu->kvm)
> +		domain->domain = iommu_domain_alloc_type(bus, IOMMU_DOMAIN_KVM);
> +	else
> +		domain->domain = iommu_domain_alloc(bus);
> +
>  	if (!domain->domain)
>  		goto out_free_domain;
>  
> @@ -2552,6 +2558,9 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	case VFIO_TYPE1v2_IOMMU:
>  		iommu->v2 = true;
>  		break;
> +	case VFIO_KVM_IOMMU:
> +		iommu->kvm = true;
> +		break;

Same remark for this - but more - this is called KVM but it doesn't
accept a kvm FD or anything else to link the domain to the KVM
in-use.

Jason
Alex Williamson March 14, 2022, 10:50 p.m. UTC | #2
On Mon, 14 Mar 2022 15:44:34 -0400
Matthew Rosato <mjrosato@linux.ibm.com> wrote:

> s390x will introduce a new IOMMU domain type where the mappings are
> managed by KVM rather than in response to userspace mapping ioctls.  Allow
> for specifying this type on the VFIO_SET_IOMMU ioctl and triggering the
> appropriate iommu interface for overriding the default domain.
> 
> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 12 +++++++++++-
>  include/uapi/linux/vfio.h       |  6 ++++++
>  2 files changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 9394aa9444c1..0bec97077d61 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -77,6 +77,7 @@ struct vfio_iommu {
>  	bool			nesting;
>  	bool			dirty_page_tracking;
>  	bool			container_open;
> +	bool			kvm;
>  	struct list_head	emulated_iommu_groups;
>  };
>  
> @@ -2203,7 +2204,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		goto out_free_group;
>  
>  	ret = -EIO;
> -	domain->domain = iommu_domain_alloc(bus);
> +
> +	if (iommu->kvm)
> +		domain->domain = iommu_domain_alloc_type(bus, IOMMU_DOMAIN_KVM);
> +	else
> +		domain->domain = iommu_domain_alloc(bus);
> +
>  	if (!domain->domain)
>  		goto out_free_domain;
>  
> @@ -2552,6 +2558,9 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	case VFIO_TYPE1v2_IOMMU:
>  		iommu->v2 = true;
>  		break;
> +	case VFIO_KVM_IOMMU:
> +		iommu->kvm = true;
> +		break;
>  	default:
>  		kfree(iommu);
>  		return ERR_PTR(-EINVAL);
> @@ -2637,6 +2646,7 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
>  	case VFIO_TYPE1_NESTING_IOMMU:
>  	case VFIO_UNMAP_ALL:
>  	case VFIO_UPDATE_VADDR:
> +	case VFIO_KVM_IOMMU:
>  		return 1;
>  	case VFIO_DMA_CC_IOMMU:
>  		if (!iommu)
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ef33ea002b0b..666edb6957ac 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -52,6 +52,12 @@
>  /* Supports the vaddr flag for DMA map and unmap */
>  #define VFIO_UPDATE_VADDR		10
>  
> +/*
> + * The KVM_IOMMU type implies that the hypervisor will control the mappings
> + * rather than userspace
> + */
> +#define VFIO_KVM_IOMMU			11

Then why is this hosted in the type1 code that exposes a wide variety
of userspace interfaces?  Thanks,

Alex
Jason Gunthorpe March 14, 2022, 11:18 p.m. UTC | #3
On Mon, Mar 14, 2022 at 04:50:33PM -0600, Alex Williamson wrote:

> > +/*
> > + * The KVM_IOMMU type implies that the hypervisor will control the mappings
> > + * rather than userspace
> > + */
> > +#define VFIO_KVM_IOMMU			11
> 
> Then why is this hosted in the type1 code that exposes a wide variety
> of userspace interfaces?  Thanks,

It is really badly named, this is the root level of a 2 stage nested
IO page table, and this approach needed a special flag to distinguish
the setup from the normal iommu_domain.

If we do try to stick this into VFIO it should probably use the
VFIO_TYPE1_NESTING_IOMMU instead - however, we would like to delete
that flag entirely as it was never fully implemented, was never used,
and isn't part of what we are proposing for IOMMU nesting on ARM
anyhow. (So far I've found nobody to explain what the plan here was..)

This is why I said the second level should be an explicit iommu_domain
all on its own that is explicitly coupled to the KVM to read the page
tables, if necessary.

But I'm not sure that reading the userspace io page tables with KVM is
even the best thing to do - the iommu driver already has the pinned
memory, it would be faster and more modular to traverse the io page
tables through the pfns in the root iommu_domain than by having KVM do
the translations. Lets see what Matthew says..

Jason
Tian, Kevin March 15, 2022, 7:57 a.m. UTC | #4
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, March 15, 2022 7:18 AM
> 
> On Mon, Mar 14, 2022 at 04:50:33PM -0600, Alex Williamson wrote:
> 
> > > +/*
> > > + * The KVM_IOMMU type implies that the hypervisor will control the
> mappings
> > > + * rather than userspace
> > > + */
> > > +#define VFIO_KVM_IOMMU			11
> >
> > Then why is this hosted in the type1 code that exposes a wide variety
> > of userspace interfaces?  Thanks,
> 
> It is really badly named, this is the root level of a 2 stage nested
> IO page table, and this approach needed a special flag to distinguish
> the setup from the normal iommu_domain.
> 
> If we do try to stick this into VFIO it should probably use the
> VFIO_TYPE1_NESTING_IOMMU instead - however, we would like to delete
> that flag entirely as it was never fully implemented, was never used,
> and isn't part of what we are proposing for IOMMU nesting on ARM
> anyhow. (So far I've found nobody to explain what the plan here was..)
> 
> This is why I said the second level should be an explicit iommu_domain
> all on its own that is explicitly coupled to the KVM to read the page
> tables, if necessary.
> 
> But I'm not sure that reading the userspace io page tables with KVM is
> even the best thing to do - the iommu driver already has the pinned
> memory, it would be faster and more modular to traverse the io page
> tables through the pfns in the root iommu_domain than by having KVM do
> the translations. Lets see what Matthew says..
> 

Reading this thread it's sort of like an optimization to software nesting.
If that is the case does it make more sense to complete the basic form
of software nesting first and then adds this optimization?

The basic form would allow the userspace to create a special domain
type which points to a user/guest page table (like hardware nesting)
but doesn't install the user page table to the IOMMU hardware (unlike
hardware nesting). When receiving invalidate cmd from userspace 
the iommu driver walks the user page table (1st-level) and the parent 
page table (2nd-level) to generate a shadow mapping for the 
invalidated range in the non-nested hardware page table of this
special domain type.

Once that works what this series does just changes the matter of
how the invalidate cmd is triggered. Previously iommu driver receives
invalidate cmd from Qemu (via iommufd uAPI) while now receiving
the cmd from kvm (via iommufd kAPI) upon interception of RPCIT.
From this angle once the connection between iommufd and kvm fd 
is established there is even no direct talk between iommu driver and
kvm. 

Thanks
Kevin
Matthew Rosato March 15, 2022, 1:36 p.m. UTC | #5
On 3/14/22 7:18 PM, Jason Gunthorpe wrote:
> On Mon, Mar 14, 2022 at 04:50:33PM -0600, Alex Williamson wrote:
> 
>>> +/*
>>> + * The KVM_IOMMU type implies that the hypervisor will control the mappings
>>> + * rather than userspace
>>> + */
>>> +#define VFIO_KVM_IOMMU			11
>>
>> Then why is this hosted in the type1 code that exposes a wide variety
>> of userspace interfaces?  Thanks,
> 
> It is really badly named, this is the root level of a 2 stage nested
> IO page table, and this approach needed a special flag to distinguish
> the setup from the normal iommu_domain.

^^ Yes, this.

> 
> If we do try to stick this into VFIO it should probably use the
> VFIO_TYPE1_NESTING_IOMMU instead - however, we would like to delete
> that flag entirely as it was never fully implemented, was never used,
> and isn't part of what we are proposing for IOMMU nesting on ARM
> anyhow. (So far I've found nobody to explain what the plan here was..)
> 

I'm open to suggestions on how better to tie this into vfio.  The 
scenario basically plays out that:

1) the iommu will be domain_alloc'd once VFIO_SET_IOMMU is issued -- so 
at that time (or earlier) we have to make the decision on whether to use 
the standard IOMMU or this alternate KVM/nested IOMMU.

2) At the time VFIO_SET_IOMMU is received, we have not yet associated 
the vfio group with a KVM, so we can't (today) use this as an indicator 
to guess which IOMMU strategy to use.

3) Ideally, even if we changed QEMU vfio to make the KVM association 
earlier, it would be nice to still be able to indicate that we want to 
use the standard iommu/type1 despite a KVM association existing (e.g. 
backwards compatibility with older QEMU that lacks 'interpretation' 
support, nested virtualization scenarios).

> This is why I said the second level should be an explicit iommu_domain
> all on its own that is explicitly coupled to the KVM to read the page
> tables, if necessary.

Maybe I misunderstood this.  Are you proposing 2 layers of IOMMU that
interact with each other within host kernel space?

A second level runs the guest tables, pins the appropriate pieces from 
the guest to get the resulting phys_addr(s) which are then passed via 
iommu to a first level via map (or unmap)?

> 
> But I'm not sure that reading the userspace io page tables with KVM is
> even the best thing to do - the iommu driver already has the pinned
> memory, it would be faster and more modular to traverse the io page
> tables through the pfns in the root iommu_domain than by having KVM do
> the translations. Lets see what Matthew says..

OK, you lost me a bit here.  And this may be associated with the above.

So, what the current implementation is doing is reading the guest DMA 
tables (which we must pin the first time we access them) and then map 
the PTEs of the associated guest DMA entries into the associated host 
DMA table (so, again pin and place the address, or unpin and 
invalidate).  Basically we are shadowing the first level DMA table as a 
copy of the second level DMA table with the host address(es) of the 
pinned guest page(s).

I'm unclear where you are proposing the pinning be done if not by the 
iommu domain traversing the tables to perform the 'shadow' operation.
Matthew Rosato March 15, 2022, 1:49 p.m. UTC | #6
On 3/14/22 5:38 PM, Jason Gunthorpe wrote:
> On Mon, Mar 14, 2022 at 03:44:34PM -0400, Matthew Rosato wrote:
> 
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 9394aa9444c1..0bec97077d61 100644
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -77,6 +77,7 @@ struct vfio_iommu {
>>   	bool			nesting;
>>   	bool			dirty_page_tracking;
>>   	bool			container_open;
>> +	bool			kvm;
>>   	struct list_head	emulated_iommu_groups;
>>   };
>>   
>> @@ -2203,7 +2204,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>   		goto out_free_group;
>>   
>>   	ret = -EIO;
>> -	domain->domain = iommu_domain_alloc(bus);
>> +
>> +	if (iommu->kvm)
>> +		domain->domain = iommu_domain_alloc_type(bus, IOMMU_DOMAIN_KVM);
>> +	else
>> +		domain->domain = iommu_domain_alloc(bus);
>> +
>>   	if (!domain->domain)
>>   		goto out_free_domain;
>>   
>> @@ -2552,6 +2558,9 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>>   	case VFIO_TYPE1v2_IOMMU:
>>   		iommu->v2 = true;
>>   		break;
>> +	case VFIO_KVM_IOMMU:
>> +		iommu->kvm = true;
>> +		break;
> 
> Same remark for this - but more - this is called KVM but it doesn't
> accept a kvm FD or anything else to link the domain to the KVM
> in-use.

Right...  The name is poor, but with the current design the KVM 
association comes shortly after.  To summarize, with this series, the 
following relevant steps occur:

1) VFIO_SET_IOMMU: Indicate we wish to use the alternate IOMMU domain
	-> At this point, the IOMMU will reject any maps (no KVM, no guest 
table anchor)
2) KVM ioctl "start":
	-> Register the KVM with the IOMMU domain
	-> At this point, IOMMU will still reject any maps (no guest table anchor)
3) KVM ioctl "register ioat"
	-> Register the guest DMA table head with the IOMMU domain
	-> now IOMMU maps are allowed

The rationale for splitting steps 1 and 2 are that VFIO_SET_IOMMU 
doesn't have a mechanism for specifying more than the type as an arg, 
no?  Otherwise yes, you could specify a kvm fd at this point and it 
would have some other advantages (e.g. skip notifier).  But we still 
can't use the IOMMU for mapping until step 3.
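
Purely to illustrate that gap (nothing like this exists in VFIO or in this series), carrying a kvm fd at SET_IOMMU time would need an argument structure along these lines:

#include <linux/types.h>

/* Hypothetical uAPI sketch only: VFIO_SET_IOMMU takes just the type
 * today, so associating a KVM at this point would need something like
 * this (invented names, not a proposal).
 */
struct vfio_set_iommu_kvm {
	__u32	argsz;
	__u32	flags;
	__u32	iommu_type;	/* e.g. VFIO_KVM_IOMMU */
	__s32	kvm_fd;		/* KVM instance to couple the domain to */
};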

The rationale for splitting steps 2 and 3 is twofold:  1) during init, 
we simply don't know where the guest anchor will be when we allocate the 
domain and 2) because the guest can technically clear and re-initialize 
its DMA space during the life of the guest, moving the location of the 
table anchor.  We would receive another ioctl operation to unregister 
the guest table anchor and again reject any map operation until a new 
table location is provided.
Matthew Rosato March 15, 2022, 2:17 p.m. UTC | #7
On 3/15/22 3:57 AM, Tian, Kevin wrote:
>> From: Jason Gunthorpe <jgg@nvidia.com>
>> Sent: Tuesday, March 15, 2022 7:18 AM
>>
>> On Mon, Mar 14, 2022 at 04:50:33PM -0600, Alex Williamson wrote:
>>
>>>> +/*
>>>> + * The KVM_IOMMU type implies that the hypervisor will control the
>> mappings
>>>> + * rather than userspace
>>>> + */
>>>> +#define VFIO_KVM_IOMMU			11
>>>
>>> Then why is this hosted in the type1 code that exposes a wide variety
>>> of userspace interfaces?  Thanks,
>>
>> It is really badly named, this is the root level of a 2 stage nested
>> IO page table, and this approach needed a special flag to distinguish
>> the setup from the normal iommu_domain.
>>
>> If we do try to stick this into VFIO it should probably use the
>> VFIO_TYPE1_NESTING_IOMMU instead - however, we would like to delete
>> that flag entirely as it was never fully implemented, was never used,
>> and isn't part of what we are proposing for IOMMU nesting on ARM
>> anyhow. (So far I've found nobody to explain what the plan here was..)
>>
>> This is why I said the second level should be an explicit iommu_domain
>> all on its own that is explicitly coupled to the KVM to read the page
>> tables, if necessary.
>>
>> But I'm not sure that reading the userspace io page tables with KVM is
>> even the best thing to do - the iommu driver already has the pinned
>> memory, it would be faster and more modular to traverse the io page
>> tables through the pfns in the root iommu_domain than by having KVM do
>> the translations. Lets see what Matthew says..
>>
> 
> Reading this thread it's sort of like an optimization to software nesting.

Yes, we want to avoid breaking to userspace for a very frequent 
operation (RPCIT / updating shadow mappings)

> If that is the case does it make more sense to complete the basic form
> of software nesting first and then adds this optimization?
> 
> The basic form would allow the userspace to create a special domain
> type which points to a user/guest page table (like hardware nesting)
> but doesn't install the user page table to the IOMMU hardware (unlike
> hardware nesting). When receiving invalidate cmd from userspace
> the iommu driver walks the user page table (1st-level) and the parent
> page table (2nd-level) to generate a shadow mapping for the
> invalidated range in the non-nested hardware page table of this
> special domain type.
> 
> Once that works what this series does just changes the matter of
> how the invalidate cmd is triggered. Previously iommu driver receives
> invalidate cmd from Qemu (via iommufd uAPI) while now receiving
> the cmd from kvm (via iommufd kAPI) upon interception of RPCIT.
>  From this angle once the connection between iommufd and kvm fd
> is established there is even no direct talk between iommu driver and
> kvm.

But something somewhere still needs to be responsible for 
pinning/unpinning of the guest table entries upon each RPCIT 
interception.  e.g. the RPCIT intercept can happen because the guest 
wants to invalidate some old mappings or has generated some new mappings 
over a range, so we must shadow the new mappings (by pinning the guest 
entries and placing them in the host hardware table / unpinning 
invalidated ones and clearing their entry in the host hardware table).
Jason Gunthorpe March 15, 2022, 2:38 p.m. UTC | #8
On Tue, Mar 15, 2022 at 09:49:01AM -0400, Matthew Rosato wrote:

> The rationale for splitting steps 1 and 2 are that VFIO_SET_IOMMU doesn't
> have a mechanism for specifying more than the type as an arg, no?  Otherwise
> yes, you could specify a kvm fd at this point and it would have some other
> advantages (e.g. skip notifier).  But we still can't use the IOMMU for
> mapping until step 3.

Stuff like this is why I'd be much happier if this could join our
iommfd project so we can have clean modeling of the multiple iommu_domains.

Jason
Jason Gunthorpe March 15, 2022, 2:55 p.m. UTC | #9
On Tue, Mar 15, 2022 at 09:36:08AM -0400, Matthew Rosato wrote:
> > If we do try to stick this into VFIO it should probably use the
> > VFIO_TYPE1_NESTING_IOMMU instead - however, we would like to delete
> > that flag entirely as it was never fully implemented, was never used,
> > and isn't part of what we are proposing for IOMMU nesting on ARM
> > anyhow. (So far I've found nobody to explain what the plan here was..)
> > 
> 
> I'm open to suggestions on how better to tie this into vfio.  The scenario
> basically plays out that:

Ideally I would like it to follow the same 'user space page table'
design that Eric and Kevin are working on for HW iommu.

You have a 1st iommu_domain that maps and pins the entire guest physical
address space.

You have a nested iommu_domain that represents the user page table
(the ioat in your language I think)

When the guest says it wants to set a user page table then you create
the nested iommu_domain representing that user page table and pass in
the anchor (guest address of the root IOPTE) to the kernel to do the
work.

The rule for all other HW's is that the user space page table is
translated by the top level kernel page table. So when you traverse it
you fetch the CPU page storing the guest's IOPTE by doing an IOVA
translation through the first level page table - not through KVM.

Since the first level page table and the KVM GPA should be 1:1 this is
an equivalent operation.

> 1) the iommu will be domain_alloc'd once VFIO_SET_IOMMU is issued -- so at
> that time (or earlier) we have to make the decision on whether to use the
> standard IOMMU or this alternate KVM/nested IOMMU.

So in terms of iommufd I would see it this would be an iommufd 'create
a device specific iomm_domain' IOCTL and you can pass in a S390
specific data blob to make it into this special mode.

> > This is why I said the second level should be an explicit iommu_domain
> > all on its own that is explicitly coupled to the KVM to read the page
> > tables, if necessary.
> 
> Maybe I misunderstood this.  Are you proposing 2 layers of IOMMU that
> interact with each other within host kernel space?
> 
> A second level runs the guest tables, pins the appropriate pieces from the
> guest to get the resulting phys_addr(s) which are then passed via iommu to a
> first level via map (or unmap)?


The first level iommu_domain has the 'type1' map and unmap and pins
the pages. This is the 1:1 map with the GPA and ends up pinning all
guest memory because the point is you don't want to take a memory pin
on your performance path

The second level iommu_domain points to a single IO page table in GPA
and is created/destroyed whenever the guest traps to the hypervisor to
manipulate the anchor (ie the GPA of the guest IO page table).

> > But I'm not sure that reading the userspace io page tables with KVM is
> > even the best thing to do - the iommu driver already has the pinned
> > memory, it would be faster and more modular to traverse the io page
> > tables through the pfns in the root iommu_domain than by having KVM do
> > the translations. Lets see what Matthew says..
> 
> OK, you lost me a bit here.  And this may be associated with the above.
> 
> So, what the current implementation is doing is reading the guest DMA tables
> (which we must pin the first time we access them) and then map the PTEs of
> the associated guest DMA entries into the associated host DMA table (so,
> again pin and place the address, or unpin and invalidate).  Basically we are
> shadowing the first level DMA table as a copy of the second level DMA table
> with the host address(es) of the pinned guest page(s).

You can't pin/unpin in this path, there is no real way to handle error
and ulimit stuff here, plus it is really slow. I didn't notice any of
this in your patches, so what do you mean by 'pin' above?

To be like other IOMMU nesting drivers the pages should already be
pinned and stored in the 1st iommu_domain, lets say in an xarray. This
xarray is populated by type1 map/unmap system calls like any
iommu_domain.

A nested iommu_domain should create the real HW IO page table and
associate it with the real HW IOMMU and record the parent 1st level iommu_domain.

When you do the shadowing you use the xarray of the 1st level
iommu_domain to translate from GPA to host physical and there is no
pinning/etc involved. After walking the guest table and learning the
final vIOVA it is translated through the xarray to a CPU physical and
then programmed into the real HW IO page table.
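
As a rough sketch of that walk (every name below is illustrative, assuming the 1st level domain keeps its pinned pages in an xarray keyed by GPA page frame):

#include <linux/iommu.h>
#include <linux/xarray.h>

/* Illustrative types only, not existing kernel structures */
struct parent_domain {
	struct xarray pinned;		/* GPA pfn -> pinned struct page */
};

struct nested_domain {
	struct parent_domain *parent;	/* 1st level: GPA -> host physical */
	struct iommu_domain *hw;	/* real HW IO page table */
};

/* Shadow one guest IOPTE: translate its GPA through the parent's
 * pinned pages and program the HW IO page table; KVM is not involved.
 */
static int shadow_one_iopte(struct nested_domain *nd, dma_addr_t viova,
			    u64 guest_iopte)
{
	unsigned long gpa_pfn = (guest_iopte & PAGE_MASK) >> PAGE_SHIFT;
	struct page *page = xa_load(&nd->parent->pinned, gpa_pfn);

	if (!page)
		return -EFAULT;	/* guest referenced an unmapped GPA */

	return iommu_map(nd->hw, viova, page_to_phys(page), PAGE_SIZE,
			 IOMMU_READ | IOMMU_WRITE);
}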

There is no reason to use KVM to do any of this, and is actively wrong
to place CPU pages from KVM into an IOPTE that did not come through
the type1 map/unmap calls that do all the proper validation and
accounting.

Jason
Matthew Rosato March 15, 2022, 4:04 p.m. UTC | #10
On 3/15/22 10:55 AM, Jason Gunthorpe wrote:
> On Tue, Mar 15, 2022 at 09:36:08AM -0400, Matthew Rosato wrote:
>>> If we do try to stick this into VFIO it should probably use the
>>> VFIO_TYPE1_NESTING_IOMMU instead - however, we would like to delete
>>> that flag entirely as it was never fully implemented, was never used,
>>> and isn't part of what we are proposing for IOMMU nesting on ARM
>>> anyhow. (So far I've found nobody to explain what the plan here was..)
>>>
>>
>> I'm open to suggestions on how better to tie this into vfio.  The scenario
>> basically plays out that:
> 
> Ideally I would like it to follow the same 'user space page table'
> design that Eric and Kevin are working on for HW iommu.

'[RFC v16 0/9] SMMUv3 Nested Stage Setup (IOMMU part)' ??

https://lore.kernel.org/linux-iommu/20211027104428.1059740-1-eric.auger@redhat.com/

> 
> You have a 1st iommu_domain that maps and pins the entire guest physical
> address space.

Ahh, I see.

@Christian would it be OK to pursue a model that pins all of guest 
memory upfront?

> 
> You have a nested iommu_domain that represents the user page table
> (the ioat in your language I think)

Yes

> 
> When the guest says it wants to set a user page table then you create
> the nested iommu_domain representing that user page table and pass in
> the anchor (guest address of the root IOPTE) to the kernel to do the
> work.
>
> The rule for all other HW's is that the user space page table is
> translated by the top level kernel page table. So when you traverse it
> you fetch the CPU page storing the guest's IOPTE by doing an IOVA
> translation through the first level page table - not through KVM.
> 
> Since the first level page table and the KVM GPA should be 1:1 this is
> an equivalent operation.
> 
>> 1) the iommu will be domain_alloc'd once VFIO_SET_IOMMU is issued -- so at
>> that time (or earlier) we have to make the decision on whether to use the
>> standard IOMMU or this alternate KVM/nested IOMMU.
> 
> So in terms of iommufd I would see it this would be an iommufd 'create
> a device specific iomm_domain' IOCTL and you can pass in a S390
> specific data blob to make it into this special mode.
> 
>>> This is why I said the second level should be an explicit iommu_domain
>>> all on its own that is explicitly coupled to the KVM to read the page
>>> tables, if necessary.
>>
>> Maybe I misunderstood this.  Are you proposing 2 layers of IOMMU that
>> interact with each other within host kernel space?
>>
>> A second level runs the guest tables, pins the appropriate pieces from the
>> guest to get the resulting phys_addr(s) which are then passed via iommu to a
>> first level via map (or unmap)?
> 
> 
> The first level iommu_domain has the 'type1' map and unmap and pins
> the pages. This is the 1:1 map with the GPA and ends up pinning all
> guest memory because the point is you don't want to take a memory pin
> on your performance path
> 
> The second level iommu_domain points to a single IO page table in GPA
> and is created/destroyed whenever the guest traps to the hypervisor to
> manipulate the anchor (ie the GPA of the guest IO page table).
> 

That makes sense, thanks for clarifying.

>>> But I'm not sure that reading the userspace io page tables with KVM is
>>> even the best thing to do - the iommu driver already has the pinned
>>> memory, it would be faster and more modular to traverse the io page
>>> tables through the pfns in the root iommu_domain than by having KVM do
>>> the translations. Lets see what Matthew says..
>>
>> OK, you lost me a bit here.  And this may be associated with the above.
>>
>> So, what the current implementation is doing is reading the guest DMA tables
>> (which we must pin the first time we access them) and then map the PTEs of
>> the associated guest DMA entries into the associated host DMA table (so,
>> again pin and place the address, or unpin and invalidate).  Basically we are
>> shadowing the first level DMA table as a copy of the second level DMA table
>> with the host address(es) of the pinned guest page(s).
> 
> You can't pin/unpin in this path, there is no real way to handle error
> and ulimit stuff here, plus it is really slow. I didn't notice any of
> this in your patches, so what do you mean by 'pin' above?

patch 18 does some symbol_get for gfn_to_page (which will drive 
hva_to_pfn under the covers) and kvm_release_pfn_dirty and uses those 
symbols for pin/unpin.
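
Roughly speaking, the pattern is (a paraphrase for illustration, not the actual patch 18 code; the wrapper names are made up):

#include <linux/kvm_host.h>

/* get_page-based pin/unpin through KVM, as described above */
static struct page *pin_guest_dma_page(struct kvm *kvm, gpa_t gpa)
{
	/* resolves GPA -> host page via KVM's memslots */
	return gfn_to_page(kvm, gpa_to_gfn(gpa));
}

static void unpin_guest_dma_page(struct page *page)
{
	/* marks dirty and drops the reference taken by gfn_to_page() */
	kvm_release_pfn_dirty(page_to_pfn(page));
}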

pin/unpin errors in this series are reliant on the fact that RPCIT is 
architected to include a panic response to the guest of 'mappings failed 
for the specified range, go refresh your tables and make room', thus 
allowing this to work for pageable guests.

Agreed this would be unnecessary if we've already mapped all of guest 
memory via a 1st iommu domain.

> 
> To be like other IOMMU nesting drivers the pages should already be
> pinned and stored in the 1st iommu_domain, lets say in an xarray. This
> xarray is populated by type1 map/unmap system calls like any
> iommu_domain.
> 
> A nested iommu_domain should create the real HW IO page table and
> associate it with the real HW IOMMU and record the parent 1st level iommu_domain.
> 
> When you do the shadowing you use the xarray of the 1st level
> iommu_domain to translate from GPA to host physical and there is no
> pinning/etc involved. After walking the guest table and learning the
> final vIOVA it is translated through the xarray to a CPU physical and
> then programmed into the real HW IO page table.
> 
> There is no reason to use KVM to do any of this, and is actively wrong
> to place CPU pages from KVM into an IOPTE that did not come through
> the type1 map/unmap calls that do all the proper validation and
> accounting.
> 
> Jason
Matthew Rosato March 15, 2022, 4:29 p.m. UTC | #11
On 3/15/22 10:38 AM, Jason Gunthorpe wrote:
> On Tue, Mar 15, 2022 at 09:49:01AM -0400, Matthew Rosato wrote:
> 
>> The rationale for splitting steps 1 and 2 are that VFIO_SET_IOMMU doesn't
>> have a mechanism for specifying more than the type as an arg, no?  Otherwise
>> yes, you could specify a kvm fd at this point and it would have some other
>> advantages (e.g. skip notifier).  But we still can't use the IOMMU for
>> mapping until step 3.
> 
> Stuff like this is why I'd be much happier if this could join our
> iommfd project so we can have clean modeling of the multiple iommu_domains.
> 

I'd certainly be willing to collaborate so feel free to loop me in on 
the discussions; but I got the impression that iommufd is not close to 
ready (maybe I'm wrong?) -- if so I really don't want to completely 
delay this zPCI support behind it as it has a significant benefit for 
kvm guests on s390x :(
Matthew Rosato March 15, 2022, 5:01 p.m. UTC | #12
On 3/15/22 10:17 AM, Matthew Rosato wrote:
> On 3/15/22 3:57 AM, Tian, Kevin wrote:
>>> From: Jason Gunthorpe <jgg@nvidia.com>
>>> Sent: Tuesday, March 15, 2022 7:18 AM
>>>
>>> On Mon, Mar 14, 2022 at 04:50:33PM -0600, Alex Williamson wrote:
>>>
>>>>> +/*
>>>>> + * The KVM_IOMMU type implies that the hypervisor will control the
>>> mappings
>>>>> + * rather than userspace
>>>>> + */
>>>>> +#define VFIO_KVM_IOMMU            11
>>>>
>>>> Then why is this hosted in the type1 code that exposes a wide variety
>>>> of userspace interfaces?  Thanks,
>>>
>>> It is really badly named, this is the root level of a 2 stage nested
>>> IO page table, and this approach needed a special flag to distinguish
>>> the setup from the normal iommu_domain.
>>>
>>> If we do try to stick this into VFIO it should probably use the
>>> VFIO_TYPE1_NESTING_IOMMU instead - however, we would like to delete
>>> that flag entirely as it was never fully implemented, was never used,
>>> and isn't part of what we are proposing for IOMMU nesting on ARM
>>> anyhow. (So far I've found nobody to explain what the plan here was..)
>>>
>>> This is why I said the second level should be an explicit iommu_domain
>>> all on its own that is explicitly coupled to the KVM to read the page
>>> tables, if necessary.
>>>
>>> But I'm not sure that reading the userspace io page tables with KVM is
>>> even the best thing to do - the iommu driver already has the pinned
>>> memory, it would be faster and more modular to traverse the io page
>>> tables through the pfns in the root iommu_domain than by having KVM do
>>> the translations. Lets see what Matthew says..
>>>
>>
>> Reading this thread it's sort of like an optimization to software 
>> nesting.
> 
> Yes, we want to avoid breaking to userspace for a very frequent 
> operation (RPCIT / updating shadow mappings)
> 
>> If that is the case does it make more sense to complete the basic form
>> of software nesting first and then adds this optimization?
>>
>> The basic form would allow the userspace to create a special domain
>> type which points to a user/guest page table (like hardware nesting)
>> but doesn't install the user page table to the IOMMU hardware (unlike
>> hardware nesting). When receiving invalidate cmd from userspace
>> the iommu driver walks the user page table (1st-level) and the parent
>> page table (2nd-level) to generate a shadow mapping for the
>> invalidated range in the non-nested hardware page table of this
>> special domain type.
>>
>> Once that works what this series does just changes the matter of
>> how the invalidate cmd is triggered. Previously iommu driver receives
>> invalidate cmd from Qemu (via iommufd uAPI) while now receiving
>> the cmd from kvm (via iommufd kAPI) upon interception of RPCIT.
>>  From this angle once the connection between iommufd and kvm fd
>> is established there is even no direct talk between iommu driver and
>> kvm.
> 
> But something somewhere still needs to be responsible for 
> pinning/unpinning of the guest table entries upon each RPCIT 
> interception.  e.g. the RPCIT intercept can happen because the guest 
> wants to invalidate some old mappings or has generated some new mappings 
> over a range, so we must shadow the new mappings (by pinning the guest 
> entries and placing them in the host hardware table / unpinning 
> invalidated ones and clearing their entry in the host hardware table).
> 

OK, this got clarified by Jason in another thread: What I was missing 
here was an assumption that the 1st-level has already mapped and pinned 
all of guest physical address space; in that case there's no need to 
invoke pin/unpin operations against a kvm from within the iommu domain 
(this series as-is does not pin all of the guest physical address space; 
it pins/unpins on-demand at RPCIT time)
Jason Gunthorpe March 15, 2022, 5:18 p.m. UTC | #13
On Tue, Mar 15, 2022 at 12:04:35PM -0400, Matthew Rosato wrote:

> > You can't pin/unpin in this path, there is no real way to handle error
> > and ulimit stuff here, plus it is really slow. I didn't notice any of
> > this in your patches, so what do you mean by 'pin' above?
> 
> patch 18 does some symbol_get for gfn_to_page (which will drive hva_to_pfn
> under the covers) and kvm_release_pfn_dirty and uses those symbols for
> pin/unpin.

To be very clear, this is quite wrong.

It does not account for the memory pinned by the guest and a hostile
VM could pin more memory than the KVM process is allowed to - which is
a security hole.

It also uses the wrong kind of pin, DMA pinned pages must be
pin_user_page'd not get_page'd and undone with unpin_user_page() and
not put_page(). This allows the guest to pin ZONE_MOVABLE memory and
other things which cannot be DMA'd to which will break the hypervisor
kernel. See David Hildenbrand's various series on COW bugs for an
overview why this is really security bad.
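
For contrast, the kind of pin being asked for looks roughly like this (the wrappers are illustrative; pin_user_pages_fast() and unpin_user_pages_dirty_lock() are the real helpers):

#include <linux/mm.h>

/* Long-term DMA pin of a single user page, the way vfio/iommu paths
 * are expected to do it, as opposed to gfn_to_page()/put_page().
 */
static int dma_pin_one(unsigned long vaddr, struct page **page)
{
	int ret = pin_user_pages_fast(vaddr, 1,
				      FOLL_WRITE | FOLL_LONGTERM, page);

	if (ret < 0)
		return ret;
	return ret == 1 ? 0 : -EFAULT;
}

static void dma_unpin_one(struct page *page, bool dirty)
{
	/* pairs with pin_user_pages_fast(), optionally marks the page dirty */
	unpin_user_pages_dirty_lock(&page, 1, dirty);
}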

If you want to do dynamic pinning that is a different thing and
requires more CPU work in the shadowing operations. The modeling would
be similar except that the 1st stage iommu_domain would be this
'shared with KVM' domain people have been talking about - ie the page
table is not set with type 1 map/unmap but follows the KVM page table and
here it would be appropriate to use gfn_to_page/etc to access it.

However, if you do that then you do still have to take care of the
ulimit checks and you must teach kvm to use unpin_user_page/_dirty()
and related to be correct. This looks like a pretty big deal.

My advice is to start with the fully pinned case I described and
consider a KVM approach down the road.

[Also, I'm quite excited by this series you have, I think it shows
exactly how to fix POWER to work within the modern iommu framework,
they have the same basic problem]

Jason
Jason Gunthorpe March 15, 2022, 5:25 p.m. UTC | #14
On Tue, Mar 15, 2022 at 12:29:02PM -0400, Matthew Rosato wrote:
> On 3/15/22 10:38 AM, Jason Gunthorpe wrote:
> > On Tue, Mar 15, 2022 at 09:49:01AM -0400, Matthew Rosato wrote:
> > 
> > > The rationale for splitting steps 1 and 2 are that VFIO_SET_IOMMU doesn't
> > > have a mechanism for specifying more than the type as an arg, no?  Otherwise
> > > yes, you could specify a kvm fd at this point and it would have some other
> > > advantages (e.g. skip notifier).  But we still can't use the IOMMU for
> > > mapping until step 3.
> > 
> > Stuff like this is why I'd be much happier if this could join our
> > iommfd project so we can have clean modeling of the multiple iommu_domains.
> > 
> 
> I'd certainly be willing to collaborate so feel free to loop me in on the
> discussions; 

Sure, I have you on my list. I've been waiting for Eric to get a bit
further along on his ARM work so you have something appropriate to
look at.

In the mean time you can certainly work out the driver details as
you've been doing here and hacking through VFIO. The iommu_domain
logic is the big work item in this series, not the integration with
the uAPI.

> but I got the impression that iommufd is not close to ready (maybe
> I'm wrong?)

Well, quite alot has been done already and I think we are getting
close to having something that can start moving forward, but yes it
will not be "tomorrow".

> -- if so I really don't want to completely delay this zPCI support
> behind it as it has a significant benefit for kvm guests on s390x :(

Every platform vendor is trying to do this exact same performance
optimization. s390 is doing it a bit differently, but as we've seen, not
fundamentally so compared to ARM and Intel IOMMUs with HW nesting.

I'm not sure that s390 should jump the queue and scatter hacky uAPIs all
over the place. The ARM platform has been working on this for literally
years now.

Jason
Matthew Rosato March 17, 2022, 6:51 p.m. UTC | #15
On 3/15/22 1:25 PM, Jason Gunthorpe wrote:
> On Tue, Mar 15, 2022 at 12:29:02PM -0400, Matthew Rosato wrote:
>> On 3/15/22 10:38 AM, Jason Gunthorpe wrote:
>>> On Tue, Mar 15, 2022 at 09:49:01AM -0400, Matthew Rosato wrote:
>>>
>>>> The rationale for splitting steps 1 and 2 are that VFIO_SET_IOMMU doesn't
>>>> have a mechanism for specifying more than the type as an arg, no?  Otherwise
>>>> yes, you could specify a kvm fd at this point and it would have some other
>>>> advantages (e.g. skip notifier).  But we still can't use the IOMMU for
>>>> mapping until step 3.
>>>
>>> Stuff like this is why I'd be much happier if this could join our
>>> iommfd project so we can have clean modeling of the multiple iommu_domains.
>>>
>>
>> I'd certainly be willing to collaborate so feel free to loop me in on the
>> discussions;
> 
> Sure, I have you on my list. I've been waiting for Eric to get a bit
> further along on his ARM work so you have something appropriate to
> look at.
> 
> In the mean time you can certainly work out the driver details as
> you've been doing here and hacking through VFIO. The iommu_domain
> logic is the big work item in this series, not the integration with
> the uAPI.
>

A subset of this series (enabling some s390x firmware-assist facilities) 
is not tied to the iommu and would still provide value while continuing 
to use vfio_iommu_type1 for all mapping -- so I think I'll look into a 
next version that shrinks down to that subset (+ re-visit the setup API).

Separate from that, I will continue looking at implementing the nested 
iommu_domain logic for s390, and continue to hack through VFIO for now. 
I'll use an RFC series when I have something more to look at, likely 
starting with the fully-pinned guest as you suggest; ultimately I'm 
interested in both scenarios (pinned kvm guest & dynamic pinning during 
shadow)

Thanks,
Matt
Tian, Kevin March 18, 2022, 7:01 a.m. UTC | #16
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, March 15, 2022 10:55 PM
> 
> The first level iommu_domain has the 'type1' map and unmap and pins
> the pages. This is the 1:1 map with the GPA and ends up pinning all
> guest memory because the point is you don't want to take a memory pin
> on your performance path
> 
> The second level iommu_domain points to a single IO page table in GPA
> and is created/destroyed whenever the guest traps to the hypervisor to
> manipulate the anchor (ie the GPA of the guest IO page table).
> 

Can we use consistent terms as used in iommufd and hardware, i.e.
with first-level/stage-1 referring to the child (GIOVA->GPA) which is
further nested on second-level/stage-2 as the parent (GPA->HPA)?

Otherwise all other explanations are agreed.

Thanks
Kevin
Jason Gunthorpe March 18, 2022, 1:46 p.m. UTC | #17
On Fri, Mar 18, 2022 at 07:01:19AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, March 15, 2022 10:55 PM
> > 
> > The first level iommu_domain has the 'type1' map and unmap and pins
> > the pages. This is the 1:1 map with the GPA and ends up pinning all
> > guest memory because the point is you don't want to take a memory pin
> > on your performance path
> > 
> > The second level iommu_domain points to a single IO page table in GPA
> > and is created/destroyed whenever the guest traps to the hypervisor to
> > manipulate the anchor (ie the GPA of the guest IO page table).
> > 
> 
> Can we use consistent terms as used in iommufd and hardware, i.e.
> with first-level/stage-1 referring to the child (GIOVA->GPA) which is
> further nested on second-level/stage-2 as the parent (GPA->HPA)?

Honestly I don't like injecting terms that only make sense for
virtualization into iommu/vfio land.

That area is intended to be general. If you use what it exposes for
virtualization, then great.

This is why I prefer to use 'user page table' when talking about the
GIOVA->GPA or Stage 1 map because it is a phrase independent of
virtualization or HW and clearly conveys what it is to the kernel and
its inherent order in the translation scheme.

The S1/S2 naming gets confusing because the HW people chose those names
so that S1 is the first translation a TLP sees and S2 is the second.

But from a software model, the S2 is the first domain created and the
first level of the translation tree, while the S1 is the second domain
created and the second level of the translation tree. ie the natural
SW numbers are backwards.

And I know Matthew isn't working on HW that has the S1/S2 HW naming :)

But yes, should try harder to have good names. Maybe it will be
clearer with code.

Jason
Tian, Kevin March 19, 2022, 7:47 a.m. UTC | #18
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Friday, March 18, 2022 9:46 PM
> 
> On Fri, Mar 18, 2022 at 07:01:19AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Tuesday, March 15, 2022 10:55 PM
> > >
> > > The first level iommu_domain has the 'type1' map and unmap and pins
> > > the pages. This is the 1:1 map with the GPA and ends up pinning all
> > > guest memory because the point is you don't want to take a memory pin
> > > on your performance path
> > >
> > > The second level iommu_domain points to a single IO page table in GPA
> > > and is created/destroyed whenever the guest traps to the hypervisor to
> > > manipulate the anchor (ie the GPA of the guest IO page table).
> > >
> >
> > Can we use consistent terms as used in iommufd and hardware, i.e.
> > with first-level/stage-1 referring to the child (GIOVA->GPA) which is
> > further nested on second-level/stage-2 as the parent (GPA->HPA)?
> 
> Honestly I don't like injecting terms that only make sense for
> virtualization into iommu/vfio land.

1st/2nd-level or stage-1/2 are hardware terms not tied to virtualization.
GIOVA/GPA are just examples in this story.

> 
> That area is intended to be general. If you use what it exposes for
> virtualization, then great.
> 
> This is why I prefer to use 'user page table' when talking about the
> GIOVA->GPA or Stage 1 map because it is a phrase independent of
> virtualization or HW and clearly conveys what it is to the kernel and
> its inherent order in the translation scheme.

I fully agree with this point. The confusion only comes when you
start talking about first/second level in a way incompatible with
what iommu/vfio guys typically understand. 

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 9394aa9444c1..0bec97077d61 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -77,6 +77,7 @@  struct vfio_iommu {
 	bool			nesting;
 	bool			dirty_page_tracking;
 	bool			container_open;
+	bool			kvm;
 	struct list_head	emulated_iommu_groups;
 };
 
@@ -2203,7 +2204,12 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 		goto out_free_group;
 
 	ret = -EIO;
-	domain->domain = iommu_domain_alloc(bus);
+
+	if (iommu->kvm)
+		domain->domain = iommu_domain_alloc_type(bus, IOMMU_DOMAIN_KVM);
+	else
+		domain->domain = iommu_domain_alloc(bus);
+
 	if (!domain->domain)
 		goto out_free_domain;
 
@@ -2552,6 +2558,9 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	case VFIO_TYPE1v2_IOMMU:
 		iommu->v2 = true;
 		break;
+	case VFIO_KVM_IOMMU:
+		iommu->kvm = true;
+		break;
 	default:
 		kfree(iommu);
 		return ERR_PTR(-EINVAL);
@@ -2637,6 +2646,7 @@  static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
 	case VFIO_TYPE1_NESTING_IOMMU:
 	case VFIO_UNMAP_ALL:
 	case VFIO_UPDATE_VADDR:
+	case VFIO_KVM_IOMMU:
 		return 1;
 	case VFIO_DMA_CC_IOMMU:
 		if (!iommu)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ef33ea002b0b..666edb6957ac 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -52,6 +52,12 @@ 
 /* Supports the vaddr flag for DMA map and unmap */
 #define VFIO_UPDATE_VADDR		10
 
+/*
+ * The KVM_IOMMU type implies that the hypervisor will control the mappings
+ * rather than userspace
+ */
+#define VFIO_KVM_IOMMU			11
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between