[RFC,v2,12/20] dma-iommu: Implement NESTED_MSI cookie

Message ID 20180918142457.3325-13-eric.auger@redhat.com (mailing list archive)
State New, archived
Series SMMUv3 Nested Stage Setup

Commit Message

Eric Auger Sept. 18, 2018, 2:24 p.m. UTC
Up to now, when the domain type was UNMANAGED, we used to
allocate IOVA pages within a range provided by the user.
This does not work in nested mode.

If both the host and the guest are exposed to an SMMU, each
allocates its own IOVA. The guest allocates an IOVA (gIOVA)
to map onto the guest MSI doorbell (gDB). The host allocates
another IOVA (hIOVA) to map onto the physical doorbell (hDB).

So we end up with 2 unrelated mappings, at S1 and S2:
         S1             S2
gIOVA    ->     gDB
               hIOVA    ->    hDB

The PCI device would be programmed with hIOVA.

iommu_dma_bind_doorbell allows gIOVA/gDB bindings to be passed
to the host so that gIOVA can be reused by the host instead of
allocating a new IOVA. That way the host can create the following
nested mapping:

         S1           S2
gIOVA    ->    gDB    ->    hDB

This time, the PCI device will be programmed with the gIOVA MSI
doorbell, which is correctly mapped through the 2 stages.
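
As an illustration only, here is a minimal sketch of how the layer that
traps the guest's stage 1 MSI mapping (VFIO/KVM in this series) could
register such a binding. The field names (iova, gpa, granule = log2 of
the page size) follow their use in this patch; the helper itself and
its signature are hypothetical:

	/*
	 * Hypothetical helper: relay a trapped gIOVA -> gDB stage 1
	 * mapping to the host, so that iommu_dma_get_msi_page() can
	 * later nest the stage 2 gDB -> hDB mapping under it.
	 */
	static int report_guest_msi_binding(struct iommu_domain *domain,
					    u64 giova, u64 gdb, u32 granule)
	{
		struct iommu_guest_msi_binding binding = {
			.iova	 = giova,	/* gIOVA, stage 1 input     */
			.gpa	 = gdb,		/* gDB, guest doorbell IPA  */
			.granule = granule,	/* log2 of the mapping size */
		};

		return iommu_dma_bind_doorbell(domain, &binding);
	}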

Signed-off-by: Eric Auger <eric.auger@redhat.com>

---

v1 -> v2:
- unmap stage2 on put()
---
 drivers/iommu/dma-iommu.c | 97 +++++++++++++++++++++++++++++++++++++--
 include/linux/dma-iommu.h | 11 +++++
 2 files changed, 105 insertions(+), 3 deletions(-)

Comments

Robin Murphy Oct. 24, 2018, 6:02 p.m. UTC | #1
Hi Eric,

On 2018-09-18 3:24 pm, Eric Auger wrote:
> Up to now, when the type was UNMANAGED, we used to
> allocate IOVA pages within a range provided by the user.
> This does not work in nested mode.
> 
> If both the host and the guest are exposed with SMMUs, each
> would allocate an IOVA. The guest allocates an IOVA (gIOVA)
> to map onto the guest MSI doorbell (gDB). The Host allocates
> another IOVA (hIOVA) to map onto the physical doorbell (hDB).
> 
> So we end up with 2 unrelated mappings, at S1 and S2:
>           S1             S2
> gIOVA    ->     gDB
>                 hIOVA    ->    hDB
> 
> The PCI device would be programmed with hIOVA.
> 
> iommu_dma_bind_doorbell allows to pass gIOVA/gDB to the host
> so that gIOVA can be used by the host instead of re-allocating
> a new IOVA. That way the host can create the following nested
> mapping:
> 
>           S1           S2
> gIOVA    ->    gDB    ->    hDB
> 
> this time, the PCI device will be programmed with the gIOVA MSI
> doorbell which is correctly map through the 2 stages.

If I'm understanding things correctly, this plus a couple of the 
preceding patches all add up to a rather involved way of coercing an 
automatic allocator to only "allocate" predetermined addresses in an 
entirely known-ahead-of-time manner. Given that the guy calling 
iommu_dma_bind_doorbell() could seemingly just as easily call 
iommu_map() at that point and not bother with an allocator cookie and 
all this machinery at all, what am I missing?

Robin.

> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> 
> ---
> 
> v1 -> v2:
> - unmap stage2 on put()
> ---
>   drivers/iommu/dma-iommu.c | 97 +++++++++++++++++++++++++++++++++++++--
>   include/linux/dma-iommu.h | 11 +++++
>   2 files changed, 105 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 511ff9a1d6d9..53444c3e8f2f 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -37,12 +37,14 @@
>   struct iommu_dma_msi_page {
>   	struct list_head	list;
>   	dma_addr_t		iova;
> +	dma_addr_t		ipa;
>   	phys_addr_t		phys;
>   };
>   
>   enum iommu_dma_cookie_type {
>   	IOMMU_DMA_IOVA_COOKIE,
>   	IOMMU_DMA_MSI_COOKIE,
> +	IOMMU_DMA_NESTED_MSI_COOKIE,
>   };
>   
>   struct iommu_dma_cookie {
> @@ -109,14 +111,17 @@ EXPORT_SYMBOL(iommu_get_dma_cookie);
>    *
>    * Users who manage their own IOVA allocation and do not want DMA API support,
>    * but would still like to take advantage of automatic MSI remapping, can use
> - * this to initialise their own domain appropriately. Users should reserve a
> + * this to initialise their own domain appropriately. Users may reserve a
>    * contiguous IOVA region, starting at @base, large enough to accommodate the
>    * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
> - * used by the devices attached to @domain.
> + * used by the devices attached to @domain. The other way round is to provide
> + * usable iova pages through the iommu_dma_bind_doorbell API (nested stages
> + * use case)
>    */
>   int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
>   {
>   	struct iommu_dma_cookie *cookie;
> +	int nesting, ret;
>   
>   	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
>   		return -EINVAL;
> @@ -124,7 +129,12 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
>   	if (domain->iova_cookie)
>   		return -EEXIST;
>   
> -	cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
> +	ret =  iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nesting);
> +	if (!ret && nesting)
> +		cookie = cookie_alloc(IOMMU_DMA_NESTED_MSI_COOKIE);
> +	else
> +		cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
> +
>   	if (!cookie)
>   		return -ENOMEM;
>   
> @@ -145,6 +155,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
>   {
>   	struct iommu_dma_cookie *cookie = domain->iova_cookie;
>   	struct iommu_dma_msi_page *msi, *tmp;
> +	bool s2_unmap = false;
>   
>   	if (!cookie)
>   		return;
> @@ -152,7 +163,15 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
>   	if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule)
>   		put_iova_domain(&cookie->iovad);
>   
> +	if (cookie->type == IOMMU_DMA_NESTED_MSI_COOKIE)
> +		s2_unmap = true;
> +
>   	list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) {
> +		if (s2_unmap && msi->phys) {
> +			size_t size = cookie_msi_granule(cookie);
> +
> +			WARN_ON(iommu_unmap(domain, msi->ipa, size) != size);
> +		}
>   		list_del(&msi->list);
>   		kfree(msi);
>   	}
> @@ -161,6 +180,50 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
>   }
>   EXPORT_SYMBOL(iommu_put_dma_cookie);
>   
> +/**
> + * iommu_dma_bind_doorbell - Allows to provide a usable IOVA page
> + * @domain: domain handle
> + * @binding: IOVA/IPA binding
> + *
> + * In nested stage use case, the user can provide IOVA/IPA bindings
> + * corresponding to a guest MSI stage 1 mapping. When the host needs
> + * to map its own MSI doorbells, it can use the IPA as stage 2 input
> + * and map it onto the physical MSI doorbell.
> + */
> +int iommu_dma_bind_doorbell(struct iommu_domain *domain,
> +			    struct iommu_guest_msi_binding *binding)
> +{
> +	struct iommu_dma_cookie *cookie = domain->iova_cookie;
> +	struct iommu_dma_msi_page *msi;
> +	dma_addr_t ipa, iova;
> +	size_t size;
> +
> +	if (!cookie)
> +		return -EINVAL;
> +
> +	if (cookie->type != IOMMU_DMA_NESTED_MSI_COOKIE)
> +		return -EINVAL;
> +
> +	size = 1 << binding->granule;
> +	iova = binding->iova & ~(phys_addr_t)(size - 1);
> +	ipa = binding->gpa & ~(phys_addr_t)(size - 1);
> +
> +	list_for_each_entry(msi, &cookie->msi_page_list, list) {
> +		if (msi->iova == iova)
> +			return 0; /* this page is already registered */
> +	}
> +
> +	msi = kzalloc(sizeof(*msi), GFP_KERNEL);
> +	if (!msi)
> +		return -ENOMEM;
> +
> +	msi->iova = iova;
> +	msi->ipa = ipa;
> +	list_add(&msi->list, &cookie->msi_page_list);
> +	return 0;
> +}
> +EXPORT_SYMBOL(iommu_dma_bind_doorbell);
> +
>   /**
>    * iommu_dma_get_resv_regions - Reserved region driver helper
>    * @dev: Device from iommu_get_resv_regions()
> @@ -846,6 +909,34 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
>   		if (msi_page->phys == msi_addr)
>   			return msi_page;
>   
> +	/*
> +	 * In nested stage mode, we do not allocate an MSI page in
> +	 * a range provided by the user. Instead, IOVA/IPA bindings are
> +	 * individually provided. We reuse thise IOVAs to build the
> +	 * IOVA -> IPA -> MSI PA nested stage mapping.
> +	 */
> +	if (cookie->type == IOMMU_DMA_NESTED_MSI_COOKIE) {
> +		list_for_each_entry(msi_page, &cookie->msi_page_list, list)
> +			if (!msi_page->phys) { /* this binding is free to use */
> +				dma_addr_t ipa = msi_page->ipa;
> +				int ret;
> +
> +				msi_page->phys = msi_addr;
> +
> +				/* do the stage 2 mapping */
> +				ret = iommu_map(domain, ipa, msi_addr, size,
> +						IOMMU_MMIO | IOMMU_WRITE);
> +				if (ret) {
> +					pr_warn("MSI S2 mapping failed (%d)\n",
> +						ret);
> +					return NULL;
> +				}
> +				return msi_page;
> +			}
> +		pr_warn("%s no MSI binding found\n", __func__);
> +		return NULL;
> +	}
> +
>   	msi_page = kzalloc(sizeof(*msi_page), GFP_ATOMIC);
>   	if (!msi_page)
>   		return NULL;
> diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
> index e8ca5e654277..324745eef644 100644
> --- a/include/linux/dma-iommu.h
> +++ b/include/linux/dma-iommu.h
> @@ -24,6 +24,7 @@
>   #include <linux/dma-mapping.h>
>   #include <linux/iommu.h>
>   #include <linux/msi.h>
> +#include <uapi/linux/iommu.h>
>   
>   int iommu_dma_init(void);
>   
> @@ -74,12 +75,15 @@ int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
>   /* The DMA API isn't _quite_ the whole story, though... */
>   void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg);
>   void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
> +int iommu_dma_bind_doorbell(struct iommu_domain *domain,
> +			    struct iommu_guest_msi_binding *binding);
>   
>   #else
>   
>   struct iommu_domain;
>   struct msi_msg;
>   struct device;
> +struct iommu_guest_msi_binding;
>   
>   static inline int iommu_dma_init(void)
>   {
> @@ -104,6 +108,13 @@ static inline void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg)
>   {
>   }
>   
> +static inline int
> +iommu_dma_bind_doorbell(struct iommu_domain *domain,
> +			struct iommu_guest_msi_binding *binding)
> +{
> +	return -ENODEV;
> +}
> +
>   static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
>   {
>   }
>
Eric Auger Oct. 24, 2018, 6:44 p.m. UTC | #2
Hi Robin,

On 10/24/18 8:02 PM, Robin Murphy wrote:
> Hi Eric,
> 
> On 2018-09-18 3:24 pm, Eric Auger wrote:
>> Up to now, when the type was UNMANAGED, we used to
>> allocate IOVA pages within a range provided by the user.
>> This does not work in nested mode.
>>
>> If both the host and the guest are exposed with SMMUs, each
>> would allocate an IOVA. The guest allocates an IOVA (gIOVA)
>> to map onto the guest MSI doorbell (gDB). The Host allocates
>> another IOVA (hIOVA) to map onto the physical doorbell (hDB).
>>
>> So we end up with 2 unrelated mappings, at S1 and S2:
>>           S1             S2
>> gIOVA    ->     gDB
>>                 hIOVA    ->    hDB
>>
>> The PCI device would be programmed with hIOVA.
>>
>> iommu_dma_bind_doorbell allows to pass gIOVA/gDB to the host
>> so that gIOVA can be used by the host instead of re-allocating
>> a new IOVA. That way the host can create the following nested
>> mapping:
>>
>>           S1           S2
>> gIOVA    ->    gDB    ->    hDB
>>
>> this time, the PCI device will be programmed with the gIOVA MSI
>> doorbell which is correctly map through the 2 stages.
> 
> If I'm understanding things correctly, this plus a couple of the
> preceding patches all add up to a rather involved way of coercing an
> automatic allocator to only "allocate" predetermined addresses in an
> entirely known-ahead-of-time manner.
agreed
>  Given that the guy calling
> iommu_dma_bind_doorbell() could seemingly just as easily call
> iommu_map() at that point and not bother with an allocator cookie and
> all this machinery at all, what am I missing?
Well, iommu_dma_map_msi_msg() gets called and is part of this existing
MSI mapping machinery. If we do not do anything, this function allocates
an hIOVA that is not involved in any nested setup. So either we coerce
the allocator in place (which is what this series does), or we unplug the
allocator and replace it with a simple S2 mapping, as you suggest,
i.e. iommu_map(gDB, hDB). Assuming we unplug the allocator, the guy who
actually calls iommu_dma_bind_doorbell() knows gDB but does not know hDB.
So I don't really see how we can simplify things.
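
For reference, a simplified sketch of the call path being discussed
(the ITS entry point is the usual caller of iommu_dma_map_msi_msg();
the rest paraphrases this series):

	its_irq_compose_msi_msg()            (GICv3 ITS driver, knows hDB)
	  -> iommu_dma_map_msi_msg(irq, msg)
	       -> iommu_dma_get_msi_page(dev, hDB, domain)
	            nested cookie: pick a free gIOVA/gDB binding registered
	            earlier via iommu_dma_bind_doorbell(), then
	            iommu_map(domain, gDB, hDB, size, IOMMU_MMIO | IOMMU_WRITE)
	       -> msg->address_lo/hi rewritten with the reused gIOVA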

Thanks

Eric

> 
> Robin.
> 
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>
>> ---
>>
>> v1 -> v2:
>> - unmap stage2 on put()
>> ---
>>   drivers/iommu/dma-iommu.c | 97 +++++++++++++++++++++++++++++++++++++--
>>   include/linux/dma-iommu.h | 11 +++++
>>   2 files changed, 105 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
>> index 511ff9a1d6d9..53444c3e8f2f 100644
>> --- a/drivers/iommu/dma-iommu.c
>> +++ b/drivers/iommu/dma-iommu.c
>> @@ -37,12 +37,14 @@
>>   struct iommu_dma_msi_page {
>>       struct list_head    list;
>>       dma_addr_t        iova;
>> +    dma_addr_t        ipa;
>>       phys_addr_t        phys;
>>   };
>>     enum iommu_dma_cookie_type {
>>       IOMMU_DMA_IOVA_COOKIE,
>>       IOMMU_DMA_MSI_COOKIE,
>> +    IOMMU_DMA_NESTED_MSI_COOKIE,
>>   };
>>     struct iommu_dma_cookie {
>> @@ -109,14 +111,17 @@ EXPORT_SYMBOL(iommu_get_dma_cookie);
>>    *
>>    * Users who manage their own IOVA allocation and do not want DMA
>> API support,
>>    * but would still like to take advantage of automatic MSI
>> remapping, can use
>> - * this to initialise their own domain appropriately. Users should
>> reserve a
>> + * this to initialise their own domain appropriately. Users may
>> reserve a
>>    * contiguous IOVA region, starting at @base, large enough to
>> accommodate the
>>    * number of PAGE_SIZE mappings necessary to cover every MSI
>> doorbell address
>> - * used by the devices attached to @domain.
>> + * used by the devices attached to @domain. The other way round is to
>> provide
>> + * usable iova pages through the iommu_dma_bind_doorbell API (nested
>> stages
>> + * use case)
>>    */
>>   int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
>>   {
>>       struct iommu_dma_cookie *cookie;
>> +    int nesting, ret;
>>         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
>>           return -EINVAL;
>> @@ -124,7 +129,12 @@ int iommu_get_msi_cookie(struct iommu_domain
>> *domain, dma_addr_t base)
>>       if (domain->iova_cookie)
>>           return -EEXIST;
>>   -    cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
>> +    ret =  iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nesting);
>> +    if (!ret && nesting)
>> +        cookie = cookie_alloc(IOMMU_DMA_NESTED_MSI_COOKIE);
>> +    else
>> +        cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
>> +
>>       if (!cookie)
>>           return -ENOMEM;
>>   @@ -145,6 +155,7 @@ void iommu_put_dma_cookie(struct iommu_domain
>> *domain)
>>   {
>>       struct iommu_dma_cookie *cookie = domain->iova_cookie;
>>       struct iommu_dma_msi_page *msi, *tmp;
>> +    bool s2_unmap = false;
>>         if (!cookie)
>>           return;
>> @@ -152,7 +163,15 @@ void iommu_put_dma_cookie(struct iommu_domain
>> *domain)
>>       if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule)
>>           put_iova_domain(&cookie->iovad);
>>   +    if (cookie->type == IOMMU_DMA_NESTED_MSI_COOKIE)
>> +        s2_unmap = true;
>> +
>>       list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) {
>> +        if (s2_unmap && msi->phys) {
>> +            size_t size = cookie_msi_granule(cookie);
>> +
>> +            WARN_ON(iommu_unmap(domain, msi->ipa, size) != size);
>> +        }
>>           list_del(&msi->list);
>>           kfree(msi);
>>       }
>> @@ -161,6 +180,50 @@ void iommu_put_dma_cookie(struct iommu_domain
>> *domain)
>>   }
>>   EXPORT_SYMBOL(iommu_put_dma_cookie);
>>   +/**
>> + * iommu_dma_bind_doorbell - Allows to provide a usable IOVA page
>> + * @domain: domain handle
>> + * @binding: IOVA/IPA binding
>> + *
>> + * In nested stage use case, the user can provide IOVA/IPA bindings
>> + * corresponding to a guest MSI stage 1 mapping. When the host needs
>> + * to map its own MSI doorbells, it can use the IPA as stage 2 input
>> + * and map it onto the physical MSI doorbell.
>> + */
>> +int iommu_dma_bind_doorbell(struct iommu_domain *domain,
>> +                struct iommu_guest_msi_binding *binding)
>> +{
>> +    struct iommu_dma_cookie *cookie = domain->iova_cookie;
>> +    struct iommu_dma_msi_page *msi;
>> +    dma_addr_t ipa, iova;
>> +    size_t size;
>> +
>> +    if (!cookie)
>> +        return -EINVAL;
>> +
>> +    if (cookie->type != IOMMU_DMA_NESTED_MSI_COOKIE)
>> +        return -EINVAL;
>> +
>> +    size = 1 << binding->granule;
>> +    iova = binding->iova & ~(phys_addr_t)(size - 1);
>> +    ipa = binding->gpa & ~(phys_addr_t)(size - 1);
>> +
>> +    list_for_each_entry(msi, &cookie->msi_page_list, list) {
>> +        if (msi->iova == iova)
>> +            return 0; /* this page is already registered */
>> +    }
>> +
>> +    msi = kzalloc(sizeof(*msi), GFP_KERNEL);
>> +    if (!msi)
>> +        return -ENOMEM;
>> +
>> +    msi->iova = iova;
>> +    msi->ipa = ipa;
>> +    list_add(&msi->list, &cookie->msi_page_list);
>> +    return 0;
>> +}
>> +EXPORT_SYMBOL(iommu_dma_bind_doorbell);
>> +
>>   /**
>>    * iommu_dma_get_resv_regions - Reserved region driver helper
>>    * @dev: Device from iommu_get_resv_regions()
>> @@ -846,6 +909,34 @@ static struct iommu_dma_msi_page
>> *iommu_dma_get_msi_page(struct device *dev,
>>           if (msi_page->phys == msi_addr)
>>               return msi_page;
>>   +    /*
>> +     * In nested stage mode, we do not allocate an MSI page in
>> +     * a range provided by the user. Instead, IOVA/IPA bindings are
>> +     * individually provided. We reuse thise IOVAs to build the
>> +     * IOVA -> IPA -> MSI PA nested stage mapping.
>> +     */
>> +    if (cookie->type == IOMMU_DMA_NESTED_MSI_COOKIE) {
>> +        list_for_each_entry(msi_page, &cookie->msi_page_list, list)
>> +            if (!msi_page->phys) { /* this binding is free to use */
>> +                dma_addr_t ipa = msi_page->ipa;
>> +                int ret;
>> +
>> +                msi_page->phys = msi_addr;
>> +
>> +                /* do the stage 2 mapping */
>> +                ret = iommu_map(domain, ipa, msi_addr, size,
>> +                        IOMMU_MMIO | IOMMU_WRITE);
>> +                if (ret) {
>> +                    pr_warn("MSI S2 mapping failed (%d)\n",
>> +                        ret);
>> +                    return NULL;
>> +                }
>> +                return msi_page;
>> +            }
>> +        pr_warn("%s no MSI binding found\n", __func__);
>> +        return NULL;
>> +    }
>> +
>>       msi_page = kzalloc(sizeof(*msi_page), GFP_ATOMIC);
>>       if (!msi_page)
>>           return NULL;
>> diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
>> index e8ca5e654277..324745eef644 100644
>> --- a/include/linux/dma-iommu.h
>> +++ b/include/linux/dma-iommu.h
>> @@ -24,6 +24,7 @@
>>   #include <linux/dma-mapping.h>
>>   #include <linux/iommu.h>
>>   #include <linux/msi.h>
>> +#include <uapi/linux/iommu.h>
>>     int iommu_dma_init(void);
>>   @@ -74,12 +75,15 @@ int iommu_dma_mapping_error(struct device *dev,
>> dma_addr_t dma_addr);
>>   /* The DMA API isn't _quite_ the whole story, though... */
>>   void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg);
>>   void iommu_dma_get_resv_regions(struct device *dev, struct list_head
>> *list);
>> +int iommu_dma_bind_doorbell(struct iommu_domain *domain,
>> +                struct iommu_guest_msi_binding *binding);
>>     #else
>>     struct iommu_domain;
>>   struct msi_msg;
>>   struct device;
>> +struct iommu_guest_msi_binding;
>>     static inline int iommu_dma_init(void)
>>   {
>> @@ -104,6 +108,13 @@ static inline void iommu_dma_map_msi_msg(int irq,
>> struct msi_msg *msg)
>>   {
>>   }
>>   +static inline int
>> +iommu_dma_bind_doorbell(struct iommu_domain *domain,
>> +            struct iommu_guest_msi_binding *binding)
>> +{
>> +    return -ENODEV;
>> +}
>> +
>>   static inline void iommu_dma_get_resv_regions(struct device *dev,
>> struct list_head *list)
>>   {
>>   }
>>
Robin Murphy Oct. 24, 2018, 10:05 p.m. UTC | #3
On 2018-10-24 7:44 pm, Auger Eric wrote:
> Hi Robin,
> 
> On 10/24/18 8:02 PM, Robin Murphy wrote:
>> Hi Eric,
>>
>> On 2018-09-18 3:24 pm, Eric Auger wrote:
>>> Up to now, when the type was UNMANAGED, we used to
>>> allocate IOVA pages within a range provided by the user.
>>> This does not work in nested mode.
>>>
>>> If both the host and the guest are exposed with SMMUs, each
>>> would allocate an IOVA. The guest allocates an IOVA (gIOVA)
>>> to map onto the guest MSI doorbell (gDB). The Host allocates
>>> another IOVA (hIOVA) to map onto the physical doorbell (hDB).
>>>
>>> So we end up with 2 unrelated mappings, at S1 and S2:
>>>            S1             S2
>>> gIOVA    ->     gDB
>>>                  hIOVA    ->    hDB
>>>
>>> The PCI device would be programmed with hIOVA.
>>>
>>> iommu_dma_bind_doorbell allows to pass gIOVA/gDB to the host
>>> so that gIOVA can be used by the host instead of re-allocating
>>> a new IOVA. That way the host can create the following nested
>>> mapping:
>>>
>>>            S1           S2
>>> gIOVA    ->    gDB    ->    hDB
>>>
>>> this time, the PCI device will be programmed with the gIOVA MSI
>>> doorbell which is correctly map through the 2 stages.
>>
>> If I'm understanding things correctly, this plus a couple of the
>> preceding patches all add up to a rather involved way of coercing an
>> automatic allocator to only "allocate" predetermined addresses in an
>> entirely known-ahead-of-time manner.
> agreed
>   Given that the guy calling
>> iommu_dma_bind_doorbell() could seemingly just as easily call
>> iommu_map() at that point and not bother with an allocator cookie and
>> all this machinery at all, what am I missing?
> Well iommu_dma_map_msi_msg() gets called and is part of this existing
> MSI mapping machinery. If we do not do anything this function allocates
> an hIOVA that is not involved in any nested setup. So either we coerce
> the allocator in place (which is what this series does) or we unplug the
> allocator to replace this latter with a simple S2 mapping, as you
> suggest, ie. iommu_map(gDB, hDB). Assuming we unplug the allocator, the
> guy who actually calls  iommu_dma_bind_doorbell() knows gDB but does not
> know hDB. So I don't really get how we can simplify things.

OK, there's what I was missing :D

But that then seems to reveal a somewhat bigger problem - if the callers 
are simply registering IPAs, and relying on the ITS driver to grab an 
entry and fill in a PA later, then how does either one know *which* PA 
is supposed to belong to a given IPA in the case where you have multiple 
devices with different ITS targets assigned to the same guest? (and if 
it's possible to assume a guest will use per-device stage 1 mappings and 
present it with a single vITS backed by multiple pITSes, I think things 
start breaking even harder.)

Other than allowing arbitrary disjoint IOVA pages, I'm not sure this 
really works any differently from the existing MSI cookie now that I 
look more closely :/

Robin.
Eric Auger Oct. 27, 2018, 9:24 a.m. UTC | #4
Hi Robin,

On 10/25/18 12:05 AM, Robin Murphy wrote:
> On 2018-10-24 7:44 pm, Auger Eric wrote:
>> Hi Robin,
>>
>> On 10/24/18 8:02 PM, Robin Murphy wrote:
>>> Hi Eric,
>>>
>>> On 2018-09-18 3:24 pm, Eric Auger wrote:
>>>> Up to now, when the type was UNMANAGED, we used to
>>>> allocate IOVA pages within a range provided by the user.
>>>> This does not work in nested mode.
>>>>
>>>> If both the host and the guest are exposed with SMMUs, each
>>>> would allocate an IOVA. The guest allocates an IOVA (gIOVA)
>>>> to map onto the guest MSI doorbell (gDB). The Host allocates
>>>> another IOVA (hIOVA) to map onto the physical doorbell (hDB).
>>>>
>>>> So we end up with 2 unrelated mappings, at S1 and S2:
>>>>            S1             S2
>>>> gIOVA    ->     gDB
>>>>                  hIOVA    ->    hDB
>>>>
>>>> The PCI device would be programmed with hIOVA.
>>>>
>>>> iommu_dma_bind_doorbell allows to pass gIOVA/gDB to the host
>>>> so that gIOVA can be used by the host instead of re-allocating
>>>> a new IOVA. That way the host can create the following nested
>>>> mapping:
>>>>
>>>>            S1           S2
>>>> gIOVA    ->    gDB    ->    hDB
>>>>
>>>> this time, the PCI device will be programmed with the gIOVA MSI
>>>> doorbell which is correctly map through the 2 stages.
>>>
>>> If I'm understanding things correctly, this plus a couple of the
>>> preceding patches all add up to a rather involved way of coercing an
>>> automatic allocator to only "allocate" predetermined addresses in an
>>> entirely known-ahead-of-time manner.
>> agreed
>>   Given that the guy calling
>>> iommu_dma_bind_doorbell() could seemingly just as easily call
>>> iommu_map() at that point and not bother with an allocator cookie and
>>> all this machinery at all, what am I missing?
>> Well iommu_dma_map_msi_msg() gets called and is part of this existing
>> MSI mapping machinery. If we do not do anything this function allocates
>> an hIOVA that is not involved in any nested setup. So either we coerce
>> the allocator in place (which is what this series does) or we unplug the
>> allocator to replace this latter with a simple S2 mapping, as you
>> suggest, ie. iommu_map(gDB, hDB). Assuming we unplug the allocator, the
>> guy who actually calls  iommu_dma_bind_doorbell() knows gDB but does not
>> know hDB. So I don't really get how we can simplify things.
> 
> OK, there's what I was missing :D
> 
> But that then seems to reveal a somewhat bigger problem - if the callers
> are simply registering IPAs, and relying on the ITS driver to grab an
> entry and fill in a PA later, then how does either one know *which* PA
> is supposed to belong to a given IPA in the case where you have multiple
> devices with different ITS targets assigned to the same guest?

You're definitely right here. I think this can be resolved by passing
the struct device handle along with the stage 1 mapping and storing the
info together. Then, when the host MSI controller looks for a free
unmapped IOVA, it must also check whether the device belongs to its MSI
domain.

> (and if
> it's possible to assume a guest will use per-device stage 1 mappings and
> present it with a single vITS backed by multiple pITSes, I think things
> start breaking even harder.)

I don't really get your point here. Assigned devices on the guest side
should be in separate IOMMU domains because we want them to be isolated
from each other. There is a single vITS as of now and I don't think we
will change that anytime soon. The vITS driver allocates a gIOVA for
each separate domain, and I currently "trap" the gIOVA/gPA mapping on
irqfd routing setup. This mapping gets associated with a VFIO IOMMU, one
per assigned device, so we have different VFIO containers for each of
them. If I then enumerate all the devices attached to the containers and
pass this stage 1 binding along with the device struct, I think we should
be OK?
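
Purely to illustrate that idea (hypothetical names, not part of this
series): the binding could record the device it was created for, and the
nested-cookie lookup in iommu_dma_get_msi_page() would only reuse a
binding whose device shares the caller's MSI parent domain, i.e. targets
the same physical doorbell:

	/* Hypothetical extension - not in this series. */
	struct iommu_dma_msi_page {
		struct list_head	list;
		dma_addr_t		iova;
		dma_addr_t		ipa;
		phys_addr_t		phys;
		struct device		*dev;	/* owner of this S1 binding */
	};

	/* Assumption: devices sharing an MSI parent (ITS) domain are
	 * served by the same physical doorbell, so their registered
	 * bindings are interchangeable.
	 */
	static bool msi_binding_matches(struct device *dev,
					struct iommu_dma_msi_page *msi_page)
	{
		return dev_get_msi_domain(dev)->parent ==
		       dev_get_msi_domain(msi_page->dev)->parent;
	}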

Thanks

Eric

> 
> Other than allowing arbitrary disjoint IOVA pages, I'm not sure this
> really works any differently from the existing MSI cookie now that I
> look more closely :/
> 
> Robin.

Patch

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 511ff9a1d6d9..53444c3e8f2f 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -37,12 +37,14 @@ 
 struct iommu_dma_msi_page {
 	struct list_head	list;
 	dma_addr_t		iova;
+	dma_addr_t		ipa;
 	phys_addr_t		phys;
 };
 
 enum iommu_dma_cookie_type {
 	IOMMU_DMA_IOVA_COOKIE,
 	IOMMU_DMA_MSI_COOKIE,
+	IOMMU_DMA_NESTED_MSI_COOKIE,
 };
 
 struct iommu_dma_cookie {
@@ -109,14 +111,17 @@  EXPORT_SYMBOL(iommu_get_dma_cookie);
  *
  * Users who manage their own IOVA allocation and do not want DMA API support,
  * but would still like to take advantage of automatic MSI remapping, can use
- * this to initialise their own domain appropriately. Users should reserve a
+ * this to initialise their own domain appropriately. Users may reserve a
  * contiguous IOVA region, starting at @base, large enough to accommodate the
  * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
- * used by the devices attached to @domain.
+ * used by the devices attached to @domain. The other way round is to provide
+ * usable IOVA pages through the iommu_dma_bind_doorbell API (nested stage
+ * use case).
  */
 int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
 {
 	struct iommu_dma_cookie *cookie;
+	int nesting, ret;
 
 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
 		return -EINVAL;
@@ -124,7 +129,12 @@  int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
 	if (domain->iova_cookie)
 		return -EEXIST;
 
-	cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+	ret =  iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nesting);
+	if (!ret && nesting)
+		cookie = cookie_alloc(IOMMU_DMA_NESTED_MSI_COOKIE);
+	else
+		cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+
 	if (!cookie)
 		return -ENOMEM;
 
@@ -145,6 +155,7 @@  void iommu_put_dma_cookie(struct iommu_domain *domain)
 {
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
 	struct iommu_dma_msi_page *msi, *tmp;
+	bool s2_unmap = false;
 
 	if (!cookie)
 		return;
@@ -152,7 +163,15 @@  void iommu_put_dma_cookie(struct iommu_domain *domain)
 	if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule)
 		put_iova_domain(&cookie->iovad);
 
+	if (cookie->type == IOMMU_DMA_NESTED_MSI_COOKIE)
+		s2_unmap = true;
+
 	list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) {
+		if (s2_unmap && msi->phys) {
+			size_t size = cookie_msi_granule(cookie);
+
+			WARN_ON(iommu_unmap(domain, msi->ipa, size) != size);
+		}
 		list_del(&msi->list);
 		kfree(msi);
 	}
@@ -161,6 +180,50 @@  void iommu_put_dma_cookie(struct iommu_domain *domain)
 }
 EXPORT_SYMBOL(iommu_put_dma_cookie);
 
+/**
+ * iommu_dma_bind_doorbell - Allow the caller to provide a usable IOVA page
+ * @domain: domain handle
+ * @binding: IOVA/IPA binding
+ *
+ * In nested stage use case, the user can provide IOVA/IPA bindings
+ * corresponding to a guest MSI stage 1 mapping. When the host needs
+ * to map its own MSI doorbells, it can use the IPA as stage 2 input
+ * and map it onto the physical MSI doorbell.
+ */
+int iommu_dma_bind_doorbell(struct iommu_domain *domain,
+			    struct iommu_guest_msi_binding *binding)
+{
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iommu_dma_msi_page *msi;
+	dma_addr_t ipa, iova;
+	size_t size;
+
+	if (!cookie)
+		return -EINVAL;
+
+	if (cookie->type != IOMMU_DMA_NESTED_MSI_COOKIE)
+		return -EINVAL;
+
+	size = 1 << binding->granule;
+	iova = binding->iova & ~(phys_addr_t)(size - 1);
+	ipa = binding->gpa & ~(phys_addr_t)(size - 1);
+
+	list_for_each_entry(msi, &cookie->msi_page_list, list) {
+		if (msi->iova == iova)
+			return 0; /* this page is already registered */
+	}
+
+	msi = kzalloc(sizeof(*msi), GFP_KERNEL);
+	if (!msi)
+		return -ENOMEM;
+
+	msi->iova = iova;
+	msi->ipa = ipa;
+	list_add(&msi->list, &cookie->msi_page_list);
+	return 0;
+}
+EXPORT_SYMBOL(iommu_dma_bind_doorbell);
+
 /**
  * iommu_dma_get_resv_regions - Reserved region driver helper
  * @dev: Device from iommu_get_resv_regions()
@@ -846,6 +909,34 @@  static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 		if (msi_page->phys == msi_addr)
 			return msi_page;
 
+	/*
+	 * In nested stage mode, we do not allocate an MSI page in
+	 * a range provided by the user. Instead, IOVA/IPA bindings are
+	 * individually provided. We reuse these IOVAs to build the
+	 * IOVA -> IPA -> MSI PA nested stage mapping.
+	 */
+	if (cookie->type == IOMMU_DMA_NESTED_MSI_COOKIE) {
+		list_for_each_entry(msi_page, &cookie->msi_page_list, list)
+			if (!msi_page->phys) { /* this binding is free to use */
+				dma_addr_t ipa = msi_page->ipa;
+				int ret;
+
+				msi_page->phys = msi_addr;
+
+				/* do the stage 2 mapping */
+				ret = iommu_map(domain, ipa, msi_addr, size,
+						IOMMU_MMIO | IOMMU_WRITE);
+				if (ret) {
+					pr_warn("MSI S2 mapping failed (%d)\n",
+						ret);
+					return NULL;
+				}
+				return msi_page;
+			}
+		pr_warn("%s no MSI binding found\n", __func__);
+		return NULL;
+	}
+
 	msi_page = kzalloc(sizeof(*msi_page), GFP_ATOMIC);
 	if (!msi_page)
 		return NULL;
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index e8ca5e654277..324745eef644 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -24,6 +24,7 @@ 
 #include <linux/dma-mapping.h>
 #include <linux/iommu.h>
 #include <linux/msi.h>
+#include <uapi/linux/iommu.h>
 
 int iommu_dma_init(void);
 
@@ -74,12 +75,15 @@  int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
 /* The DMA API isn't _quite_ the whole story, though... */
 void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg);
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
+int iommu_dma_bind_doorbell(struct iommu_domain *domain,
+			    struct iommu_guest_msi_binding *binding);
 
 #else
 
 struct iommu_domain;
 struct msi_msg;
 struct device;
+struct iommu_guest_msi_binding;
 
 static inline int iommu_dma_init(void)
 {
@@ -104,6 +108,13 @@  static inline void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg)
 {
 }
 
+static inline int
+iommu_dma_bind_doorbell(struct iommu_domain *domain,
+			struct iommu_guest_msi_binding *binding)
+{
+	return -ENODEV;
+}
+
 static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 {
 }