Message ID | 4ca696150d2baee03af27c4ddefdb7b0b0280e7b.1740014950.git.nicolinc@nvidia.com
---|---
State | New
Series | iommu: Add MSI mapping support with nested SMMU (Part-1 core)
On 2025-02-20 1:31 am, Nicolin Chen wrote:
> From: Jason Gunthorpe <jgg@nvidia.com>
>
> SW_MSI supports IOMMU to translate an MSI message before the MSI message
> is delivered to the interrupt controller. On such systems, an iommu_domain
> must have a translation for the MSI message for interrupts to work.
>
> The IRQ subsystem will call into IOMMU to request that a physical page be
> set up to receive MSI messages, and the IOMMU then sets an IOVA that maps
> to that physical page. Ultimately the IOVA is programmed into the device
> via the msi_msg.
>
> Generalize this by allowing iommu_domain owners to provide implementations
> of this mapping. Add a function pointer in struct iommu_domain to allow a
> domain owner to provide its own implementation.
>
> Have dma-iommu supply its implementation for IOMMU_DOMAIN_DMA types during
> the iommu_get_dma_cookie() path. For IOMMU_DOMAIN_UNMANAGED types used by
> VFIO (and iommufd for now), have the same iommu_dma_sw_msi set as well in
> the iommu_get_msi_cookie() path.
>
> Hold the group mutex while in iommu_dma_prepare_msi() to ensure the domain
> doesn't change or become freed while running. Races with IRQ operations
> from VFIO and domain changes from iommufd are possible here.
>
> Replace the msi_prepare_lock with a lockdep assertion for the group mutex
> as documentation. For dma-iommu.c each iommu_domain is unique to a
> group.
>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  include/linux/iommu.h     | 44 ++++++++++++++++++++++++++-------------
>  drivers/iommu/dma-iommu.c | 33 +++++++++++++----------------
>  drivers/iommu/iommu.c     | 29 ++++++++++++++++++++++++++
>  3 files changed, 73 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index caee952febd4..761c5e186de9 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -44,6 +44,8 @@ struct iommu_dma_cookie;
>  struct iommu_fault_param;
>  struct iommufd_ctx;
>  struct iommufd_viommu;
> +struct msi_desc;
> +struct msi_msg;
>
>  #define IOMMU_FAULT_PERM_READ	(1 << 0) /* read */
>  #define IOMMU_FAULT_PERM_WRITE	(1 << 1) /* write */
> @@ -216,6 +218,12 @@ struct iommu_domain {
>  	struct iommu_domain_geometry geometry;
>  	struct iommu_dma_cookie *iova_cookie;
>  	int (*iopf_handler)(struct iopf_group *group);
> +
> +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
> +	int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc,
> +		      phys_addr_t msi_addr);
> +#endif
> +
>  	void *fault_data;
>  	union {
>  		struct {
> @@ -234,6 +242,16 @@ struct iommu_domain {
>  	};
>  };
>
> +static inline void iommu_domain_set_sw_msi(
> +	struct iommu_domain *domain,
> +	int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc,
> +		      phys_addr_t msi_addr))
> +{
> +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
> +	domain->sw_msi = sw_msi;
> +#endif
> +}

Yuck. Realistically we are going to have no more than two different
implementations of this; a fiddly callback interface seems overkill. All we
should need in the domain is a simple indicator of *which* MSI translation
scheme is in use (if it can't be squeezed into the domain type itself), then
iommu_dma_prepare_msi() can simply dispatch between iommu-dma and IOMMUFD
based on that, and then it's easy to solve all the other fragility issues
too.

Thanks,
Robin.

> +
>  static inline bool iommu_is_dma_domain(struct iommu_domain *domain)
>  {
>  	return domain->type & __IOMMU_DOMAIN_DMA_API;
> @@ -1470,6 +1488,18 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev)
>  static inline void iommu_free_global_pasid(ioasid_t pasid) {}
>  #endif /* CONFIG_IOMMU_API */
>
> +#ifdef CONFIG_IRQ_MSI_IOMMU
> +#ifdef CONFIG_IOMMU_API
> +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr);
> +#else
> +static inline int iommu_dma_prepare_msi(struct msi_desc *desc,
> +					 phys_addr_t msi_addr)
> +{
> +	return 0;
> +}
> +#endif /* CONFIG_IOMMU_API */
> +#endif /* CONFIG_IRQ_MSI_IOMMU */
> +
>  #if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API)
>  void iommu_group_mutex_assert(struct device *dev);
>  #else
> @@ -1503,26 +1533,12 @@ static inline void iommu_debugfs_setup(void) {}
>  #endif
>
>  #ifdef CONFIG_IOMMU_DMA
> -#include <linux/msi.h>
> -
>  int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
> -
> -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr);
> -
>  #else /* CONFIG_IOMMU_DMA */
> -
> -struct msi_desc;
> -struct msi_msg;
> -
>  static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
>  {
>  	return -ENODEV;
>  }
> -
> -static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
> -{
> -	return 0;
> -}
>  #endif /* CONFIG_IOMMU_DMA */
>
>  /*
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index bf91e014d179..3b58244e6344 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -24,6 +24,7 @@
>  #include <linux/memremap.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
> +#include <linux/msi.h>
>  #include <linux/of_iommu.h>
>  #include <linux/pci.h>
>  #include <linux/scatterlist.h>
> @@ -102,6 +103,9 @@ static int __init iommu_dma_forcedac_setup(char *str)
>  }
>  early_param("iommu.forcedac", iommu_dma_forcedac_setup);
>
> +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
> +			    phys_addr_t msi_addr);
> +
>  /* Number of entries per flush queue */
>  #define IOVA_DEFAULT_FQ_SIZE	256
>  #define IOVA_SINGLE_FQ_SIZE	32768
> @@ -398,6 +402,7 @@ int iommu_get_dma_cookie(struct iommu_domain *domain)
>  		return -ENOMEM;
>
>  	mutex_init(&domain->iova_cookie->mutex);
> +	iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi);
>  	return 0;
>  }
>
> @@ -429,6 +434,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
>
>  	cookie->msi_iova = base;
>  	domain->iova_cookie = cookie;
> +	iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi);
>  	return 0;
>  }
>  EXPORT_SYMBOL(iommu_get_msi_cookie);
> @@ -443,6 +449,9 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
>  	struct iommu_dma_cookie *cookie = domain->iova_cookie;
>  	struct iommu_dma_msi_page *msi, *tmp;
>
> +	if (domain->sw_msi != iommu_dma_sw_msi)
> +		return;
> +
>  	if (!cookie)
>  		return;
>
> @@ -1800,33 +1809,19 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
>  	return NULL;
>  }
>
> -/**
> - * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
> - * @desc: MSI descriptor, will store the MSI page
> - * @msi_addr: MSI target address to be mapped
> - *
> - * Return: 0 on success or negative error code if the mapping failed.
> - */
> -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
> +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
> +			    phys_addr_t msi_addr)
>  {
>  	struct device *dev = msi_desc_to_dev(desc);
> -	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> -	struct iommu_dma_msi_page *msi_page;
> -	static DEFINE_MUTEX(msi_prepare_lock); /* see below */
> +	const struct iommu_dma_msi_page *msi_page;
>
> -	if (!domain || !domain->iova_cookie) {
> +	if (!domain->iova_cookie) {
>  		msi_desc_set_iommu_msi_iova(desc, 0, 0);
>  		return 0;
>  	}
>
> -	/*
> -	 * In fact the whole prepare operation should already be serialised by
> -	 * irq_domain_mutex further up the callchain, but that's pretty subtle
> -	 * on its own, so consider this locking as failsafe documentation...
> -	 */
> -	mutex_lock(&msi_prepare_lock);
> +	iommu_group_mutex_assert(dev);
>  	msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
> -	mutex_unlock(&msi_prepare_lock);
>  	if (!msi_page)
>  		return -ENOMEM;
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 870c3cdbd0f6..022bf96a18c5 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -3596,3 +3596,32 @@ int iommu_replace_group_handle(struct iommu_group *group,
>  	return ret;
>  }
>  EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
> +
> +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
> +/**
> + * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
> + * @desc: MSI descriptor, will store the MSI page
> + * @msi_addr: MSI target address to be mapped
> + *
> + * The implementation of sw_msi() should take msi_addr and map it to
> + * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the
> + * mapping information.
> + *
> + * Return: 0 on success or negative error code if the mapping failed.
> + */
> +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
> +{
> +	struct device *dev = msi_desc_to_dev(desc);
> +	struct iommu_group *group = dev->iommu_group;
> +	int ret = 0;
> +
> +	if (!group)
> +		return 0;
> +
> +	mutex_lock(&group->mutex);
> +	if (group->domain && group->domain->sw_msi)
> +		ret = group->domain->sw_msi(group->domain, desc, msi_addr);
> +	mutex_unlock(&group->mutex);
> +	return ret;
> +}
> +#endif /* CONFIG_IRQ_MSI_IOMMU */
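For illustration, here is a minimal sketch of the dispatch Robin is suggesting. The sw_msi_scheme field, the enum values and the iommufd_sw_msi() handler are hypothetical names invented for this sketch and are not part of the patch above; only iommu_dma_prepare_msi() and the group locking come from the patch.

/*
 * Sketch only: dispatch on a per-domain indicator instead of a callback.
 * "enum iommu_sw_msi_scheme", the sw_msi_scheme field and iommufd_sw_msi()
 * are hypothetical.
 */
enum iommu_sw_msi_scheme {
	IOMMU_SW_MSI_NONE,
	IOMMU_SW_MSI_DMA_IOMMU,		/* dma-iommu.c cookie-based mapping */
	IOMMU_SW_MSI_IOMMUFD,		/* iommufd-owned domains */
};

int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
{
	struct device *dev = msi_desc_to_dev(desc);
	struct iommu_group *group = dev->iommu_group;
	int ret = 0;

	if (!group)
		return 0;

	mutex_lock(&group->mutex);
	if (group->domain) {
		switch (group->domain->sw_msi_scheme) {
		case IOMMU_SW_MSI_DMA_IOMMU:
			ret = iommu_dma_sw_msi(group->domain, desc, msi_addr);
			break;
		case IOMMU_SW_MSI_IOMMUFD:
			ret = iommufd_sw_msi(group->domain, desc, msi_addr);
			break;
		default:
			break;
		}
	}
	mutex_unlock(&group->mutex);
	return ret;
}

Note that the IOMMUFD case is exactly where the module-dependency concern raised below comes in: iommufd_sw_msi() would have to be reachable from built-in core code even when iommufd is built as a module.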
On Fri, Feb 21, 2025 at 03:39:45PM +0000, Robin Murphy wrote:
> Yuck. Realistically we are going to have no more than two different
> implementations of this; a fiddly callback interface seems overkill. All we
> should need in the domain is a simple indicator of *which* MSI translation
> scheme is in use (if it can't be squeezed into the domain type itself), then
> iommu_dma_prepare_msi() can simply dispatch between iommu-dma and IOMMUFD
> based on that, and then it's easy to solve all the other fragility issues
> too.

That would make module dependency problems, we have so far avoided
having the core kernel hard depend on iommufd.

Jason
On 2025-02-21 4:44 pm, Jason Gunthorpe wrote:
> On Fri, Feb 21, 2025 at 03:39:45PM +0000, Robin Murphy wrote:
>> Yuck. Realistically we are going to have no more than two different
>> implementations of this; a fiddly callback interface seems overkill. All we
>> should need in the domain is a simple indicator of *which* MSI translation
>> scheme is in use (if it can't be squeezed into the domain type itself), then
>> iommu_dma_prepare_msi() can simply dispatch between iommu-dma and IOMMUFD
>> based on that, and then it's easy to solve all the other fragility issues
>> too.
>
> That would make module dependency problems, we have so far avoided
> having the core kernel hard depend on iommufd.

It wouldn't need a hard dependency, it's easy to have a trivial built-in
stub function which becomes valid once the module loads - you literally have
the iommufd_driver infrastructure for precisely that sort of thing already.
All I'm saying is to hide the callback detail in the IOMMUFD code because
being IOMMUFD modular is unique to IOMMUFD and not the rest of the core
code's problem.

And frankly otherwise, what even is the benefit of moving the iova_cookie
pointer into the union if we have to replace it with another whole pointer
to make it work? This is just adding more code and more complexity in
order to make struct iommu_domain... the same size it already is :/

Thanks,
Robin.
On Thu, Feb 27, 2025 at 11:21:28AM +0000, Robin Murphy wrote:
> It wouldn't need a hard dependency, it's easy to have a trivial built-in
> stub function which becomes valid once the module loads - you literally have
> the iommufd_driver infrastructure for precisely that sort of thing already.

Yes, but I also kinda dislike using it because it bloats the built-in
kernel for a narrow use case..

> All I'm saying is to hide the callback detail in the IOMMUFD code because
> being IOMMUFD modular is unique to IOMMUFD and not the rest of the core
> code's problem.

Maybe we could use a global function pointer set/cleared on iommufd
module load?

Regardless, we need to first find a way for the core code to tell if
the domain is iommufd owned or not.

We should also make it so we can tell if dma-iommu.c is linked to that
domain (eg vfio or the default_domain), then we can do the iova_cookie
move without changing the destruction flows. This would be the missing
union struct tag you mentioned in the other email.

What I've been thinking of is changing type into flags. I think we
have now removed type from all drivers so this should be a small
enough work.

Nicolin should be able to look into some followup here, it is not a
small change.

> And frankly otherwise, what even is the benefit of moving the iova_cookie
> pointer into the union if we have to replace it with another whole pointer
> to make it work?

It makes a lot more semantic sense that the domain owners all share a
single "private data" pointer.

> This is just adding more code and more complexity in
> order to make struct iommu_domain... the same size it already is :/

That we get back the space we spent on sw_msi is a nice bonus.

Jason
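A rough sketch of the global-pointer arrangement being discussed here: a built-in stub that the iommufd module fills in at load time. Every name in it (iommu_set_iommufd_sw_msi(), iommufd_sw_msi_fn, iommufd_sw_msi()) is hypothetical; only the general pattern is what is being described.

/* In core code, always built in.  All names are illustrative. */
static int (*iommufd_sw_msi_fn)(struct iommu_domain *domain,
				struct msi_desc *desc, phys_addr_t msi_addr);

void iommu_set_iommufd_sw_msi(int (*fn)(struct iommu_domain *domain,
					struct msi_desc *desc,
					phys_addr_t msi_addr))
{
	/* Set on iommufd module load, cleared again on unload. */
	iommufd_sw_msi_fn = fn;
}
EXPORT_SYMBOL_GPL(iommu_set_iommufd_sw_msi);

/* In the iommufd module: */
static int __init iommufd_init(void)
{
	iommu_set_iommufd_sw_msi(iommufd_sw_msi);
	return 0;
}

static void __exit iommufd_exit(void)
{
	iommu_set_iommufd_sw_msi(NULL);
}

A real version would also have to guard against the pointer being cleared while an MSI is being prepared, but that is orthogonal to the layering question being argued here.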
On Thu, Feb 27, 2025 at 11:32:42AM -0400, Jason Gunthorpe wrote:
> On Thu, Feb 27, 2025 at 11:21:28AM +0000, Robin Murphy wrote:
> > All I'm saying is to hide the callback detail in the IOMMUFD code because
> > being IOMMUFD modular is unique to IOMMUFD and not the rest of the core
> > code's problem.
>
> Maybe we could use a global function pointer set/cleared on iommufd
> module load?
>
> Regardless, we need to first find a way for the core code to tell if
> the domain is iommufd owned or not.
>
> We should also make it so we can tell if dma-iommu.c is linked to that
> domain (eg vfio or the default_domain), then we can do the iova_cookie
> move without changing the destruction flows. This would be the missing
> union struct tag you mentioned in the other email.
>
> What I've been thinking of is changing type into flags. I think we
> have now removed type from all drivers so this should be a small
> enough work.
>
> Nicolin should be able to look into some followup here, it is not a
> small change.
>
> > And frankly otherwise, what even is the benefit of moving the iova_cookie
> > pointer into the union if we have to replace it with another whole pointer
> > to make it work?
>
> It makes a lot more semantic sense that the domain owners all share a
> single "private data" pointer.

I found it a bit confusing to use "owner", as domain->owner isn't the
same thing in this context. Maybe it should be "driver_ops"?

Then, "owner" could be another op structure that holds the owner-specific
things, such as:

	enum iommu_domain_owner { DMA/VFIO/IOMMUFD };	// or flag?
	union {
		iova_cookie;	// DMA
		msi_cookie;	// VFIO
		iommufd_hwpt;	// IOMMUFD
	}
	(*sw_msi);

?

Thanks
Nicolin
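In struct form, the shape Nicolin is sketching looks roughly like the following; the enum, the field names and the union member types are placeholders for this illustration, not existing definitions from the patch.

enum iommu_domain_owner {
	IOMMU_DOMAIN_OWNER_NONE,
	IOMMU_DOMAIN_OWNER_DMA,		/* dma-iommu.c */
	IOMMU_DOMAIN_OWNER_VFIO,	/* MSI cookie only */
	IOMMU_DOMAIN_OWNER_IOMMUFD,
};

struct iommu_domain {
	/* ... */
	enum iommu_domain_owner owner_type;	/* or a flags field */
	union {
		struct iommu_dma_cookie *iova_cookie;		/* DMA */
		struct iommu_dma_msi_cookie *msi_cookie;	/* VFIO */
		struct iommufd_hw_pagetable *iommufd_hwpt;	/* IOMMUFD */
	};
	int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc,
		      phys_addr_t msi_addr);
	/* ... */
};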
On Thu, Feb 27, 2025 at 09:46:55AM -0800, Nicolin Chen wrote:
> I found it a bit confusing to use "owner", as domain->owner isn't the
> same thing in this context. Maybe it should be "driver_ops"?

Maybe, but I wouldn't churn it

> Then, "owner" could be another op structure that holds the owner-specific
> things, such as:
> 	enum iommu_domain_owner { DMA/VFIO/IOMMUFD };	// or flag?

I was thinking about breaking type into something like this:

	u32 private_data_owner:2	// DMA/IOMMUFD/None
	u32 translation_type:3		// paging/identity/sva/platform/blocked/nested
	u32 dma_fq:1			// true/false
	u32 dma_api_domain:1		// true/false

Which is close to how it already is with just some breaking up of the
bits differently.. Get rid of the word unmanaged and drop the
IOMMU_DOMAIN_* defines.

I also wanted to separate the "policy" enum that determines which of
the three default domains you get from the type. Lots of type
combinations are not allowed as policy.

Jason
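Spelled out as a structure, the split Jason describes might look something like this; the enum constants and field names are only illustrative, not proposed definitions.

enum iommu_private_data_owner {
	IOMMU_OWNER_NONE,
	IOMMU_OWNER_DMA_IOMMU,
	IOMMU_OWNER_IOMMUFD,
};

enum iommu_translation_type {
	IOMMU_XLATE_PAGING,
	IOMMU_XLATE_IDENTITY,
	IOMMU_XLATE_SVA,
	IOMMU_XLATE_PLATFORM,
	IOMMU_XLATE_BLOCKED,
	IOMMU_XLATE_NESTED,
};

struct iommu_domain {
	/* ... */
	u32 private_data_owner:2;	/* enum iommu_private_data_owner */
	u32 translation_type:3;		/* enum iommu_translation_type */
	u32 dma_fq:1;			/* flush-queue invalidation in use */
	u32 dma_api_domain:1;		/* usable by the DMA API */
	/* ... */
};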
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index caee952febd4..761c5e186de9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -44,6 +44,8 @@ struct iommu_dma_cookie;
 struct iommu_fault_param;
 struct iommufd_ctx;
 struct iommufd_viommu;
+struct msi_desc;
+struct msi_msg;
 
 #define IOMMU_FAULT_PERM_READ	(1 << 0) /* read */
 #define IOMMU_FAULT_PERM_WRITE	(1 << 1) /* write */
@@ -216,6 +218,12 @@ struct iommu_domain {
 	struct iommu_domain_geometry geometry;
 	struct iommu_dma_cookie *iova_cookie;
 	int (*iopf_handler)(struct iopf_group *group);
+
+#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
+	int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc,
+		      phys_addr_t msi_addr);
+#endif
+
 	void *fault_data;
 	union {
 		struct {
@@ -234,6 +242,16 @@ struct iommu_domain {
 	};
 };
 
+static inline void iommu_domain_set_sw_msi(
+	struct iommu_domain *domain,
+	int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc,
+		      phys_addr_t msi_addr))
+{
+#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
+	domain->sw_msi = sw_msi;
+#endif
+}
+
 static inline bool iommu_is_dma_domain(struct iommu_domain *domain)
 {
 	return domain->type & __IOMMU_DOMAIN_DMA_API;
@@ -1470,6 +1488,18 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev)
 static inline void iommu_free_global_pasid(ioasid_t pasid) {}
 #endif /* CONFIG_IOMMU_API */
 
+#ifdef CONFIG_IRQ_MSI_IOMMU
+#ifdef CONFIG_IOMMU_API
+int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr);
+#else
+static inline int iommu_dma_prepare_msi(struct msi_desc *desc,
+					 phys_addr_t msi_addr)
+{
+	return 0;
+}
+#endif /* CONFIG_IOMMU_API */
+#endif /* CONFIG_IRQ_MSI_IOMMU */
+
 #if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API)
 void iommu_group_mutex_assert(struct device *dev);
 #else
@@ -1503,26 +1533,12 @@ static inline void iommu_debugfs_setup(void) {}
 #endif
 
 #ifdef CONFIG_IOMMU_DMA
-#include <linux/msi.h>
-
 int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
-
-int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr);
-
 #else /* CONFIG_IOMMU_DMA */
-
-struct msi_desc;
-struct msi_msg;
-
 static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
 {
 	return -ENODEV;
 }
-
-static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
-{
-	return 0;
-}
 #endif /* CONFIG_IOMMU_DMA */
 
 /*
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index bf91e014d179..3b58244e6344 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -24,6 +24,7 @@
 #include <linux/memremap.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/msi.h>
 #include <linux/of_iommu.h>
 #include <linux/pci.h>
 #include <linux/scatterlist.h>
@@ -102,6 +103,9 @@ static int __init iommu_dma_forcedac_setup(char *str)
 }
 early_param("iommu.forcedac", iommu_dma_forcedac_setup);
 
+static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+			    phys_addr_t msi_addr);
+
 /* Number of entries per flush queue */
 #define IOVA_DEFAULT_FQ_SIZE	256
 #define IOVA_SINGLE_FQ_SIZE	32768
@@ -398,6 +402,7 @@ int iommu_get_dma_cookie(struct iommu_domain *domain)
 		return -ENOMEM;
 
 	mutex_init(&domain->iova_cookie->mutex);
+	iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi);
 	return 0;
 }
 
@@ -429,6 +434,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
 
 	cookie->msi_iova = base;
 	domain->iova_cookie = cookie;
+	iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi);
 	return 0;
 }
 EXPORT_SYMBOL(iommu_get_msi_cookie);
@@ -443,6 +449,9 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
 	struct iommu_dma_msi_page *msi, *tmp;
 
+	if (domain->sw_msi != iommu_dma_sw_msi)
+		return;
+
 	if (!cookie)
 		return;
 
@@ -1800,33 +1809,19 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 	return NULL;
 }
 
-/**
- * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
- * @desc: MSI descriptor, will store the MSI page
- * @msi_addr: MSI target address to be mapped
- *
- * Return: 0 on success or negative error code if the mapping failed.
- */
-int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
+static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+			    phys_addr_t msi_addr)
 {
 	struct device *dev = msi_desc_to_dev(desc);
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
-	struct iommu_dma_msi_page *msi_page;
-	static DEFINE_MUTEX(msi_prepare_lock); /* see below */
+	const struct iommu_dma_msi_page *msi_page;
 
-	if (!domain || !domain->iova_cookie) {
+	if (!domain->iova_cookie) {
 		msi_desc_set_iommu_msi_iova(desc, 0, 0);
 		return 0;
 	}
 
-	/*
-	 * In fact the whole prepare operation should already be serialised by
-	 * irq_domain_mutex further up the callchain, but that's pretty subtle
-	 * on its own, so consider this locking as failsafe documentation...
-	 */
-	mutex_lock(&msi_prepare_lock);
+	iommu_group_mutex_assert(dev);
 	msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
-	mutex_unlock(&msi_prepare_lock);
 	if (!msi_page)
 		return -ENOMEM;
 
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 870c3cdbd0f6..022bf96a18c5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3596,3 +3596,32 @@ int iommu_replace_group_handle(struct iommu_group *group,
 	return ret;
 }
 EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
+
+#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
+/**
+ * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
+ * @desc: MSI descriptor, will store the MSI page
+ * @msi_addr: MSI target address to be mapped
+ *
+ * The implementation of sw_msi() should take msi_addr and map it to
+ * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the
+ * mapping information.
+ *
+ * Return: 0 on success or negative error code if the mapping failed.
+ */
+int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
+{
+	struct device *dev = msi_desc_to_dev(desc);
+	struct iommu_group *group = dev->iommu_group;
+	int ret = 0;
+
+	if (!group)
+		return 0;
+
+	mutex_lock(&group->mutex);
+	if (group->domain && group->domain->sw_msi)
+		ret = group->domain->sw_msi(group->domain, desc, msi_addr);
+	mutex_unlock(&group->mutex);
+	return ret;
+}
+#endif /* CONFIG_IRQ_MSI_IOMMU */
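To show how the new hook is intended to be consumed, here is a hedged sketch of a domain owner supplying its own sw_msi implementation. my_owner_sw_msi(), my_owner_init_domain() and MY_OWNER_MSI_IOVA are invented for the example; iommu_domain_set_sw_msi() and msi_desc_set_iommu_msi_iova() are the interfaces the patch adds and uses.

/*
 * Sketch only: a hypothetical domain owner that maps the MSI doorbell at a
 * fixed IOVA and reports that IOVA back to the MSI layer.
 */
#define MY_OWNER_MSI_IOVA	0x8000000UL	/* IOVA reserved for the MSI doorbell */

static int my_owner_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
			   phys_addr_t msi_addr)
{
	int ret;

	/* Map the MSI doorbell page into this domain at the reserved IOVA. */
	ret = iommu_map(domain, MY_OWNER_MSI_IOVA, msi_addr & PAGE_MASK,
			PAGE_SIZE, IOMMU_WRITE | IOMMU_MMIO, GFP_KERNEL);
	if (ret)
		return ret;

	/* Record the IOVA so the msi_msg is composed with the translated address. */
	msi_desc_set_iommu_msi_iova(desc, MY_OWNER_MSI_IOVA, PAGE_SHIFT);
	return 0;
}

static void my_owner_init_domain(struct iommu_domain *domain)
{
	/* Register the hook before the domain can be attached and receive MSIs. */
	iommu_domain_set_sw_msi(domain, my_owner_sw_msi);
}

A real implementation would have to remember which doorbell pages it has already mapped and handle multiple MSI descriptors, much as iommu_dma_get_msi_page() does in dma-iommu.c.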