Message ID | 20191128022550.9832-6-baolu.lu@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Use 1st-level for DMA remapping | expand |
On Thu, 28 Nov 2019 10:25:47 +0800 Lu Baolu <baolu.lu@linux.intel.com> wrote: > This adds functions to manipulate first level page tables > which could be used by a scalale mode capable IOMMU unit. > FL and SL page tables are very similar, and I presume we are not using all the flag bits in FL paging structures for DMA mapping. Are there enough relevant differences to warrant a new set of helper functions for FL? Or we can merge into one. > Cc: Ashok Raj <ashok.raj@intel.com> > Cc: Jacob Pan <jacob.jun.pan@linux.intel.com> > Cc: Kevin Tian <kevin.tian@intel.com> > Cc: Liu Yi L <yi.l.liu@intel.com> > Cc: Yi Sun <yi.y.sun@linux.intel.com> > Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com> > --- > drivers/iommu/Makefile | 2 +- > drivers/iommu/intel-iommu.c | 33 +++ > drivers/iommu/intel-pgtable.c | 376 > +++++++++++++++++++++++++++++ include/linux/intel-iommu.h | > 33 ++- include/trace/events/intel_iommu.h | 60 +++++ > 5 files changed, 502 insertions(+), 2 deletions(-) > create mode 100644 drivers/iommu/intel-pgtable.c > > diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile > index 35d17094fe3b..aa04f4c3ae26 100644 > --- a/drivers/iommu/Makefile > +++ b/drivers/iommu/Makefile > @@ -18,7 +18,7 @@ obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o > obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o > obj-$(CONFIG_DMAR_TABLE) += dmar.o > obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o > -obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o > +obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o intel-pgtable.o > obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o > obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o > obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o > diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c > index 66f76f6df2c2..a314892ee72b 100644 > --- a/drivers/iommu/intel-iommu.c > +++ b/drivers/iommu/intel-iommu.c > @@ -1670,6 +1670,37 @@ static void free_dmar_iommu(struct intel_iommu > *iommu) #endif > } > > +/* First level 5-level paging support */ > +static bool first_lvl_5lp_support(void) > +{ > + struct dmar_drhd_unit *drhd; > + struct intel_iommu *iommu; > + static int first_level_5lp_supported = -1; > + > + if (likely(first_level_5lp_supported != -1)) > + return first_level_5lp_supported; > + > + first_level_5lp_supported = 1; > +#ifdef CONFIG_X86 > + /* Match IOMMU first level and CPU paging mode */ > + if (!cpu_feature_enabled(X86_FEATURE_LA57)) { > + first_level_5lp_supported = 0; > + return first_level_5lp_supported; > + } > +#endif /* #ifdef CONFIG_X86 */ > + > + rcu_read_lock(); > + for_each_active_iommu(iommu, drhd) { > + if (!cap_5lp_support(iommu->cap)) { > + first_level_5lp_supported = 0; > + break; > + } > + } > + rcu_read_unlock(); > + > + return first_level_5lp_supported; > +} > + > static struct dmar_domain *alloc_domain(int flags) > { > struct dmar_domain *domain; > @@ -1683,6 +1714,8 @@ static struct dmar_domain *alloc_domain(int > flags) domain->flags = flags; > domain->has_iotlb_device = false; > domain->ops = &second_lvl_pgtable_ops; > + domain->first_lvl_5lp = first_lvl_5lp_support(); > + spin_lock_init(&domain->page_table_lock); > INIT_LIST_HEAD(&domain->devices); > > return domain; > diff --git a/drivers/iommu/intel-pgtable.c > b/drivers/iommu/intel-pgtable.c new file mode 100644 > index 000000000000..4a26d08a7570 > --- /dev/null > +++ b/drivers/iommu/intel-pgtable.c > @@ -0,0 +1,376 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/** > + * intel-pgtable.c - Intel IOMMU page table manipulation library > + * > + * Copyright (C) 2019 Intel Corporation > + * > + * Author: Lu Baolu <baolu.lu@linux.intel.com> > + */ > + > +#define pr_fmt(fmt) "DMAR: " fmt > +#include <linux/vmalloc.h> > +#include <linux/mm.h> > +#include <linux/sched.h> > +#include <linux/io.h> > +#include <linux/export.h> > +#include <linux/intel-iommu.h> > +#include <asm/cacheflush.h> > +#include <asm/pgtable.h> > +#include <asm/pgalloc.h> > +#include <trace/events/intel_iommu.h> > + > +/* > + * first_lvl_map: Map a range of IO virtual address to physical > addresses. > + */ > +#ifdef CONFIG_X86 > +#define pgtable_populate(domain, > nm) \ +do > { > \ > + void *__new = > alloc_pgtable_page(domain->nid); \ > + if > (!__new) \ > + return > -ENOMEM; \ > + > smp_wmb(); \ > + > spin_lock(&(domain)->page_table_lock); > \ > + if (nm ## _present(*nm)) > { \ > + > free_pgtable_page(__new); \ > + } else > { \ > + set_##nm(nm, __##nm(__pa(__new) | > _PAGE_TABLE)); \ > + domain_flush_cache(domain, nm, > sizeof(nm##_t)); \ > + } > \ > + > spin_unlock(&(domain)->page_table_lock); \ +} > while (0) + > +static int > +first_lvl_map_pte_range(struct dmar_domain *domain, pmd_t *pmd, > + unsigned long addr, unsigned long end, > + phys_addr_t phys_addr, pgprot_t prot) > +{ > + pte_t *pte, *first_pte; > + u64 pfn; > + > + pfn = phys_addr >> PAGE_SHIFT; > + if (unlikely(pmd_none(*pmd))) > + pgtable_populate(domain, pmd); > + > + first_pte = pte = pte_offset_kernel(pmd, addr); > + > + do { > + if (pte_present(*pte)) > + pr_crit("ERROR: PTE for vPFN 0x%llx already > set to 0x%llx\n", > + pfn, (unsigned long > long)pte_val(*pte)); > + set_pte(pte, pfn_pte(pfn, prot)); > + pfn++; > + } while (pte++, addr += PAGE_SIZE, addr != end); > + > + domain_flush_cache(domain, first_pte, (void *)pte - (void > *)first_pte); + > + return 0; > +} > + > +static int > +first_lvl_map_pmd_range(struct dmar_domain *domain, pud_t *pud, > + unsigned long addr, unsigned long end, > + phys_addr_t phys_addr, pgprot_t prot) > +{ > + unsigned long next; > + pmd_t *pmd; > + > + if (unlikely(pud_none(*pud))) > + pgtable_populate(domain, pud); > + pmd = pmd_offset(pud, addr); > + > + phys_addr -= addr; > + do { > + next = pmd_addr_end(addr, end); > + if (first_lvl_map_pte_range(domain, pmd, addr, next, > + phys_addr + addr, prot)) > + return -ENOMEM; > + } while (pmd++, addr = next, addr != end); > + > + return 0; > +} > + > +static int > +first_lvl_map_pud_range(struct dmar_domain *domain, p4d_t *p4d, > + unsigned long addr, unsigned long end, > + phys_addr_t phys_addr, pgprot_t prot) > +{ > + unsigned long next; > + pud_t *pud; > + > + if (unlikely(p4d_none(*p4d))) > + pgtable_populate(domain, p4d); > + > + pud = pud_offset(p4d, addr); > + > + phys_addr -= addr; > + do { > + next = pud_addr_end(addr, end); > + if (first_lvl_map_pmd_range(domain, pud, addr, next, > + phys_addr + addr, prot)) > + return -ENOMEM; > + } while (pud++, addr = next, addr != end); > + > + return 0; > +} > + > +static int > +first_lvl_map_p4d_range(struct dmar_domain *domain, pgd_t *pgd, > + unsigned long addr, unsigned long end, > + phys_addr_t phys_addr, pgprot_t prot) > +{ > + unsigned long next; > + p4d_t *p4d; > + > + if (domain->first_lvl_5lp && unlikely(pgd_none(*pgd))) > + pgtable_populate(domain, pgd); > + > + p4d = p4d_offset(pgd, addr); > + > + phys_addr -= addr; > + do { > + next = p4d_addr_end(addr, end); > + if (first_lvl_map_pud_range(domain, p4d, addr, next, > + phys_addr + addr, prot)) > + return -ENOMEM; > + } while (p4d++, addr = next, addr != end); > + > + return 0; > +} > + > +int first_lvl_map_range(struct dmar_domain *domain, unsigned long > addr, > + unsigned long end, phys_addr_t phys_addr, > int dma_prot) +{ > + unsigned long next; > + pgprot_t prot; > + pgd_t *pgd; > + > + trace_domain_mm_map(domain, addr, end, phys_addr); > + > + /* > + * There is no PAGE_KERNEL_WO for a pte entry, so let's use > RW > + * for a pte that requires write operation. > + */ > + prot = dma_prot & DMA_PTE_WRITE ? PAGE_KERNEL : > PAGE_KERNEL_RO; > + if (WARN_ON(addr >= end)) > + return -EINVAL; > + > + phys_addr -= addr; > + pgd = pgd_offset_pgd(domain->pgd, addr); > + do { > + next = pgd_addr_end(addr, end); > + if (first_lvl_map_p4d_range(domain, pgd, addr, next, > + phys_addr + addr, prot)) > + return -ENOMEM; > + } while (pgd++, addr = next, addr != end); > + > + return 0; > +} > + > +/* > + * first_lvl_unmap: Unmap an existing mapping between a range of IO > virtual > + * address and physical addresses. > + */ > +static struct page * > +first_lvl_unmap_pte_range(struct dmar_domain *domain, pmd_t *pmd, > + unsigned long addr, unsigned long end, > + struct page *freelist) > +{ > + unsigned long start; > + pte_t *pte, *first_pte; > + > + start = addr; > + pte = pte_offset_kernel(pmd, addr); > + first_pte = pte; > + do { > + set_pte(pte, __pte(0)); > + } while (pte++, addr += PAGE_SIZE, addr != end); > + > + domain_flush_cache(domain, first_pte, (void *)pte - (void > *)first_pte); + > + /* > + * Reclaim pmd page, lock is unnecessary here if it owns > + * the whole range. > + */ > + if (start != end && IS_ALIGNED(start | end, PMD_SIZE)) { > + struct page *pte_page; > + > + pte_page = pmd_page(*pmd); > + pte_page->freelist = freelist; > + freelist = pte_page; > + pmd_clear(pmd); > + domain_flush_cache(domain, pmd, sizeof(pmd_t)); > + } > + > + return freelist; > +} > + > +static struct page * > +first_lvl_unmap_pmd_range(struct dmar_domain *domain, pud_t *pud, > + unsigned long addr, unsigned long end, > + struct page *freelist) > +{ > + pmd_t *pmd; > + unsigned long start, next; > + > + start = addr; > + pmd = pmd_offset(pud, addr); > + do { > + next = pmd_addr_end(addr, end); > + if (pmd_none_or_clear_bad(pmd)) > + continue; > + freelist = first_lvl_unmap_pte_range(domain, pmd, > + addr, next, > freelist); > + } while (pmd++, addr = next, addr != end); > + > + /* > + * Reclaim pud page, lock is unnecessary here if it owns > + * the whole range. > + */ > + if (start != end && IS_ALIGNED(start | end, PUD_SIZE)) { > + struct page *pmd_page; > + > + pmd_page = pud_page(*pud); > + pmd_page->freelist = freelist; > + freelist = pmd_page; > + pud_clear(pud); > + domain_flush_cache(domain, pud, sizeof(pud_t)); > + } > + > + return freelist; > +} > + > +static struct page * > +first_lvl_unmap_pud_range(struct dmar_domain *domain, p4d_t *p4d, > + unsigned long addr, unsigned long end, > + struct page *freelist) > +{ > + pud_t *pud; > + unsigned long start, next; > + > + start = addr; > + pud = pud_offset(p4d, addr); > + do { > + next = pud_addr_end(addr, end); > + if (pud_none_or_clear_bad(pud)) > + continue; > + freelist = first_lvl_unmap_pmd_range(domain, pud, > + addr, next, > freelist); > + } while (pud++, addr = next, addr != end); > + > + /* > + * Reclaim p4d page, lock is unnecessary here if it owns > + * the whole range. > + */ > + if (start != end && IS_ALIGNED(start | end, P4D_SIZE)) { > + struct page *pud_page; > + > + pud_page = p4d_page(*p4d); > + pud_page->freelist = freelist; > + freelist = pud_page; > + p4d_clear(p4d); > + domain_flush_cache(domain, p4d, sizeof(p4d_t)); > + } > + > + return freelist; > +} > + > +static struct page * > +first_lvl_unmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd, > + unsigned long addr, unsigned long end, > + struct page *freelist) > +{ > + p4d_t *p4d; > + unsigned long start, next; > + > + start = addr; > + p4d = p4d_offset(pgd, addr); > + do { > + next = p4d_addr_end(addr, end); > + if (p4d_none_or_clear_bad(p4d)) > + continue; > + freelist = first_lvl_unmap_pud_range(domain, p4d, > + addr, next, > freelist); > + } while (p4d++, addr = next, addr != end); > + > + /* > + * Reclaim pgd page, lock is unnecessary here if it owns > + * the whole range. > + */ > + if (domain->first_lvl_5lp && start != end && > + IS_ALIGNED(start | end, PGDIR_SIZE)) { > + struct page *p4d_page; > + > + p4d_page = pgd_page(*pgd); > + p4d_page->freelist = freelist; > + freelist = p4d_page; > + pgd_clear(pgd); > + domain_flush_cache(domain, pgd, sizeof(pgd_t)); > + } > + > + return freelist; > +} > + > +struct page *first_lvl_unmap_range(struct dmar_domain *domain, > + unsigned long addr, unsigned long > end) +{ > + pgd_t *pgd; > + unsigned long next; > + struct page *freelist = NULL; > + > + trace_domain_mm_unmap(domain, addr, end); > + > + if (WARN_ON(addr >= end)) > + return NULL; > + > + pgd = pgd_offset_pgd(domain->pgd, addr); > + do { > + next = pgd_addr_end(addr, end); > + if (pgd_none_or_clear_bad(pgd)) > + continue; > + freelist = first_lvl_unmap_p4d_range(domain, pgd, > + addr, next, > freelist); > + } while (pgd++, addr = next, addr != end); > + > + return freelist; > +} > + > +static pte_t *iova_to_pte(struct dmar_domain *domain, unsigned long > iova) +{ > + pgd_t *pgd; > + p4d_t *p4d; > + pud_t *pud; > + pmd_t *pmd; > + > + if (WARN_ON_ONCE(!IS_ALIGNED(iova, PAGE_SIZE))) > + return NULL; > + > + pgd = pgd_offset_pgd(domain->pgd, iova); > + if (pgd_none_or_clear_bad(pgd)) > + return NULL; > + > + p4d = p4d_offset(pgd, iova); > + if (p4d_none_or_clear_bad(p4d)) > + return NULL; > + > + pud = pud_offset(p4d, iova); > + if (pud_none_or_clear_bad(pud)) > + return NULL; > + > + pmd = pmd_offset(pud, iova); > + if (pmd_none_or_clear_bad(pmd)) > + return NULL; > + > + return pte_offset_kernel(pmd, iova); > +} > + > +phys_addr_t > +first_lvl_iova_to_phys(struct dmar_domain *domain, unsigned long > iova) +{ > + pte_t *pte = iova_to_pte(domain, PAGE_ALIGN(iova)); > + > + if (!pte || !pte_present(*pte)) > + return 0; > + > + return (pte_val(*pte) & PTE_PFN_MASK) | (iova & ~PAGE_MASK); > +} > +#endif /* CONFIG_X86 */ > diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h > index 9b259756057b..9273e3f59078 100644 > --- a/include/linux/intel-iommu.h > +++ b/include/linux/intel-iommu.h > @@ -540,9 +540,11 @@ struct dmar_domain { > struct iova_domain iovad; /* iova's that belong to > this domain */ > /* page table used by this domain */ > - struct dma_pte *pgd; /* virtual > address */ > + void *pgd; /* virtual address > */ > + spinlock_t page_table_lock; /* Protects page tables */ > int gaw; /* max guest address > width */ const struct pgtable_ops *ops; /* page table ops */ > + bool first_lvl_5lp; /* First level > 5-level paging support */ > /* adjusted guest address width, 0 is level 2 30-bit */ > int agaw; > @@ -708,6 +710,35 @@ int for_each_device_domain(int (*fn)(struct > device_domain_info *info, void iommu_flush_write_buffer(struct > intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu > *iommu, struct device *dev); > +#ifdef CONFIG_X86 > +int first_lvl_map_range(struct dmar_domain *domain, unsigned long > addr, > + unsigned long end, phys_addr_t phys_addr, > int dma_prot); +struct page *first_lvl_unmap_range(struct dmar_domain > *domain, > + unsigned long addr, unsigned long > end); +phys_addr_t first_lvl_iova_to_phys(struct dmar_domain *domain, > + unsigned long iova); > +#else > +static inline int > +first_lvl_map_range(struct dmar_domain *domain, unsigned long addr, > + unsigned long end, phys_addr_t phys_addr, int > dma_prot) +{ > + return -ENODEV; > +} > + > +static inline struct page * > +first_lvl_unmap_range(struct dmar_domain *domain, > + unsigned long addr, unsigned long end) > +{ > + return NULL; > +} > + > +static inline phys_addr_t > +first_lvl_iova_to_phys(struct dmar_domain *domain, unsigned long > iova) +{ > + return 0; > +} > +#endif /* CONFIG_X86 */ > + > #ifdef CONFIG_INTEL_IOMMU_SVM > extern void intel_svm_check(struct intel_iommu *iommu); > extern int intel_svm_enable_prq(struct intel_iommu *iommu); > diff --git a/include/trace/events/intel_iommu.h > b/include/trace/events/intel_iommu.h index 54e61d456cdf..e8c95290fd13 > 100644 --- a/include/trace/events/intel_iommu.h > +++ b/include/trace/events/intel_iommu.h > @@ -99,6 +99,66 @@ DEFINE_EVENT(dma_unmap, bounce_unmap_single, > TP_ARGS(dev, dev_addr, size) > ); > > +DECLARE_EVENT_CLASS(domain_map, > + TP_PROTO(struct dmar_domain *domain, unsigned long addr, > + unsigned long end, phys_addr_t phys_addr), > + > + TP_ARGS(domain, addr, end, phys_addr), > + > + TP_STRUCT__entry( > + __field(struct dmar_domain *, domain) > + __field(unsigned long, addr) > + __field(unsigned long, end) > + __field(phys_addr_t, phys_addr) > + ), > + > + TP_fast_assign( > + __entry->domain = domain; > + __entry->addr = addr; > + __entry->end = end; > + __entry->phys_addr = phys_addr; > + ), > + > + TP_printk("domain=%p addr=0x%lx end=0x%lx phys_addr=0x%llx", > + __entry->domain, __entry->addr, __entry->end, > + (unsigned long long)__entry->phys_addr) > +); > + > +DEFINE_EVENT(domain_map, domain_mm_map, > + TP_PROTO(struct dmar_domain *domain, unsigned long addr, > + unsigned long end, phys_addr_t phys_addr), > + > + TP_ARGS(domain, addr, end, phys_addr) > +); > + > +DECLARE_EVENT_CLASS(domain_unmap, > + TP_PROTO(struct dmar_domain *domain, unsigned long addr, > + unsigned long end), > + > + TP_ARGS(domain, addr, end), > + > + TP_STRUCT__entry( > + __field(struct dmar_domain *, domain) > + __field(unsigned long, addr) > + __field(unsigned long, end) > + ), > + > + TP_fast_assign( > + __entry->domain = domain; > + __entry->addr = addr; > + __entry->end = end; > + ), > + > + TP_printk("domain=%p addr=0x%lx end=0x%lx", > + __entry->domain, __entry->addr, __entry->end) > +); > + > +DEFINE_EVENT(domain_unmap, domain_mm_unmap, > + TP_PROTO(struct dmar_domain *domain, unsigned long addr, > + unsigned long end), > + > + TP_ARGS(domain, addr, end) > +); > #endif /* _TRACE_INTEL_IOMMU_H */ > > /* This part must be outside protection */ [Jacob Pan]
Hi, On 12/3/19 7:27 AM, Jacob Pan wrote: > On Thu, 28 Nov 2019 10:25:47 +0800 > Lu Baolu<baolu.lu@linux.intel.com> wrote: > >> This adds functions to manipulate first level page tables >> which could be used by a scalale mode capable IOMMU unit. >> > FL and SL page tables are very similar, and I presume we are not using > all the flag bits in FL paging structures for DMA mapping. Are there > enough relevant differences to warrant a new set of helper functions > for FL? Or we can merge into one. > I ever thought about this and I am still open for this suggestion. We had a quick compare on these two page tables. The only concern is the read/write/present encoding. The present bit in first level implies read permission while second level page table explicitly has a READ bit. (recalled from memory, correct me if it's bad. :-)). Anyway, let's listen to more opinions. Best regards, baolu
Hi Jacob, On 12/3/19 7:27 AM, Jacob Pan wrote: > On Thu, 28 Nov 2019 10:25:47 +0800 > Lu Baolu <baolu.lu@linux.intel.com> wrote: > >> This adds functions to manipulate first level page tables >> which could be used by a scalale mode capable IOMMU unit. >> > FL and SL page tables are very similar, and I presume we are not using > all the flag bits in FL paging structures for DMA mapping. Are there > enough relevant differences to warrant a new set of helper functions > for FL? Or we can merge into one. I evaluated your suggestion these days. It turned out that your suggestion make code simpler and easier for maintainence. Thank you for the comment and I will send out a new version for review soon. Best regards, baolu
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 35d17094fe3b..aa04f4c3ae26 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o -obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o +obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o intel-pgtable.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 66f76f6df2c2..a314892ee72b 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1670,6 +1670,37 @@ static void free_dmar_iommu(struct intel_iommu *iommu) #endif } +/* First level 5-level paging support */ +static bool first_lvl_5lp_support(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + static int first_level_5lp_supported = -1; + + if (likely(first_level_5lp_supported != -1)) + return first_level_5lp_supported; + + first_level_5lp_supported = 1; +#ifdef CONFIG_X86 + /* Match IOMMU first level and CPU paging mode */ + if (!cpu_feature_enabled(X86_FEATURE_LA57)) { + first_level_5lp_supported = 0; + return first_level_5lp_supported; + } +#endif /* #ifdef CONFIG_X86 */ + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!cap_5lp_support(iommu->cap)) { + first_level_5lp_supported = 0; + break; + } + } + rcu_read_unlock(); + + return first_level_5lp_supported; +} + static struct dmar_domain *alloc_domain(int flags) { struct dmar_domain *domain; @@ -1683,6 +1714,8 @@ static struct dmar_domain *alloc_domain(int flags) domain->flags = flags; domain->has_iotlb_device = false; domain->ops = &second_lvl_pgtable_ops; + domain->first_lvl_5lp = first_lvl_5lp_support(); + spin_lock_init(&domain->page_table_lock); INIT_LIST_HEAD(&domain->devices); return domain; diff --git a/drivers/iommu/intel-pgtable.c b/drivers/iommu/intel-pgtable.c new file mode 100644 index 000000000000..4a26d08a7570 --- /dev/null +++ b/drivers/iommu/intel-pgtable.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * intel-pgtable.c - Intel IOMMU page table manipulation library + * + * Copyright (C) 2019 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + */ + +#define pr_fmt(fmt) "DMAR: " fmt +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/io.h> +#include <linux/export.h> +#include <linux/intel-iommu.h> +#include <asm/cacheflush.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <trace/events/intel_iommu.h> + +/* + * first_lvl_map: Map a range of IO virtual address to physical addresses. + */ +#ifdef CONFIG_X86 +#define pgtable_populate(domain, nm) \ +do { \ + void *__new = alloc_pgtable_page(domain->nid); \ + if (!__new) \ + return -ENOMEM; \ + smp_wmb(); \ + spin_lock(&(domain)->page_table_lock); \ + if (nm ## _present(*nm)) { \ + free_pgtable_page(__new); \ + } else { \ + set_##nm(nm, __##nm(__pa(__new) | _PAGE_TABLE)); \ + domain_flush_cache(domain, nm, sizeof(nm##_t)); \ + } \ + spin_unlock(&(domain)->page_table_lock); \ +} while (0) + +static int +first_lvl_map_pte_range(struct dmar_domain *domain, pmd_t *pmd, + unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) +{ + pte_t *pte, *first_pte; + u64 pfn; + + pfn = phys_addr >> PAGE_SHIFT; + if (unlikely(pmd_none(*pmd))) + pgtable_populate(domain, pmd); + + first_pte = pte = pte_offset_kernel(pmd, addr); + + do { + if (pte_present(*pte)) + pr_crit("ERROR: PTE for vPFN 0x%llx already set to 0x%llx\n", + pfn, (unsigned long long)pte_val(*pte)); + set_pte(pte, pfn_pte(pfn, prot)); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + + domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); + + return 0; +} + +static int +first_lvl_map_pmd_range(struct dmar_domain *domain, pud_t *pud, + unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) +{ + unsigned long next; + pmd_t *pmd; + + if (unlikely(pud_none(*pud))) + pgtable_populate(domain, pud); + pmd = pmd_offset(pud, addr); + + phys_addr -= addr; + do { + next = pmd_addr_end(addr, end); + if (first_lvl_map_pte_range(domain, pmd, addr, next, + phys_addr + addr, prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + + return 0; +} + +static int +first_lvl_map_pud_range(struct dmar_domain *domain, p4d_t *p4d, + unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) +{ + unsigned long next; + pud_t *pud; + + if (unlikely(p4d_none(*p4d))) + pgtable_populate(domain, p4d); + + pud = pud_offset(p4d, addr); + + phys_addr -= addr; + do { + next = pud_addr_end(addr, end); + if (first_lvl_map_pmd_range(domain, pud, addr, next, + phys_addr + addr, prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + + return 0; +} + +static int +first_lvl_map_p4d_range(struct dmar_domain *domain, pgd_t *pgd, + unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) +{ + unsigned long next; + p4d_t *p4d; + + if (domain->first_lvl_5lp && unlikely(pgd_none(*pgd))) + pgtable_populate(domain, pgd); + + p4d = p4d_offset(pgd, addr); + + phys_addr -= addr; + do { + next = p4d_addr_end(addr, end); + if (first_lvl_map_pud_range(domain, p4d, addr, next, + phys_addr + addr, prot)) + return -ENOMEM; + } while (p4d++, addr = next, addr != end); + + return 0; +} + +int first_lvl_map_range(struct dmar_domain *domain, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, int dma_prot) +{ + unsigned long next; + pgprot_t prot; + pgd_t *pgd; + + trace_domain_mm_map(domain, addr, end, phys_addr); + + /* + * There is no PAGE_KERNEL_WO for a pte entry, so let's use RW + * for a pte that requires write operation. + */ + prot = dma_prot & DMA_PTE_WRITE ? PAGE_KERNEL : PAGE_KERNEL_RO; + if (WARN_ON(addr >= end)) + return -EINVAL; + + phys_addr -= addr; + pgd = pgd_offset_pgd(domain->pgd, addr); + do { + next = pgd_addr_end(addr, end); + if (first_lvl_map_p4d_range(domain, pgd, addr, next, + phys_addr + addr, prot)) + return -ENOMEM; + } while (pgd++, addr = next, addr != end); + + return 0; +} + +/* + * first_lvl_unmap: Unmap an existing mapping between a range of IO virtual + * address and physical addresses. + */ +static struct page * +first_lvl_unmap_pte_range(struct dmar_domain *domain, pmd_t *pmd, + unsigned long addr, unsigned long end, + struct page *freelist) +{ + unsigned long start; + pte_t *pte, *first_pte; + + start = addr; + pte = pte_offset_kernel(pmd, addr); + first_pte = pte; + do { + set_pte(pte, __pte(0)); + } while (pte++, addr += PAGE_SIZE, addr != end); + + domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); + + /* + * Reclaim pmd page, lock is unnecessary here if it owns + * the whole range. + */ + if (start != end && IS_ALIGNED(start | end, PMD_SIZE)) { + struct page *pte_page; + + pte_page = pmd_page(*pmd); + pte_page->freelist = freelist; + freelist = pte_page; + pmd_clear(pmd); + domain_flush_cache(domain, pmd, sizeof(pmd_t)); + } + + return freelist; +} + +static struct page * +first_lvl_unmap_pmd_range(struct dmar_domain *domain, pud_t *pud, + unsigned long addr, unsigned long end, + struct page *freelist) +{ + pmd_t *pmd; + unsigned long start, next; + + start = addr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + freelist = first_lvl_unmap_pte_range(domain, pmd, + addr, next, freelist); + } while (pmd++, addr = next, addr != end); + + /* + * Reclaim pud page, lock is unnecessary here if it owns + * the whole range. + */ + if (start != end && IS_ALIGNED(start | end, PUD_SIZE)) { + struct page *pmd_page; + + pmd_page = pud_page(*pud); + pmd_page->freelist = freelist; + freelist = pmd_page; + pud_clear(pud); + domain_flush_cache(domain, pud, sizeof(pud_t)); + } + + return freelist; +} + +static struct page * +first_lvl_unmap_pud_range(struct dmar_domain *domain, p4d_t *p4d, + unsigned long addr, unsigned long end, + struct page *freelist) +{ + pud_t *pud; + unsigned long start, next; + + start = addr; + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + freelist = first_lvl_unmap_pmd_range(domain, pud, + addr, next, freelist); + } while (pud++, addr = next, addr != end); + + /* + * Reclaim p4d page, lock is unnecessary here if it owns + * the whole range. + */ + if (start != end && IS_ALIGNED(start | end, P4D_SIZE)) { + struct page *pud_page; + + pud_page = p4d_page(*p4d); + pud_page->freelist = freelist; + freelist = pud_page; + p4d_clear(p4d); + domain_flush_cache(domain, p4d, sizeof(p4d_t)); + } + + return freelist; +} + +static struct page * +first_lvl_unmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd, + unsigned long addr, unsigned long end, + struct page *freelist) +{ + p4d_t *p4d; + unsigned long start, next; + + start = addr; + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + freelist = first_lvl_unmap_pud_range(domain, p4d, + addr, next, freelist); + } while (p4d++, addr = next, addr != end); + + /* + * Reclaim pgd page, lock is unnecessary here if it owns + * the whole range. + */ + if (domain->first_lvl_5lp && start != end && + IS_ALIGNED(start | end, PGDIR_SIZE)) { + struct page *p4d_page; + + p4d_page = pgd_page(*pgd); + p4d_page->freelist = freelist; + freelist = p4d_page; + pgd_clear(pgd); + domain_flush_cache(domain, pgd, sizeof(pgd_t)); + } + + return freelist; +} + +struct page *first_lvl_unmap_range(struct dmar_domain *domain, + unsigned long addr, unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + struct page *freelist = NULL; + + trace_domain_mm_unmap(domain, addr, end); + + if (WARN_ON(addr >= end)) + return NULL; + + pgd = pgd_offset_pgd(domain->pgd, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + freelist = first_lvl_unmap_p4d_range(domain, pgd, + addr, next, freelist); + } while (pgd++, addr = next, addr != end); + + return freelist; +} + +static pte_t *iova_to_pte(struct dmar_domain *domain, unsigned long iova) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + if (WARN_ON_ONCE(!IS_ALIGNED(iova, PAGE_SIZE))) + return NULL; + + pgd = pgd_offset_pgd(domain->pgd, iova); + if (pgd_none_or_clear_bad(pgd)) + return NULL; + + p4d = p4d_offset(pgd, iova); + if (p4d_none_or_clear_bad(p4d)) + return NULL; + + pud = pud_offset(p4d, iova); + if (pud_none_or_clear_bad(pud)) + return NULL; + + pmd = pmd_offset(pud, iova); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + + return pte_offset_kernel(pmd, iova); +} + +phys_addr_t +first_lvl_iova_to_phys(struct dmar_domain *domain, unsigned long iova) +{ + pte_t *pte = iova_to_pte(domain, PAGE_ALIGN(iova)); + + if (!pte || !pte_present(*pte)) + return 0; + + return (pte_val(*pte) & PTE_PFN_MASK) | (iova & ~PAGE_MASK); +} +#endif /* CONFIG_X86 */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 9b259756057b..9273e3f59078 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -540,9 +540,11 @@ struct dmar_domain { struct iova_domain iovad; /* iova's that belong to this domain */ /* page table used by this domain */ - struct dma_pte *pgd; /* virtual address */ + void *pgd; /* virtual address */ + spinlock_t page_table_lock; /* Protects page tables */ int gaw; /* max guest address width */ const struct pgtable_ops *ops; /* page table ops */ + bool first_lvl_5lp; /* First level 5-level paging support */ /* adjusted guest address width, 0 is level 2 30-bit */ int agaw; @@ -708,6 +710,35 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info, void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); +#ifdef CONFIG_X86 +int first_lvl_map_range(struct dmar_domain *domain, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, int dma_prot); +struct page *first_lvl_unmap_range(struct dmar_domain *domain, + unsigned long addr, unsigned long end); +phys_addr_t first_lvl_iova_to_phys(struct dmar_domain *domain, + unsigned long iova); +#else +static inline int +first_lvl_map_range(struct dmar_domain *domain, unsigned long addr, + unsigned long end, phys_addr_t phys_addr, int dma_prot) +{ + return -ENODEV; +} + +static inline struct page * +first_lvl_unmap_range(struct dmar_domain *domain, + unsigned long addr, unsigned long end) +{ + return NULL; +} + +static inline phys_addr_t +first_lvl_iova_to_phys(struct dmar_domain *domain, unsigned long iova) +{ + return 0; +} +#endif /* CONFIG_X86 */ + #ifdef CONFIG_INTEL_IOMMU_SVM extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h index 54e61d456cdf..e8c95290fd13 100644 --- a/include/trace/events/intel_iommu.h +++ b/include/trace/events/intel_iommu.h @@ -99,6 +99,66 @@ DEFINE_EVENT(dma_unmap, bounce_unmap_single, TP_ARGS(dev, dev_addr, size) ); +DECLARE_EVENT_CLASS(domain_map, + TP_PROTO(struct dmar_domain *domain, unsigned long addr, + unsigned long end, phys_addr_t phys_addr), + + TP_ARGS(domain, addr, end, phys_addr), + + TP_STRUCT__entry( + __field(struct dmar_domain *, domain) + __field(unsigned long, addr) + __field(unsigned long, end) + __field(phys_addr_t, phys_addr) + ), + + TP_fast_assign( + __entry->domain = domain; + __entry->addr = addr; + __entry->end = end; + __entry->phys_addr = phys_addr; + ), + + TP_printk("domain=%p addr=0x%lx end=0x%lx phys_addr=0x%llx", + __entry->domain, __entry->addr, __entry->end, + (unsigned long long)__entry->phys_addr) +); + +DEFINE_EVENT(domain_map, domain_mm_map, + TP_PROTO(struct dmar_domain *domain, unsigned long addr, + unsigned long end, phys_addr_t phys_addr), + + TP_ARGS(domain, addr, end, phys_addr) +); + +DECLARE_EVENT_CLASS(domain_unmap, + TP_PROTO(struct dmar_domain *domain, unsigned long addr, + unsigned long end), + + TP_ARGS(domain, addr, end), + + TP_STRUCT__entry( + __field(struct dmar_domain *, domain) + __field(unsigned long, addr) + __field(unsigned long, end) + ), + + TP_fast_assign( + __entry->domain = domain; + __entry->addr = addr; + __entry->end = end; + ), + + TP_printk("domain=%p addr=0x%lx end=0x%lx", + __entry->domain, __entry->addr, __entry->end) +); + +DEFINE_EVENT(domain_unmap, domain_mm_unmap, + TP_PROTO(struct dmar_domain *domain, unsigned long addr, + unsigned long end), + + TP_ARGS(domain, addr, end) +); #endif /* _TRACE_INTEL_IOMMU_H */ /* This part must be outside protection */
This adds functions to manipulate first level page tables which could be used by a scalale mode capable IOMMU unit. Cc: Ashok Raj <ashok.raj@intel.com> Cc: Jacob Pan <jacob.jun.pan@linux.intel.com> Cc: Kevin Tian <kevin.tian@intel.com> Cc: Liu Yi L <yi.l.liu@intel.com> Cc: Yi Sun <yi.y.sun@linux.intel.com> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com> --- drivers/iommu/Makefile | 2 +- drivers/iommu/intel-iommu.c | 33 +++ drivers/iommu/intel-pgtable.c | 376 +++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 33 ++- include/trace/events/intel_iommu.h | 60 +++++ 5 files changed, 502 insertions(+), 2 deletions(-) create mode 100644 drivers/iommu/intel-pgtable.c