Message ID | 20240222173942.1481394-2-pasha.tatashin@soleen.com (mailing list archive) |
---|---|
State | New |
Series | IOMMU memory observability
Hi Pasha, On 2024-02-22 at 23:09:27, Pasha Tatashin (pasha.tatashin@soleen.com) wrote: > In order to improve observability and accountability of IOMMU layer, we > must account the number of pages that are allocated by functions that > are calling directly into buddy allocator. > > This is achieved by first wrapping the allocation related functions into a > separate inline functions in new file: > > drivers/iommu/iommu-pages.h > > Convert all page allocation calls under iommu/intel to use these new > functions. > > Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com> > Acked-by: David Rientjes <rientjes@google.com> > Tested-by: Bagas Sanjaya <bagasdotme@gmail.com> > --- > drivers/iommu/intel/dmar.c | 16 +-- > drivers/iommu/intel/iommu.c | 47 +++------ > drivers/iommu/intel/iommu.h | 2 - > drivers/iommu/intel/irq_remapping.c | 16 +-- > drivers/iommu/intel/pasid.c | 18 ++-- > drivers/iommu/intel/svm.c | 11 +- > drivers/iommu/iommu-pages.h | 154 ++++++++++++++++++++++++++++ > 7 files changed, 201 insertions(+), 63 deletions(-) > create mode 100644 drivers/iommu/iommu-pages.h Few minor nits. > > diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c > index 23cb80d62a9a..ff6045ae8e97 100644 > --- a/drivers/iommu/intel/dmar.c > +++ b/drivers/iommu/intel/dmar.c > @@ -32,6 +32,7 @@ > > #include "iommu.h" > #include "../irq_remapping.h" > +#include "../iommu-pages.h" > #include "perf.h" > #include "trace.h" > #include "perfmon.h" > @@ -1185,7 +1186,7 @@ static void free_iommu(struct intel_iommu *iommu) > } > > if (iommu->qi) { > - free_page((unsigned long)iommu->qi->desc); > + iommu_free_page(iommu->qi->desc); > kfree(iommu->qi->desc_status); > kfree(iommu->qi); > } > @@ -1731,7 +1732,8 @@ static void __dmar_enable_qi(struct intel_iommu *iommu) > int dmar_enable_qi(struct intel_iommu *iommu) > { > struct q_inval *qi; > - struct page *desc_page; > + void *desc; > + int order; > > if (!ecap_qis(iommu->ecap)) > return -ENOENT; > @@ -1752,19 +1754,19 @@ int dmar_enable_qi(struct intel_iommu *iommu) > * Need two pages to accommodate 256 descriptors of 256 bits each > * if the remapping hardware supports scalable mode translation. > */ > - desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, > - !!ecap_smts(iommu->ecap)); > - if (!desc_page) { > + order = ecap_smts(iommu->ecap) ? 
1 : 0; > + desc = iommu_alloc_pages_node(iommu->node, GFP_ATOMIC, order); > + if (!desc) { > kfree(qi); > iommu->qi = NULL; > return -ENOMEM; > } > > - qi->desc = page_address(desc_page); > + qi->desc = desc; > > qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC); > if (!qi->desc_status) { > - free_page((unsigned long) qi->desc); > + iommu_free_page(qi->desc); > kfree(qi); > iommu->qi = NULL; > return -ENOMEM; > diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c > index 6fb5f6fceea1..2c676f46e38c 100644 > --- a/drivers/iommu/intel/iommu.c > +++ b/drivers/iommu/intel/iommu.c > @@ -28,6 +28,7 @@ > #include "../dma-iommu.h" > #include "../irq_remapping.h" > #include "../iommu-sva.h" > +#include "../iommu-pages.h" > #include "pasid.h" > #include "cap_audit.h" > #include "perfmon.h" > @@ -224,22 +225,6 @@ static int __init intel_iommu_setup(char *str) > } > __setup("intel_iommu=", intel_iommu_setup); > > -void *alloc_pgtable_page(int node, gfp_t gfp) > -{ > - struct page *page; > - void *vaddr = NULL; > - > - page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); > - if (page) > - vaddr = page_address(page); > - return vaddr; > -} > - > -void free_pgtable_page(void *vaddr) > -{ > - free_page((unsigned long)vaddr); > -} > - > static int domain_type_is_si(struct dmar_domain *domain) > { > return domain->domain.type == IOMMU_DOMAIN_IDENTITY; > @@ -473,7 +458,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, > if (!alloc) > return NULL; > > - context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); > + context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); > if (!context) > return NULL; > > @@ -647,17 +632,17 @@ static void free_context_table(struct intel_iommu *iommu) > for (i = 0; i < ROOT_ENTRY_NR; i++) { > context = iommu_context_addr(iommu, i, 0, 0); > if (context) > - free_pgtable_page(context); > + iommu_free_page(context); > > if (!sm_supported(iommu)) > continue; > > context = iommu_context_addr(iommu, i, 0x80, 0); > if (context) > - free_pgtable_page(context); > + iommu_free_page(context); > } > > - free_pgtable_page(iommu->root_entry); > + iommu_free_page(iommu->root_entry); > iommu->root_entry = NULL; > } > > @@ -795,7 +780,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, > if (!dma_pte_present(pte)) { > uint64_t pteval; > > - tmp_page = alloc_pgtable_page(domain->nid, gfp); > + tmp_page = iommu_alloc_page_node(domain->nid, gfp); > > if (!tmp_page) > return NULL; > @@ -807,7 +792,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, > > if (cmpxchg64(&pte->val, 0ULL, pteval)) > /* Someone else set it while we were thinking; use theirs. 
*/ > - free_pgtable_page(tmp_page); > + iommu_free_page(tmp_page); > else > domain_flush_cache(domain, pte, sizeof(*pte)); > } > @@ -920,7 +905,7 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level, > last_pfn < level_pfn + level_size(level) - 1)) { > dma_clear_pte(pte); > domain_flush_cache(domain, pte, sizeof(*pte)); > - free_pgtable_page(level_pte); > + iommu_free_page(level_pte); > } > next: > pfn += level_size(level); > @@ -944,7 +929,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, > > /* free pgd */ > if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { > - free_pgtable_page(domain->pgd); > + iommu_free_page(domain->pgd); > domain->pgd = NULL; > } > } > @@ -1046,7 +1031,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu) > { > struct root_entry *root; > > - root = alloc_pgtable_page(iommu->node, GFP_ATOMIC); > + root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); > if (!root) { > pr_err("Allocating root entry for %s failed\n", > iommu->name); > @@ -1718,7 +1703,7 @@ static void domain_exit(struct dmar_domain *domain) > LIST_HEAD(freelist); > > domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); > - put_pages_list(&freelist); > + iommu_put_pages_list(&freelist); > } > > if (WARN_ON(!list_empty(&domain->devices))) > @@ -2452,7 +2437,7 @@ static int copy_context_table(struct intel_iommu *iommu, > if (!old_ce) > goto out; > > - new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); > + new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL); > if (!new_ce) > goto out_unmap; > > @@ -3385,7 +3370,7 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb, > start_vpfn, mhp->nr_pages, > list_empty(&freelist), 0); > rcu_read_unlock(); > - put_pages_list(&freelist); > + iommu_put_pages_list(&freelist); > } > break; > } > @@ -3816,7 +3801,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) > domain->max_addr = 0; > > /* always allocate the top pgd */ > - domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); > + domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); > if (!domain->pgd) > return -ENOMEM; > domain_flush_cache(domain, domain->pgd, PAGE_SIZE); > @@ -3960,7 +3945,7 @@ int prepare_domain_attach_device(struct iommu_domain *domain, > pte = dmar_domain->pgd; > if (dma_pte_present(pte)) { > dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); > - free_pgtable_page(pte); > + iommu_free_page(pte); > } > dmar_domain->agaw--; > } > @@ -4107,7 +4092,7 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain, > start_pfn, nrpages, > list_empty(&gather->freelist), 0); > > - put_pages_list(&gather->freelist); > + iommu_put_pages_list(&gather->freelist); > } > > static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, > diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h > index d02f916d8e59..9fe04cea29c4 100644 > --- a/drivers/iommu/intel/iommu.h > +++ b/drivers/iommu/intel/iommu.h > @@ -1069,8 +1069,6 @@ void domain_update_iommu_cap(struct dmar_domain *domain); > > int dmar_ir_support(void); > > -void *alloc_pgtable_page(int node, gfp_t gfp); > -void free_pgtable_page(void *vaddr); > void iommu_flush_write_buffer(struct intel_iommu *iommu); > struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, > const struct iommu_user_data *user_data); > diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c > index 566297bc87dd..39cd9626eb8d 100644 > --- 
a/drivers/iommu/intel/irq_remapping.c > +++ b/drivers/iommu/intel/irq_remapping.c > @@ -22,6 +22,7 @@ > > #include "iommu.h" > #include "../irq_remapping.h" > +#include "../iommu-pages.h" > #include "cap_audit.h" > > enum irq_mode { > @@ -527,7 +528,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) > struct ir_table *ir_table; > struct fwnode_handle *fn; > unsigned long *bitmap; > - struct page *pages; > + void *ir_table_base; > > if (iommu->ir_table) > return 0; > @@ -536,9 +537,9 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) > if (!ir_table) > return -ENOMEM; > > - pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, > - INTR_REMAP_PAGE_ORDER); > - if (!pages) { > + ir_table_base = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, > + INTR_REMAP_PAGE_ORDER); > + if (!ir_table_base) { > pr_err("IR%d: failed to allocate pages of order %d\n", > iommu->seq_id, INTR_REMAP_PAGE_ORDER); > goto out_free_table; > @@ -573,7 +574,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) > else > iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops; > > - ir_table->base = page_address(pages); > + ir_table->base = ir_table_base; > ir_table->bitmap = bitmap; > iommu->ir_table = ir_table; > > @@ -622,7 +623,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) > out_free_bitmap: > bitmap_free(bitmap); > out_free_pages: > - __free_pages(pages, INTR_REMAP_PAGE_ORDER); > + iommu_free_pages(ir_table_base, INTR_REMAP_PAGE_ORDER); > out_free_table: > kfree(ir_table); > > @@ -643,8 +644,7 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu) > irq_domain_free_fwnode(fn); > iommu->ir_domain = NULL; > } > - free_pages((unsigned long)iommu->ir_table->base, > - INTR_REMAP_PAGE_ORDER); > + iommu_free_pages(iommu->ir_table->base, INTR_REMAP_PAGE_ORDER); > bitmap_free(iommu->ir_table->bitmap); > kfree(iommu->ir_table); > iommu->ir_table = NULL; > diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c > index 3239cefa4c33..d46f661dd971 100644 > --- a/drivers/iommu/intel/pasid.c > +++ b/drivers/iommu/intel/pasid.c > @@ -20,6 +20,7 @@ > > #include "iommu.h" > #include "pasid.h" > +#include "../iommu-pages.h" > > /* > * Intel IOMMU system wide PASID name space: > @@ -38,7 +39,7 @@ int intel_pasid_alloc_table(struct device *dev) > { > struct device_domain_info *info; > struct pasid_table *pasid_table; > - struct page *pages; > + struct pasid_dir_entry *dir; > u32 max_pasid = 0; > int order, size; > > @@ -59,14 +60,13 @@ int intel_pasid_alloc_table(struct device *dev) > > size = max_pasid >> (PASID_PDE_SHIFT - 3); > order = size ? 
get_order(size) : 0; > - pages = alloc_pages_node(info->iommu->node, > - GFP_KERNEL | __GFP_ZERO, order); > - if (!pages) { > + dir = iommu_alloc_pages_node(info->iommu->node, GFP_KERNEL, order); > + if (!dir) { > kfree(pasid_table); > return -ENOMEM; > } > > - pasid_table->table = page_address(pages); > + pasid_table->table = dir; > pasid_table->order = order; > pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); > info->pasid_table = pasid_table; > @@ -97,10 +97,10 @@ void intel_pasid_free_table(struct device *dev) > max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT; > for (i = 0; i < max_pde; i++) { > table = get_pasid_table_from_pde(&dir[i]); > - free_pgtable_page(table); > + iommu_free_page(table); > } > > - free_pages((unsigned long)pasid_table->table, pasid_table->order); > + iommu_free_pages(pasid_table->table, pasid_table->order); > kfree(pasid_table); > } > > @@ -146,7 +146,7 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) > retry: > entries = get_pasid_table_from_pde(&dir[dir_index]); > if (!entries) { > - entries = alloc_pgtable_page(info->iommu->node, GFP_ATOMIC); > + entries = iommu_alloc_page_node(info->iommu->node, GFP_ATOMIC); > if (!entries) > return NULL; > > @@ -158,7 +158,7 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) > */ > if (cmpxchg64(&dir[dir_index].val, 0ULL, > (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { > - free_pgtable_page(entries); > + iommu_free_page(entries); > goto retry; > } > if (!ecap_coherent(info->iommu->ecap)) { > diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c > index 40edd282903f..a691f917456c 100644 > --- a/drivers/iommu/intel/svm.c > +++ b/drivers/iommu/intel/svm.c > @@ -23,6 +23,7 @@ > #include "pasid.h" > #include "perf.h" > #include "../iommu-sva.h" > +#include "../iommu-pages.h" > #include "trace.h" > > static irqreturn_t prq_event_thread(int irq, void *d); > @@ -64,16 +65,14 @@ svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) > int intel_svm_enable_prq(struct intel_iommu *iommu) > { > struct iopf_queue *iopfq; > - struct page *pages; > int irq, ret; > > - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); > - if (!pages) { > + iommu->prq = iommu_alloc_pages(GFP_KERNEL, PRQ_ORDER); > + if (!iommu->prq) { > pr_warn("IOMMU: %s: Failed to allocate page request queue\n", > iommu->name); > return -ENOMEM; > } > - iommu->prq = page_address(pages); > > irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); > if (irq <= 0) { > @@ -118,7 +117,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) > dmar_free_hwirq(irq); > iommu->pr_irq = 0; > free_prq: > - free_pages((unsigned long)iommu->prq, PRQ_ORDER); > + iommu_free_pages(iommu->prq, PRQ_ORDER); > iommu->prq = NULL; > > return ret; > @@ -141,7 +140,7 @@ int intel_svm_finish_prq(struct intel_iommu *iommu) > iommu->iopf_queue = NULL; > } > > - free_pages((unsigned long)iommu->prq, PRQ_ORDER); > + iommu_free_pages(iommu->prq, PRQ_ORDER); > iommu->prq = NULL; > > return 0; > diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h > new file mode 100644 > index 000000000000..35bfa369b134 > --- /dev/null > +++ b/drivers/iommu/iommu-pages.h > @@ -0,0 +1,154 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (c) 2024, Google LLC. 
> + * Pasha Tatashin <pasha.tatashin@soleen.com> > + */ > + > +#ifndef __IOMMU_PAGES_H > +#define __IOMMU_PAGES_H > + > +#include <linux/vmstat.h> > +#include <linux/gfp.h> > +#include <linux/mm.h> > + > +/* > + * All page allocations that should be reported to as "iommu-pagetables" to > + * userspace must use on of the functions below. This includes allocations of > + * page-tables and other per-iommu_domain configuration structures. /s/use on/use one/? > + * > + * This is necessary for the proper accounting as IOMMU state can be rather > + * large, i.e. multiple gigabytes in size. > + */ > + > +/** > + * __iommu_alloc_pages - allocate a zeroed page of a given order. > + * @gfp: buddy allocator flags Shall we keep the comments generic here(avoid reference to allocator algo) ? > + * @order: page order > + * > + * returns the head struct page of the allocated page. > + */ > +static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order) > +{ > + struct page *page; > + > + page = alloc_pages(gfp | __GFP_ZERO, order); > + if (unlikely(!page)) > + return NULL; > + > + return page; > +} > + > +/** > + * __iommu_free_pages - free page of a given order > + * @page: head struct page of the page > + * @order: page order > + */ > +static inline void __iommu_free_pages(struct page *page, int order) > +{ > + if (!page) > + return; > + > + __free_pages(page, order); > +} > + > +/** > + * iommu_alloc_pages_node - allocate a zeroed page of a given order from > + * specific NUMA node. > + * @nid: memory NUMA node id > + * @gfp: buddy allocator flags Same here for this one and other references below. > + * @order: page order > + * > + * returns the virtual address of the allocated page > + */ > +static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order) > +{ > + struct page *page = alloc_pages_node(nid, gfp | __GFP_ZERO, order); > + > + if (unlikely(!page)) > + return NULL; > + > + return page_address(page); > +} > + > +/** > + * iommu_alloc_pages - allocate a zeroed page of a given order > + * @gfp: buddy allocator flags > + * @order: page order > + * > + * returns the virtual address of the allocated page > + */ > +static inline void *iommu_alloc_pages(gfp_t gfp, int order) > +{ > + struct page *page = __iommu_alloc_pages(gfp, order); > + > + if (unlikely(!page)) > + return NULL; > + > + return page_address(page); > +} > + > +/** > + * iommu_alloc_page_node - allocate a zeroed page at specific NUMA node. > + * @nid: memory NUMA node id > + * @gfp: buddy allocator flags > + * > + * returns the virtual address of the allocated page > + */ > +static inline void *iommu_alloc_page_node(int nid, gfp_t gfp) > +{ > + return iommu_alloc_pages_node(nid, gfp, 0); > +} > + > +/** > + * iommu_alloc_page - allocate a zeroed page > + * @gfp: buddy allocator flags > + * > + * returns the virtual address of the allocated page > + */ > +static inline void *iommu_alloc_page(gfp_t gfp) > +{ > + return iommu_alloc_pages(gfp, 0); > +} > + > +/** > + * iommu_free_pages - free page of a given order > + * @virt: virtual address of the page to be freed. > + * @order: page order > + */ > +static inline void iommu_free_pages(void *virt, int order) > +{ > + if (!virt) > + return; > + > + __iommu_free_pages(virt_to_page(virt), order); > +} > + > +/** > + * iommu_free_page - free page > + * @virt: virtual address of the page to be freed. > + */ > +static inline void iommu_free_page(void *virt) > +{ > + iommu_free_pages(virt, 0); > +} > + > +/** > + * iommu_put_pages_list - free a list of pages. 
> + * @page: the head of the lru list to be freed. > + * > + * There are no locking requirement for these pages, as they are going to be > + * put on a free list as soon as refcount reaches 0. Pages are put on this LRU > + * list once they are removed from the IOMMU page tables. However, they can > + * still be access through debugfs. > + */ > +static inline void iommu_put_pages_list(struct list_head *page) > +{ > + while (!list_empty(page)) { > + struct page *p = list_entry(page->prev, struct page, lru); > + > + list_del(&p->lru); > + put_page(p); > + } > +} > + > +#endif /* __IOMMU_PAGES_H */ > -- > 2.44.0.rc0.258.g7320e95886-goog >
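As a quick illustration of the conversion pattern the series applies across drivers/iommu/intel, here is a minimal sketch (hypothetical example_* helpers mirroring the dmar.c and pasid.c hunks above; the include path is the one used by the intel driver, and these functions are not part of the posted patch):

#include "../iommu-pages.h"

/* Hypothetical helper: what an allocation site looks like after conversion. */
static void *example_alloc_table(int nid, int order)
{
	/*
	 * Old style: alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO, order)
	 * followed by page_address(). The wrapper adds __GFP_ZERO itself and
	 * returns the virtual address (or NULL) directly.
	 */
	return iommu_alloc_pages_node(nid, GFP_KERNEL, order);
}

/* Hypothetical helper: the matching free side. */
static void example_free_table(void *table, int order)
{
	/*
	 * Old style: free_pages((unsigned long)table, order). The wrapper
	 * takes the virtual address and is a no-op for NULL.
	 */
	iommu_free_pages(table, order);
}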
> Few minor nits.

Hi Linu,

Thank you for taking a look at this patch, my replies below.

> > +/*
> > + * All page allocations that should be reported to as "iommu-pagetables" to
> > + * userspace must use on of the functions below. This includes allocations of
> > + * page-tables and other per-iommu_domain configuration structures.
>
> /s/use on/use one/?

I will correct in the next version (if there is going to be one).

> > + *
> > + * This is necessary for the proper accounting as IOMMU state can be rather
> > + * large, i.e. multiple gigabytes in size.
> > + */
> > +
> > +/**
> > + * __iommu_alloc_pages - allocate a zeroed page of a given order.
> > + * @gfp: buddy allocator flags
>
> Shall we keep the comments generic here(avoid reference to allocator
> algo) ?

There are no references to allocator algorithm. I specify the zero page
because this function adds __GFP_ZERO. The order and gfp arguments are
provided by the caller, therefore, should be mentioned.

> > + * @order: page order
> > + *
> > + * returns the head struct page of the allocated page.
> > + */
> > +static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order)
> > +{
> > +	struct page *page;
> > +
> > +	page = alloc_pages(gfp | __GFP_ZERO, order);
> > +	if (unlikely(!page))
> > +		return NULL;
> > +
> > +	return page;
> > +}
> > +
> > +/**
> > + * __iommu_free_pages - free page of a given order
> > + * @page: head struct page of the page
> > + * @order: page order
> > + */
> > +static inline void __iommu_free_pages(struct page *page, int order)
> > +{
> > +	if (!page)
> > +		return;
> > +
> > +	__free_pages(page, order);
> > +}
> > +
> > +/**
> > + * iommu_alloc_pages_node - allocate a zeroed page of a given order from
> > + * specific NUMA node.
> > + * @nid: memory NUMA node id
> > + * @gfp: buddy allocator flags
>
> Same here for this one and other references below.

ditto.

Thank you,
Pasha
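The remaining helper in the new header, iommu_put_pages_list(), follows the deferred-free pattern sketched below (a simplified, hypothetical example; the real callers in the intel driver queue pages while tearing down page tables and only free them after the IOTLB flush):

#include <linux/list.h>
#include "../iommu-pages.h"

/* Hypothetical, simplified example of the deferred-free pattern. */
static void example_deferred_free(void)
{
	LIST_HEAD(freelist);
	void *table = iommu_alloc_page(GFP_KERNEL);	/* zeroed, order 0 */

	if (!table)
		return;

	/*
	 * When a page-table page is unlinked from the IOMMU page tables it
	 * is queued on the freelist via its struct page lru member...
	 */
	list_add_tail(&virt_to_page(table)->lru, &freelist);

	/* ...and released in one batch, typically after the IOTLB flush. */
	iommu_put_pages_list(&freelist);
}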
Hi Pasha,

> -----Original Message-----
> From: Pasha Tatashin <pasha.tatashin@soleen.com>
> Sent: Thursday, April 4, 2024 7:26 PM
> To: Linu Cherian <lcherian@marvell.com>
> Subject: Re: [PATCH v5 01/11] iommu/vt-d: add wrapper functions for page
> allocations
>
> > Few minor nits.
>
> Hi Linu,
>
> Thank you for taking a look at this patch, my replies below.
>
> > > +/*
> > > + * All page allocations that should be reported to as "iommu-pagetables" to
> > > + * userspace must use on of the functions below. This includes allocations of
> > > + * page-tables and other per-iommu_domain configuration structures.
> >
> > /s/use on/use one/?
>
> I will correct in the next version (if there is going to be one).
>
> > > + *
> > > + * This is necessary for the proper accounting as IOMMU state can be rather
> > > + * large, i.e. multiple gigabytes in size.
> > > + */
> > > +
> > > +/**
> > > + * __iommu_alloc_pages - allocate a zeroed page of a given order.
> > > + * @gfp: buddy allocator flags
> >
> > Shall we keep the comments generic here(avoid reference to allocator
> > algo) ?
>
> There are no references to allocator algorithm. I specify the zero page
> because this function adds __GFP_ZERO. The order and gfp arguments are
> provided by the caller, therefore, should be mentioned.

Just meant to remove the mention of "buddy allocator" in the above comments
if I was not clear.

Ie. "* @gfp: allocator flags" instead of "* @gfp: buddy allocator flags"
since that is allocator specific.

Thanks
Linu Cherian.
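Linu's suggestion would amount to something like the following against the new header (an illustrative hunk only, not posted to the list), with the same change repeated for each kernel-doc comment in the file that mentions the buddy allocator:

--- a/drivers/iommu/iommu-pages.h
+++ b/drivers/iommu/iommu-pages.h
@@
 /**
  * __iommu_alloc_pages - allocate a zeroed page of a given order.
- * @gfp: buddy allocator flags
+ * @gfp: allocator flags
  * @order: page order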