Message ID | 20240201193014.2785570-1-tatashin@google.com (mailing list archive)
---|---
State | New
Series | iommu/iova: use named kmem_cache for iova magazines
On 2024-02-01 7:30 pm, Pasha Tatashin wrote:
> From: Pasha Tatashin <pasha.tatashin@soleen.com>
>
> The magazine buffers can take gigabytes of kmem memory, dominating all
> other allocations. For observability purposes create a named slab cache so
> the iova magazine memory overhead can be clearly observed.
>
> With this change:
>
>> slabtop -o | head
> Active / Total Objects (% used) : 869731 / 952904 (91.3%)
> Active / Total Slabs (% used) : 103411 / 103974 (99.5%)
> Active / Total Caches (% used) : 135 / 211 (64.0%)
> Active / Total Size (% used) : 395389.68K / 411430.20K (96.1%)
> Minimum / Average / Maximum Object : 0.02K / 0.43K / 8.00K
>
>   OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME
> 244412 244239  99%    1.00K  61103        4    244412K iommu_iova_magazine
>  91636  88343  96%    0.03K    739      124      2956K kmalloc-32
>  75744  74844  98%    0.12K   2367       32      9468K kernfs_node_cache
>
> On this machine it is now clear that magazines use 242M of kmem memory.

Hmm, something smells there...

In the "worst" case there should be a maximum of 6 * 2 *
num_online_cpus() empty magazines in the iova_cpu_rcache structures,
i.e., 12KB per CPU. Under normal use those will contain at least some
PFNs, but mainly every additional magazine stored in a depot is full
with 127 PFNs, and each one of those PFNs is backed by a 40-byte struct
iova, i.e. ~5KB per 1KB magazine. Unless that machine has many thousands
of CPUs, if iova_magazine allocations are the top consumer of memory
then something's gone wrong.

Thanks,
Robin.
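Robin's worst-case arithmetic can be checked with a quick standalone calculation. This is a back-of-the-envelope sketch using only the constants stated in the thread (6 rcache sizes, a loaded and a prev magazine per CPU per size, 1KB magazines, 127 PFNs per full magazine, a 40-byte struct iova), not code from the kernel tree:

```c
/*
 * Back-of-the-envelope model of the rcache numbers discussed above.
 * All constants are taken from the thread, not from a kernel tree.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long mag_bytes    = 1024; /* sizeof(struct iova_magazine) */
	const unsigned long rcache_sizes = 6;    /* IOVA_RANGE_CACHE_MAX_SIZE */
	const unsigned long mags_per_cpu = 2;    /* loaded + prev */
	const unsigned long cpus         = 128;

	/* Empty magazines pinned per CPU per domain: 6 * 2 * 1KB = 12KB */
	unsigned long per_cpu = rcache_sizes * mags_per_cpu * mag_bytes;

	printf("per-CPU floor: %lu KB\n", per_cpu / 1024);
	printf("one %lu-CPU domain: %lu KB\n", cpus, cpus * per_cpu / 1024);

	/* Each full depot magazine implies 127 * 40 bytes of struct iova */
	printf("struct iova behind one full 1KB magazine: %lu bytes\n",
	       127UL * 40);
	return 0;
}
```

Running it reproduces the 12KB-per-CPU figure, and the last line is the heart of Robin's point: every 1KB of full magazine should be backed by roughly 5KB of struct iova, so the iommu_iova cache would normally dwarf iommu_iova_magazine in slabtop.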
On Thu, Feb 1, 2024 at 3:56 PM Robin Murphy <robin.murphy@arm.com> wrote:
>
> Hmm, something smells there...
>
> In the "worst" case there should be a maximum of 6 * 2 *
> num_online_cpus() empty magazines in the iova_cpu_rcache structures,
> i.e., 12KB per CPU. Under normal use those will contain at least some
> PFNs, but mainly every additional magazine stored in a depot is full
> with 127 PFNs, and each one of those PFNs is backed by a 40-byte struct
> iova, i.e. ~5KB per 1KB magazine. Unless that machine has many thousands
> of CPUs, if iova_magazine allocations are the top consumer of memory
> then something's gone wrong.

This is an upstream kernel + a few drivers, booted on an AMD EPYC with
128 CPUs.

It has allocation stacks like these:

  init_iova_domain+0x1ed/0x230
  iommu_setup_dma_ops+0xf8/0x4b0
  amd_iommu_probe_finalize

and also init_iova_domain() calls from Google's TPU drivers. 242M is
actually not that much, compared to the size of the system.

Pasha

> Thanks,
> Robin.
On 2024-02-01 9:06 pm, Pasha Tatashin wrote:
> This is an upstream kernel + a few drivers, booted on an AMD EPYC with
> 128 CPUs.
>
> It has allocation stacks like these:
>
>   init_iova_domain+0x1ed/0x230
>   iommu_setup_dma_ops+0xf8/0x4b0
>   amd_iommu_probe_finalize
>
> and also init_iova_domain() calls from Google's TPU drivers. 242M is
> actually not that much, compared to the size of the system.

Hmm, I did misspeak slightly (it's late and I really should have left
this for tomorrow...) - that's 12KB per CPU *per domain*, but still that
would seem to imply well over 100 domains if you have 242MB of magazine
allocations while the iommu_iova cache isn't even on the charts... what
the heck is that driver doing?

(I don't necessarily disagree with the spirit of the patch BTW, I just
really want to understand the situation that prompted it, and make sure
we don't actually have a subtle leak somewhere.)

Thanks,
Robin.
On Thu, Feb 1, 2024 at 4:23 PM Robin Murphy <robin.murphy@arm.com> wrote:
>
> Hmm, I did misspeak slightly (it's late and I really should have left
> this for tomorrow...) - that's 12KB per CPU *per domain*, but still that
> would seem to imply well over 100 domains if you have 242MB of magazine
> allocations while the iommu_iova cache isn't even on the charts... what
> the heck is that driver doing?

I am not sure what the driver is doing. However, I can check the actual
allocation sizes for each init_iova_domain() and report on that later.

> (I don't necessarily disagree with the spirit of the patch BTW, I just
> really want to understand the situation that prompted it, and make sure
> we don't actually have a subtle leak somewhere.)

Yes, the observability is needed here, because there were several
optimizations that reduced the size of these magazines, and they can
still be large. For example, for a while we had 1032 bytes per magazine
instead of 1024, which wasted almost half of the magazine memory by
forcing 2K slabs. This was fixed with:

b4c9bf178ace iommu/iova: change IOVA_MAG_SIZE to 127 to save memory

Also, earlier there was another optimization, "32e92d9f6f87 iommu/iova:
Separate out rcache init", that reduced the cases in which magazines
need to be allocated. That also reduced overhead on our systems by a
factor of 10.

Yet the magazines are still large, and I think it is time to improve
observability for future optimizations and to avoid future regressions.

Pasha
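The 1032-vs-1024 point above can be illustrated with a standalone check. This is a sketch: the struct mirrors the iova_magazine definition quoted in this thread, and the sizes assume a 64-bit build. With 128 PFN slots the struct would be 1032 bytes, which a kmalloc-backed allocation rounds up to a 2KB slab; with 127 slots it is exactly 1KB:

```c
#include <assert.h>

#define IOVA_MAG_SIZE 127

struct iova_magazine {
	union {
		unsigned long size;             /* used while in a cpu rcache */
		struct iova_magazine *next;     /* used while on a depot list */
	};
	unsigned long pfns[IOVA_MAG_SIZE];
};

/* 8 bytes (union) + 127 * 8 bytes = 1024 bytes on a 64-bit build */
static_assert(sizeof(struct iova_magazine) == 1024,
	      "127 slots keep the magazine at exactly 1KB on 64-bit");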
On Thu, Feb 1, 2024 at 11:30 AM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>
> From: Pasha Tatashin <pasha.tatashin@soleen.com>
>
> The magazine buffers can take gigabytes of kmem memory, dominating all
> other allocations. For observability purposes create a named slab cache so
> the iova magazine memory overhead can be clearly observed.
>
> With this change:
>
> > slabtop -o | head
> Active / Total Objects (% used) : 869731 / 952904 (91.3%)
> Active / Total Slabs (% used) : 103411 / 103974 (99.5%)
> Active / Total Caches (% used) : 135 / 211 (64.0%)
> Active / Total Size (% used) : 395389.68K / 411430.20K (96.1%)
> Minimum / Average / Maximum Object : 0.02K / 0.43K / 8.00K
>
>   OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME
> 244412 244239  99%    1.00K  61103        4    244412K iommu_iova_magazine
>  91636  88343  96%    0.03K    739      124      2956K kmalloc-32
>  75744  74844  98%    0.12K   2367       32      9468K kernfs_node_cache
>
> On this machine it is now clear that magazines use 242M of kmem memory.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> ---
>  drivers/iommu/iova.c | 57 +++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 54 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index d30e453d0fb4..617bbc2b79f5 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -630,6 +630,10 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>
>  #define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
>
> +static struct kmem_cache *iova_magazine_cache;
> +static unsigned int iova_magazine_cache_users;
> +static DEFINE_MUTEX(iova_magazine_cache_mutex);
> +
>  struct iova_magazine {
>  	union {
>  		unsigned long size;
> @@ -654,11 +658,51 @@ struct iova_rcache {
>  	struct delayed_work work;
>  };
>
> +static int iova_magazine_cache_init(void)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&iova_magazine_cache_mutex);
> +
> +	iova_magazine_cache_users++;
> +	if (iova_magazine_cache_users > 1)
> +		goto out_unlock;
> +
> +	iova_magazine_cache = kmem_cache_create("iommu_iova_magazine",
> +						sizeof(struct iova_magazine),
> +						0, SLAB_HWCACHE_ALIGN, NULL);

Could this slab cache be merged with a compatible one in the slab
code? If this happens, do we still get a separate entry in
/proc/slabinfo?

It may be useful to use SLAB_NO_MERGE if the purpose is to
specifically have observability into this slab cache, but the comments
above the flag make me think I may be misunderstanding it.
On Thu, Feb 1, 2024 at 5:29 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > +static int iova_magazine_cache_init(void)
> > +{
> > +	int ret = 0;
> > +
> > +	mutex_lock(&iova_magazine_cache_mutex);
> > +
> > +	iova_magazine_cache_users++;
> > +	if (iova_magazine_cache_users > 1)
> > +		goto out_unlock;
> > +
> > +	iova_magazine_cache = kmem_cache_create("iommu_iova_magazine",
> > +						sizeof(struct iova_magazine),
> > +						0, SLAB_HWCACHE_ALIGN, NULL);
>
> Could this slab cache be merged with a compatible one in the slab
> code? If this happens, do we still get a separate entry in
> /proc/slabinfo?

Hi Yosry,

Good suggestion to check for merges. I have checked: iommu_iova_magazine
is not merged.

> It may be useful to use SLAB_NO_MERGE if the purpose is to
> specifically have observability into this slab cache, but the comments
> above the flag make me think I may be misunderstanding it.

SLAB_NO_MERGE may reduce performance and fragmentation efficiency; it is
better to keep it as-is.

Pasha
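For reference, forcing a dedicated, never-merged cache would only change the creation flags. This is a hypothetical variant, not what the patch does, and the trade-off is the one Pasha notes above (SLAB_NO_MERGE is a real slab flag in recent kernels):

```c
/*
 * Hypothetical variant, not what the patch does: SLAB_NO_MERGE keeps
 * the cache from being merged with compatible caches, guaranteeing a
 * separate /proc/slabinfo entry at some cost in slab efficiency.
 */
iova_magazine_cache = kmem_cache_create("iommu_iova_magazine",
					sizeof(struct iova_magazine),
					0,
					SLAB_HWCACHE_ALIGN | SLAB_NO_MERGE,
					NULL);
```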
> Hmm, I did misspeak slightly (it's late and I really should have left
> this for tomorrow...) - that's 12KB per CPU *per domain*, but still that
> would seem to imply well over 100 domains if you have 242MB of magazine
> allocations while the iommu_iova cache isn't even on the charts... what
> the heck is that driver doing?
>
> (I don't necessarily disagree with the spirit of the patch BTW, I just
> really want to understand the situation that prompted it, and make sure
> we don't actually have a subtle leak somewhere.)

Hi Robin,

The following tracing is without the Google TPU, simply an upstream
kernel:

iova_domain_init_rcaches() is called 159 times, with the following stack:

iova_domain_init_rcaches
iommu_setup_dma_ops
amd_iommu_probe_finalize
bus_iommu_probe
iommu_device_register
iommu_init_pci
amd_iommu_init_pci
state_next
iommu_go_to_state
amd_iommu_init
pci_iommu_init
do_one_initcall

Each time 1536K is allocated: in total 159 * 1536K = 238.5M.

The allocation happens like this:

for (IOVA_RANGE_CACHE_MAX_SIZE)
	for_each_possible_cpu()
		iova_magazine_alloc
		iova_magazine_alloc

IOVA_RANGE_CACHE_MAX_SIZE = 6
ncpu = 128
sizeof(struct iova_magazine) = 1K

6 * 128 * (1K + 1K) = 1536K

Pasha
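The per-domain pre-allocation pattern Pasha describes can be modeled directly. This is a sketch, not the kernel loop itself; the constants are the ones from the message above:

```c
#include <stdio.h>

#define IOVA_RANGE_CACHE_MAX_SIZE 6
#define NR_POSSIBLE_CPUS 128
#define MAG_BYTES 1024UL /* sizeof(struct iova_magazine) */

int main(void)
{
	unsigned long per_domain = 0;

	/* Two magazines (loaded + prev) per range-cache size per CPU */
	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; i++)
		for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++)
			per_domain += 2 * MAG_BYTES;

	printf("per domain: %lu KB\n", per_domain / 1024);        /* 1536 KB */
	printf("159 domains: %lu KB\n", 159 * per_domain / 1024); /* ~238.5 MB */
	return 0;
}
```

This reproduces both the 1536K-per-domain figure and the 238.5M total, which matches the 242M seen in slabtop to within slab overhead.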
On 02/02/2024 6:04 pm, Pasha Tatashin wrote:
> Hi Robin,
>
> The following tracing is without the Google TPU, simply an upstream
> kernel:
>
> iova_domain_init_rcaches() is called 159 times, with the following stack:
>
> iova_domain_init_rcaches
> iommu_setup_dma_ops
> amd_iommu_probe_finalize
> bus_iommu_probe
> iommu_device_register
> iommu_init_pci
> amd_iommu_init_pci
> state_next
> iommu_go_to_state
> amd_iommu_init
> pci_iommu_init
> do_one_initcall
>
> Each time 1536K is allocated: in total 159 * 1536K = 238.5M.

Yikes, so it really does just have that many IOMMU groups? OK, fair
enough, call me convinced :)

On balance though, I think I'd prefer to just stick the lifecycle
management into iova_cache_{get,put} for simplicity - spending ~256
bytes on another kmem_cache we might not use can hardly be significantly
more than the extra code and static data necessary to track its usage
separately anyway.

Thanks,
Robin.
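A sketch of the direction Robin suggests, assuming the existing iova_cache_{get,put} refcounting in iova.c (iova_cache, iova_cache_users, iova_cache_mutex). Error handling and the CPU-hotplug registration that the real iova_cache_get() also performs are abbreviated; this is an illustration, not the eventual v2 patch:

```c
/* Sketch: create/destroy the magazine cache alongside the existing
 * iommu_iova cache, reusing the iova_cache_users refcount. */
int iova_cache_get(void)
{
	mutex_lock(&iova_cache_mutex);
	if (!iova_cache_users) {
		iova_cache = kmem_cache_create("iommu_iova",
					       sizeof(struct iova), 0,
					       SLAB_HWCACHE_ALIGN, NULL);
		iova_magazine_cache = kmem_cache_create("iommu_iova_magazine",
							sizeof(struct iova_magazine),
							0, SLAB_HWCACHE_ALIGN,
							NULL);
		if (!iova_cache || !iova_magazine_cache) {
			/* kmem_cache_destroy() tolerates NULL */
			kmem_cache_destroy(iova_cache);
			kmem_cache_destroy(iova_magazine_cache);
			mutex_unlock(&iova_cache_mutex);
			return -ENOMEM;
		}
	}
	iova_cache_users++;
	mutex_unlock(&iova_cache_mutex);
	return 0;
}

void iova_cache_put(void)
{
	mutex_lock(&iova_cache_mutex);
	if (WARN_ON(!iova_cache_users)) {
		mutex_unlock(&iova_cache_mutex);
		return;
	}
	iova_cache_users--;
	if (!iova_cache_users) {
		kmem_cache_destroy(iova_cache);
		kmem_cache_destroy(iova_magazine_cache);
	}
	mutex_unlock(&iova_cache_mutex);
}
```

This removes the need for the separate iova_magazine_cache_users counter and mutex in the patch below, since every rcache user already holds an iova_cache reference.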
> Yikes, so it really does just have that many IOMMU groups? OK, fair
> enough, call me convinced :)
>
> On balance though, I think I'd prefer to just stick the lifecycle
> management into iova_cache_{get,put} for simplicity - spending ~256
> bytes on another kmem_cache we might not use can hardly be significantly
> more than the extra code and static data necessary to track its usage
> separately anyway.

Sure, I will send a v2 soon.

Pasha
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d30e453d0fb4..617bbc2b79f5 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -630,6 +630,10 @@ EXPORT_SYMBOL_GPL(reserve_iova);
 
 #define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
 
+static struct kmem_cache *iova_magazine_cache;
+static unsigned int iova_magazine_cache_users;
+static DEFINE_MUTEX(iova_magazine_cache_mutex);
+
 struct iova_magazine {
 	union {
 		unsigned long size;
@@ -654,11 +658,51 @@ struct iova_rcache {
 	struct delayed_work work;
 };
 
+static int iova_magazine_cache_init(void)
+{
+	int ret = 0;
+
+	mutex_lock(&iova_magazine_cache_mutex);
+
+	iova_magazine_cache_users++;
+	if (iova_magazine_cache_users > 1)
+		goto out_unlock;
+
+	iova_magazine_cache = kmem_cache_create("iommu_iova_magazine",
+						sizeof(struct iova_magazine),
+						0, SLAB_HWCACHE_ALIGN, NULL);
+
+	if (!iova_magazine_cache) {
+		pr_err("Couldn't create iova magazine cache\n");
+		ret = -ENOMEM;
+	}
+
+out_unlock:
+	mutex_unlock(&iova_magazine_cache_mutex);
+
+	return ret;
+}
+
+static void iova_magazine_cache_fini(void)
+{
+	mutex_lock(&iova_magazine_cache_mutex);
+
+	if (WARN_ON(!iova_magazine_cache_users))
+		goto out_unlock;
+
+	iova_magazine_cache_users--;
+	if (!iova_magazine_cache_users)
+		kmem_cache_destroy(iova_magazine_cache);
+
+out_unlock:
+	mutex_unlock(&iova_magazine_cache_mutex);
+}
+
 static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
 {
 	struct iova_magazine *mag;
 
-	mag = kmalloc(sizeof(*mag), flags);
+	mag = kmem_cache_alloc(iova_magazine_cache, flags);
 	if (mag)
 		mag->size = 0;
 
@@ -667,7 +711,7 @@ static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
 
 static void iova_magazine_free(struct iova_magazine *mag)
 {
-	kfree(mag);
+	kmem_cache_free(iova_magazine_cache, mag);
 }
 
 static void
@@ -766,11 +810,17 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
 	unsigned int cpu;
 	int i, ret;
 
+	ret = iova_magazine_cache_init();
+	if (ret)
+		return -ENOMEM;
+
 	iovad->rcaches = kcalloc(IOVA_RANGE_CACHE_MAX_SIZE,
 				 sizeof(struct iova_rcache),
 				 GFP_KERNEL);
-	if (!iovad->rcaches)
+	if (!iovad->rcaches) {
+		iova_magazine_cache_fini();
 		return -ENOMEM;
+	}
 
 	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
 		struct iova_cpu_rcache *cpu_rcache;
@@ -948,6 +998,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 
 	kfree(iovad->rcaches);
 	iovad->rcaches = NULL;
+	iova_magazine_cache_fini();
 }
 
 /*