
[3/3] dcache: account external names as indirectly reclaimable memory

Message ID 20180305133743.12746-5-guro@fb.com (mailing list archive)
State New, archived

Commit Message

Roman Gushchin March 5, 2018, 1:37 p.m. UTC
I received reports of suspicious growth of unreclaimable slabs
on some machines. I've found that it happens on machines
with low memory pressure, and that these unreclaimable slabs
are external names attached to dentries.

External names are allocated using the generic kmalloc() function,
so they are accounted as unreclaimable. But they are held
by dentries, which are reclaimable, and they will be reclaimed
under memory pressure.

In particular, this breaks the MemAvailable calculation, which
doesn't take unreclaimable slabs into account, so this actually
reclaimable memory is left out of the estimate.
This leads to a silly situation where a machine is almost idle,
has no memory pressure and therefore has a big dentry cache,
yet the resulting MemAvailable is too low to start a new workload.

To address the issue, the NR_INDIRECTLY_RECLAIMABLE_BYTES counter
is used to track the amount of memory consumed by external names.
The counter is increased in the dentry allocation path when an external
name structure is allocated, and decreased in the dentry freeing
path.
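
For reference, here is a rough sketch (not the code from the earlier
patches in this series, and the helper name below is made up) of how such
a byte counter can be folded into the MemAvailable estimate in
si_mem_available():

  /* Illustrative only: convert the byte counter to pages and treat it
   * like other reclaimable memory. */
  static unsigned long indirectly_reclaimable_pages(void)
  {
      return global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES)
              >> PAGE_SHIFT;
  }

  /* ... and in si_mem_available():
   *     available += indirectly_reclaimable_pages();
   */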

To reproduce the problem I've used the following Python script:
  import os

  for iter in range (0, 10000000):
      try:
          name = ("/some_long_name_%d" % iter) + "_" * 220
          os.stat(name)
      except Exception:
          pass

Without this patch:
  $ cat /proc/meminfo | grep MemAvailable
  MemAvailable:    7811688 kB
  $ python indirect.py
  $ cat /proc/meminfo | grep MemAvailable
  MemAvailable:    2753052 kB

With the patch:
  $ cat /proc/meminfo | grep MemAvailable
  MemAvailable:    7809516 kB
  $ python indirect.py
  $ cat /proc/meminfo | grep MemAvailable
  MemAvailable:    7749144 kB

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: kernel-team@fb.com
---
 fs/dcache.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

Comments

Al Viro March 12, 2018, 9:17 p.m. UTC | #1
On Mon, Mar 05, 2018 at 01:37:43PM +0000, Roman Gushchin wrote:
> diff --git a/fs/dcache.c b/fs/dcache.c
> index 5c7df1df81ff..a0312d73f575 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -273,8 +273,16 @@ static void __d_free(struct rcu_head *head)
>  static void __d_free_external(struct rcu_head *head)
>  {
>  	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
> -	kfree(external_name(dentry));
> -	kmem_cache_free(dentry_cache, dentry); 
> +	struct external_name *name = external_name(dentry);
> +	unsigned long bytes;
> +
> +	bytes = dentry->d_name.len + offsetof(struct external_name, name[1]);
> +	mod_node_page_state(page_pgdat(virt_to_page(name)),
> +			    NR_INDIRECTLY_RECLAIMABLE_BYTES,
> +			    -kmalloc_size(kmalloc_index(bytes)));
> +
> +	kfree(name);
> +	kmem_cache_free(dentry_cache, dentry);
>  }

That can't be right - external names can be freed in release_dentry_name_snapshot()
and copy_name() as well.  When do you want kfree_rcu() paths accounted for, BTW?
At the point where we are freeing them, or where we are scheduling their freeing?
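
(A side note for illustration: one way to keep the counter balanced no
matter which path releases the name, be it __d_free_external(),
release_dentry_name_snapshot() or copy_name(), would be a single helper
called right before the name is freed or its freeing is scheduled. The
helper below is a sketch with a made-up name, not code from the posted
patch.)

  static void dname_unaccount_external(struct external_name *name)
  {
      /* ksize() reports the size actually allocated by kmalloc() */
      mod_node_page_state(page_pgdat(virt_to_page(name)),
                          NR_INDIRECTLY_RECLAIMABLE_BYTES,
                          -(long)ksize(name));
  }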
Minchan Kim April 13, 2018, 1:35 p.m. UTC | #2
On Mon, Mar 05, 2018 at 01:37:43PM +0000, Roman Gushchin wrote:
> I received reports of suspicious growth of unreclaimable slabs
> on some machines. I've found that it happens on machines
> with low memory pressure, and that these unreclaimable slabs
> are external names attached to dentries.
> 
> External names are allocated using the generic kmalloc() function,
> so they are accounted as unreclaimable. But they are held
> by dentries, which are reclaimable, and they will be reclaimed
> under memory pressure.
> 
> In particular, this breaks the MemAvailable calculation, which
> doesn't take unreclaimable slabs into account, so this actually
> reclaimable memory is left out of the estimate.
> This leads to a silly situation where a machine is almost idle,
> has no memory pressure and therefore has a big dentry cache,
> yet the resulting MemAvailable is too low to start a new workload.
> 
> To address the issue, the NR_INDIRECTLY_RECLAIMABLE_BYTES counter
> is used to track the amount of memory consumed by external names.
> The counter is increased in the dentry allocation path when an external
> name structure is allocated, and decreased in the dentry freeing
> path.
> 
> To reproduce the problem I've used the following Python script:
>   import os
> 
>   for iter in range (0, 10000000):
>       try:
>           name = ("/some_long_name_%d" % iter) + "_" * 220
>           os.stat(name)
>       except Exception:
>           pass
> 
> Without this patch:
>   $ cat /proc/meminfo | grep MemAvailable
>   MemAvailable:    7811688 kB
>   $ python indirect.py
>   $ cat /proc/meminfo | grep MemAvailable
>   MemAvailable:    2753052 kB
> 
> With the patch:
>   $ cat /proc/meminfo | grep MemAvailable
>   MemAvailable:    7809516 kB
>   $ python indirect.py
>   $ cat /proc/meminfo | grep MemAvailable
>   MemAvailable:    7749144 kB
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: linux-fsdevel@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: linux-mm@kvack.org
> Cc: kernel-team@fb.com
> ---
>  fs/dcache.c | 29 ++++++++++++++++++++++++-----
>  1 file changed, 24 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/dcache.c b/fs/dcache.c
> index 5c7df1df81ff..a0312d73f575 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -273,8 +273,16 @@ static void __d_free(struct rcu_head *head)
>  static void __d_free_external(struct rcu_head *head)
>  {
>  	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
> -	kfree(external_name(dentry));
> -	kmem_cache_free(dentry_cache, dentry); 
> +	struct external_name *name = external_name(dentry);
> +	unsigned long bytes;
> +
> +	bytes = dentry->d_name.len + offsetof(struct external_name, name[1]);
> +	mod_node_page_state(page_pgdat(virt_to_page(name)),
> +			    NR_INDIRECTLY_RECLAIMABLE_BYTES,
> +			    -kmalloc_size(kmalloc_index(bytes)));
> +
> +	kfree(name);
> +	kmem_cache_free(dentry_cache, dentry);
>  }
>  
>  static inline int dname_external(const struct dentry *dentry)
> @@ -1598,6 +1606,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
>  	struct dentry *dentry;
>  	char *dname;
>  	int err;
> +	size_t reclaimable = 0;
>  
>  	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
>  	if (!dentry)
> @@ -1614,9 +1623,11 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
>  		name = &slash_name;
>  		dname = dentry->d_iname;
>  	} else if (name->len > DNAME_INLINE_LEN-1) {
> -		size_t size = offsetof(struct external_name, name[1]);
> -		struct external_name *p = kmalloc(size + name->len,
> -						  GFP_KERNEL_ACCOUNT);
> +		struct external_name *p;
> +
> +		reclaimable = offsetof(struct external_name, name[1]) +
> +			name->len;
> +		p = kmalloc(reclaimable, GFP_KERNEL_ACCOUNT);

Can't we use kmem_cache_alloc() with our own cache created with SLAB_RECLAIM_ACCOUNT
if they are reclaimable?
With that, it would help the fragmentation problem (via __GFP_RECLAIMABLE for
page allocation) as well as the counting problem, IMHO.


>  		if (!p) {
>  			kmem_cache_free(dentry_cache, dentry); 
>  			return NULL;
> @@ -1665,6 +1676,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
>  		}
>  	}
>  
> +	if (unlikely(reclaimable)) {
> +		pg_data_t *pgdat;
> +
> +		pgdat = page_pgdat(virt_to_page(external_name(dentry)));
> +		mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES,
> +				    kmalloc_size(kmalloc_index(reclaimable)));
> +	}
> +
>  	this_cpu_inc(nr_dentry);
>  
>  	return dentry;
> -- 
> 2.14.3
> 
Michal Hocko April 13, 2018, 1:59 p.m. UTC | #3
On Fri 13-04-18 22:35:19, Minchan Kim wrote:
> On Mon, Mar 05, 2018 at 01:37:43PM +0000, Roman Gushchin wrote:
[...]
> > @@ -1614,9 +1623,11 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
> >  		name = &slash_name;
> >  		dname = dentry->d_iname;
> >  	} else if (name->len > DNAME_INLINE_LEN-1) {
> > -		size_t size = offsetof(struct external_name, name[1]);
> > -		struct external_name *p = kmalloc(size + name->len,
> > -						  GFP_KERNEL_ACCOUNT);
> > +		struct external_name *p;
> > +
> > +		reclaimable = offsetof(struct external_name, name[1]) +
> > +			name->len;
> > +		p = kmalloc(reclaimable, GFP_KERNEL_ACCOUNT);
> 
> Can't we use kmem_cache_alloc() with our own cache created with SLAB_RECLAIM_ACCOUNT
> if they are reclaimable? 

No, because names have different sizes and so we would basically have to
duplicate many caches.
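
For concreteness, a sketch of what such a dedicated cache could look like
(hypothetical, not proposed code): flagged SLAB_RECLAIM_ACCOUNT it would be
accounted as reclaimable and its pages grouped with other reclaimable
allocations, but a kmem_cache has a single object size, so either every
name gets padded to the maximum length or one cache per size class is
needed, which is the duplication mentioned above.

  static struct kmem_cache *ext_name_cache;  /* made-up name */

  static int __init ext_name_cache_init(void)
  {
      /* worst case: every external name padded to NAME_MAX */
      ext_name_cache = kmem_cache_create("external_name",
              offsetof(struct external_name, name[1]) + NAME_MAX,
              0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, NULL);
      return ext_name_cache ? 0 : -ENOMEM;
  }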
Vlastimil Babka April 13, 2018, 2:20 p.m. UTC | #4
On 04/13/2018 03:59 PM, Michal Hocko wrote:
> On Fri 13-04-18 22:35:19, Minchan Kim wrote:
>> On Mon, Mar 05, 2018 at 01:37:43PM +0000, Roman Gushchin wrote:
> [...]
>>> @@ -1614,9 +1623,11 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
>>>  		name = &slash_name;
>>>  		dname = dentry->d_iname;
>>>  	} else if (name->len > DNAME_INLINE_LEN-1) {
>>> -		size_t size = offsetof(struct external_name, name[1]);
>>> -		struct external_name *p = kmalloc(size + name->len,
>>> -						  GFP_KERNEL_ACCOUNT);
>>> +		struct external_name *p;
>>> +
>>> +		reclaimable = offsetof(struct external_name, name[1]) +
>>> +			name->len;
>>> +		p = kmalloc(reclaimable, GFP_KERNEL_ACCOUNT);
>>
>> Can't we use kmem_cache_alloc() with our own cache created with SLAB_RECLAIM_ACCOUNT
>> if they are reclaimable? 
> 
> No, because names have different sizes and so we would basically have to
> duplicate many caches.

We would need kmalloc-reclaimable-X variants. It could be worth it,
especially if we find more similar usages. I suspect they would be more
useful than the existing dma-kmalloc-X :)

Maybe create both (dma and reclaimable) on demand?
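
A sketch of what kmalloc-reclaimable-X variants could look like from the
caller's side (illustrative only; neither the cache array nor the helper
exists at this point):

  /* second array of kmalloc caches, created with SLAB_RECLAIM_ACCOUNT */
  extern struct kmem_cache *kmalloc_reclaimable_caches[KMALLOC_SHIFT_HIGH + 1];

  static __always_inline void *kmalloc_reclaimable(size_t size, gfp_t flags)
  {
      unsigned int index = kmalloc_index(size);

      if (!index)
          return ZERO_SIZE_PTR;
      /* SLAB_RECLAIM_ACCOUNT gives SReclaimable accounting and
       * __GFP_RECLAIMABLE page placement automatically */
      return kmem_cache_alloc(kmalloc_reclaimable_caches[index], flags);
  }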
Michal Hocko April 13, 2018, 2:28 p.m. UTC | #5
On Fri 13-04-18 16:20:00, Vlastimil Babka wrote:
> On 04/13/2018 03:59 PM, Michal Hocko wrote:
> > On Fri 13-04-18 22:35:19, Minchan Kim wrote:
> >> On Mon, Mar 05, 2018 at 01:37:43PM +0000, Roman Gushchin wrote:
> > [...]
> >>> @@ -1614,9 +1623,11 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
> >>>  		name = &slash_name;
> >>>  		dname = dentry->d_iname;
> >>>  	} else if (name->len > DNAME_INLINE_LEN-1) {
> >>> -		size_t size = offsetof(struct external_name, name[1]);
> >>> -		struct external_name *p = kmalloc(size + name->len,
> >>> -						  GFP_KERNEL_ACCOUNT);
> >>> +		struct external_name *p;
> >>> +
> >>> +		reclaimable = offsetof(struct external_name, name[1]) +
> >>> +			name->len;
> >>> +		p = kmalloc(reclaimable, GFP_KERNEL_ACCOUNT);
> >>
> >> Can't we use kmem_cache_alloc() with our own cache created with SLAB_RECLAIM_ACCOUNT
> >> if they are reclaimable? 
> > 
> > No, because names have different sizes and so we would basically have to
> > duplicate many caches.
> 
> We would need kmalloc-reclaimable-X variants. It could be worth it,
> especially if we find more similar usages. I suspect they would be more
> useful than the existing dma-kmalloc-X :)

I am still not sure why __GFP_RECLAIMABLE cannot be made work as
expected and account slab pages as SLAB_RECLAIMABLE
Johannes Weiner April 13, 2018, 2:37 p.m. UTC | #6
On Fri, Apr 13, 2018 at 04:28:21PM +0200, Michal Hocko wrote:
> On Fri 13-04-18 16:20:00, Vlastimil Babka wrote:
> > We would need kmalloc-reclaimable-X variants. It could be worth it,
> > especially if we find more similar usages. I suspect they would be more
> > useful than the existing dma-kmalloc-X :)
> 
> I am still not sure why __GFP_RECLAIMABLE cannot be made work as
> expected and account slab pages as SLAB_RECLAIMABLE

Can you outline how this would work without separate caches?
Michal Hocko April 16, 2018, 11:41 a.m. UTC | #7
On Fri 13-04-18 10:37:16, Johannes Weiner wrote:
> On Fri, Apr 13, 2018 at 04:28:21PM +0200, Michal Hocko wrote:
> > On Fri 13-04-18 16:20:00, Vlastimil Babka wrote:
> > > We would need kmalloc-reclaimable-X variants. It could be worth it,
> > > especially if we find more similar usages. I suspect they would be more
> > > useful than the existing dma-kmalloc-X :)
> > 
> > I am still not sure why __GFP_RECLAIMABLE cannot be made work as
> > expected and account slab pages as SLAB_RECLAIMABLE
> 
> Can you outline how this would work without separate caches?

I thought that the cache would only maintain two sets of slab pages
depending on the allocation requests. I am pretty sure there will be
other details to iron out and maybe it will turn out that such a large
portion of the cache would need to duplicate the state that a
completely new cache would be more reasonable. Is this worth exploring
at least? I mean something like this should help with the fragmentation
already AFAIU. Accounting would be just free on top.
Vlastimil Babka April 16, 2018, 12:06 p.m. UTC | #8
On 04/16/2018 01:41 PM, Michal Hocko wrote:
> On Fri 13-04-18 10:37:16, Johannes Weiner wrote:
>> On Fri, Apr 13, 2018 at 04:28:21PM +0200, Michal Hocko wrote:
>>> On Fri 13-04-18 16:20:00, Vlastimil Babka wrote:
>>>> We would need kmalloc-reclaimable-X variants. It could be worth it,
>>>> especially if we find more similar usages. I suspect they would be more
>>>> useful than the existing dma-kmalloc-X :)
>>>
>>> I am still not sure why __GFP_RECLAIMABLE cannot be made work as
>>> expected and account slab pages as SLAB_RECLAIMABLE
>>
>> Can you outline how this would work without separate caches?
> 
> I thought that the cache would only maintain two sets of slab pages
> depending on the allocation requests. I am pretty sure there will be
> other details to iron out and

For example the percpu (and other) array caches...

> maybe it will turn out that such a large
> portion of the cache would need to duplicate the state that a
> completely new cache would be more reasonable.

I'm afraid that's the case, yes.

> Is this worth exploring
> at least? I mean something like this should help with the fragmentation
> already AFAIU. Accounting would be just free on top.

Yep. It could be also CONFIG_urable so smaller systems don't need to
deal with the memory overhead of this.

So do we put it on LSF/MM agenda?
Michal Hocko April 16, 2018, 12:27 p.m. UTC | #9
On Mon 16-04-18 14:06:21, Vlastimil Babka wrote:
> On 04/16/2018 01:41 PM, Michal Hocko wrote:
> > On Fri 13-04-18 10:37:16, Johannes Weiner wrote:
> >> On Fri, Apr 13, 2018 at 04:28:21PM +0200, Michal Hocko wrote:
> >>> On Fri 13-04-18 16:20:00, Vlastimil Babka wrote:
> >>>> We would need kmalloc-reclaimable-X variants. It could be worth it,
> >>>> especially if we find more similar usages. I suspect they would be more
> >>>> useful than the existing dma-kmalloc-X :)
> >>>
> >>> I am still not sure why __GFP_RECLAIMABLE cannot be made work as
> >>> expected and account slab pages as SLAB_RECLAIMABLE
> >>
> >> Can you outline how this would work without separate caches?
> > 
> > I thought that the cache would only maintain two sets of slab pages
> > depending on the allocation requests. I am pretty sure there will be
> > other details to iron out and
> 
> For example the percpu (and other) array caches...
> 
> > maybe it will turn out that such a large
> > portion of the cache would need to duplicate the state that a
> > completely new cache would be more reasonable.
> 
> I'm afraid that's the case, yes.
> 
> > Is this worth exploring
> > at least? I mean something like this should help with the fragmentation
> > already AFAIU. Accounting would be just free on top.
> 
> Yep. It could be also CONFIG_urable so smaller systems don't need to
> deal with the memory overhead of this.
> 
> So do we put it on LSF/MM agenda?

If you volunteer to lead the discussion, then I do not have any
objections.
Matthew Wilcox April 16, 2018, 1:09 p.m. UTC | #10
On Mon, Apr 16, 2018 at 02:06:21PM +0200, Vlastimil Babka wrote:
> On 04/16/2018 01:41 PM, Michal Hocko wrote:
> > On Fri 13-04-18 10:37:16, Johannes Weiner wrote:
> >> On Fri, Apr 13, 2018 at 04:28:21PM +0200, Michal Hocko wrote:
> >>> On Fri 13-04-18 16:20:00, Vlastimil Babka wrote:
> >>>> We would need kmalloc-reclaimable-X variants. It could be worth it,
> >>>> especially if we find more similar usages. I suspect they would be more
> >>>> useful than the existing dma-kmalloc-X :)
> >>>
> >>> I am still not sure why __GFP_RECLAIMABLE cannot be made work as
> >>> expected and account slab pages as SLAB_RECLAIMABLE
> >>
> >> Can you outline how this would work without separate caches?
> > 
> > I thought that the cache would only maintain two sets of slab pages
> > depending on the allocation requests. I am pretty sure there will be
> > other details to iron out and
> 
> For example the percpu (and other) array caches...
> 
> > maybe it will turn out that such a large
> > portion of the cache would need to duplicate the state that a
> > completely new cache would be more reasonable.
> 
> I'm afraid that's the case, yes.

I'm not sure it'll be so bad, at least for SLUB ... I think everything
we need to duplicate is already percpu, and if we combine GFP_DMA
and GFP_RECLAIMABLE into this, we might even get more savings.  Also,
we only need to do this for the kmalloc slabs; currently 13 of them.
So we eliminate 13 caches and in return allocate 13 * 2 * NR_CPU pointers.
That'll be a win on some machines and a loss on others, but the machines
where it's consuming more memory should have more memory to begin with,
so I'd count it as a win.
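(For scale, assuming 8-byte pointers, that is 13 * 2 * 8 = 208 extra bytes
per CPU, roughly 13 KB on a 64-CPU machine.)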

The node partial list probably wants to be trebled in size to have one
list per memory type.  But I think the allocation path only changes
like this:

@@ -2663,10 +2663,13 @@ static __always_inline void *slab_alloc_node(struct kmem
_cache *s,
        struct kmem_cache_cpu *c;
        struct page *page;
        unsigned long tid;
+       unsigned int offset = 0;
 
        s = slab_pre_alloc_hook(s, gfpflags);
        if (!s)
                return NULL;
        if (s->flags & SLAB_KMALLOC)
                offset = flags_to_slab_id(gfpflags);
 redo:
        /*
         * Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -2679,8 +2682,8 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
         * to check if it is matched or not.
         */
        do {
-               tid = this_cpu_read(s->cpu_slab->tid);
-               c = raw_cpu_ptr(s->cpu_slab);
+               tid = this_cpu_read((&s->cpu_slab[offset])->tid);
+               c = raw_cpu_ptr(&s->cpu_slab[offset]);
        } while (IS_ENABLED(CONFIG_PREEMPT) &&
                 unlikely(tid != READ_ONCE(c->tid)));
 

> > Is this worth exploring
> > at least? I mean something like this should help with the fragmentation
> > already AFAIU. Accounting would be just free on top.
> 
> Yep. It could be also CONFIG_urable so smaller systems don't need to
> deal with the memory overhead of this.
> 
> So do we put it on LSF/MM agenda?

We have an agenda?  :-)
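
To clarify the sketch: SLAB_KMALLOC and flags_to_slab_id() above are
placeholders rather than existing symbols; the helper would just map the
allocation type encoded in the gfp flags to an index picking one of the
per-cpu slab sets, e.g.:

  static inline unsigned int flags_to_slab_id(gfp_t gfpflags)
  {
      if (gfpflags & __GFP_RECLAIMABLE)
          return 1;    /* reclaimable set */
      if (gfpflags & __GFP_DMA)
          return 2;    /* DMA set */
      return 0;        /* regular unmovable allocations */
  }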
Vlastimil Babka April 16, 2018, 7:57 p.m. UTC | #11
On 04/16/2018 02:27 PM, Michal Hocko wrote:
> On Mon 16-04-18 14:06:21, Vlastimil Babka wrote:
>>
>> For example the percpu (and other) array caches...
>>
>>> maybe it will turn out that such a large
>>> portion of the cache would need to duplicate the state that a
>>> completely new cache would be more reasonable.
>>
>> I'm afraid that's the case, yes.
>>
>>> Is this worth exploring
>>> at least? I mean something like this should help with the fragmentation
>>> already AFAIU. Accounting would be just free on top.
>>
>> Yep. It could be also CONFIG_urable so smaller systems don't need to
>> deal with the memory overhead of this.
>>
>> So do we put it on LSF/MM agenda?
> 
> If you volunteer to lead the discussion, then I do not have any
> objections.

Sure, let's add the topic of SLAB_MINIMIZE_WASTE [1] as well.

Something like "Supporting reclaimable kmalloc caches and large
non-buddy-sized objects in slab allocators" ?

[1] https://marc.info/?l=linux-mm&m=152156671614796&w=2
Michal Hocko April 17, 2018, 6:44 a.m. UTC | #12
[the head of the thread is http://lkml.kernel.org/r/08524819-14ef-81d0-fa90-d7af13c6b9d5@suse.cz]

On Mon 16-04-18 21:57:50, Vlastimil Babka wrote:
> On 04/16/2018 02:27 PM, Michal Hocko wrote:
> > On Mon 16-04-18 14:06:21, Vlastimil Babka wrote:
> >>
> >> For example the percpu (and other) array caches...
> >>
> >>> maybe it will turn out that such a large
> >>> portion of the cache would need to duplicate the state that a
> >>> completely new cache would be more reasonable.
> >>
> >> I'm afraid that's the case, yes.
> >>
> >>> Is this worth exploring
> >>> at least? I mean something like this should help with the fragmentation
> >>> already AFAIU. Accounting would be just free on top.
> >>
> >> Yep. It could be also CONFIG_urable so smaller systems don't need to
> >> deal with the memory overhead of this.
> >>
> >> So do we put it on LSF/MM agenda?
> > 
> > If you volunteer to lead the discussion, then I do not have any
> > objections.
> 
> Sure, let's add the topic of SLAB_MINIMIZE_WASTE [1] as well.
> 
> Something like "Supporting reclaimable kmalloc caches and large
> non-buddy-sized objects in slab allocators" ?
> 
> [1] https://marc.info/?l=linux-mm&m=152156671614796&w=2

OK, noted.
Roman Gushchin April 17, 2018, 11:24 a.m. UTC | #13
On Mon, Apr 16, 2018 at 01:41:44PM +0200, Michal Hocko wrote:
> On Fri 13-04-18 10:37:16, Johannes Weiner wrote:
> > On Fri, Apr 13, 2018 at 04:28:21PM +0200, Michal Hocko wrote:
> > > On Fri 13-04-18 16:20:00, Vlastimil Babka wrote:
> > > > We would need kmalloc-reclaimable-X variants. It could be worth it,
> > > > especially if we find more similar usages. I suspect they would be more
> > > > useful than the existing dma-kmalloc-X :)
> > > 
> > > I am still not sure why __GFP_RECLAIMABLE cannot be made work as
> > > expected and account slab pages as SLAB_RECLAIMABLE
> > 
> > Can you outline how this would work without separate caches?
> 
> I thought that the cache would only maintain two sets of slab pages
> depending on the allocation requests. I am pretty sure there will be
> other details to iron out and maybe it will turn out that such a large
> portion of the cache would need to duplicate the state that a
> completely new cache would be more reasonable. Is this worth exploring
> at least? I mean something like this should help with the fragmentation
> already AFAIU. Accounting would be just free on top.

IMO, this approach is much better than duplicating all kmalloc caches.
It definitely has to be explored and discussed.

Thank you!

Patch

diff --git a/fs/dcache.c b/fs/dcache.c
index 5c7df1df81ff..a0312d73f575 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -273,8 +273,16 @@  static void __d_free(struct rcu_head *head)
 static void __d_free_external(struct rcu_head *head)
 {
 	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
-	kfree(external_name(dentry));
-	kmem_cache_free(dentry_cache, dentry); 
+	struct external_name *name = external_name(dentry);
+	unsigned long bytes;
+
+	bytes = dentry->d_name.len + offsetof(struct external_name, name[1]);
+	mod_node_page_state(page_pgdat(virt_to_page(name)),
+			    NR_INDIRECTLY_RECLAIMABLE_BYTES,
+			    -kmalloc_size(kmalloc_index(bytes)));
+
+	kfree(name);
+	kmem_cache_free(dentry_cache, dentry);
 }
 
 static inline int dname_external(const struct dentry *dentry)
@@ -1598,6 +1606,7 @@  struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	struct dentry *dentry;
 	char *dname;
 	int err;
+	size_t reclaimable = 0;
 
 	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
 	if (!dentry)
@@ -1614,9 +1623,11 @@  struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 		name = &slash_name;
 		dname = dentry->d_iname;
 	} else if (name->len > DNAME_INLINE_LEN-1) {
-		size_t size = offsetof(struct external_name, name[1]);
-		struct external_name *p = kmalloc(size + name->len,
-						  GFP_KERNEL_ACCOUNT);
+		struct external_name *p;
+
+		reclaimable = offsetof(struct external_name, name[1]) +
+			name->len;
+		p = kmalloc(reclaimable, GFP_KERNEL_ACCOUNT);
 		if (!p) {
 			kmem_cache_free(dentry_cache, dentry); 
 			return NULL;
@@ -1665,6 +1676,14 @@  struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 		}
 	}
 
+	if (unlikely(reclaimable)) {
+		pg_data_t *pgdat;
+
+		pgdat = page_pgdat(virt_to_page(external_name(dentry)));
+		mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES,
+				    kmalloc_size(kmalloc_index(reclaimable)));
+	}
+
 	this_cpu_inc(nr_dentry);
 
 	return dentry;