
[v3,5/5] mm/memcg: Optimize user context object stock access

Message ID 20210414012027.5352-6-longman@redhat.com (mailing list archive)
State New, archived
Series mm/memcg: Reduce kmemcache memory accounting overhead

Commit Message

Waiman Long April 14, 2021, 1:20 a.m. UTC
Most kmem_cache_alloc() calls are from user context. With instrumentation
enabled, the measured proportion of kmem_cache_alloc() calls from non-task
context was about 0.01% of the total.

The irq disable/enable sequence used in this case to access content
from the object stock is slow.  To optimize for user context access,
there are now two object stocks, one for task context access and one
for interrupt context access.

The task context object stock can be accessed after disabling preemption,
which is cheap in a non-preempt kernel. The interrupt context object stock
can only be accessed after disabling interrupts. User context code can
access the interrupt object stock, but not vice versa.

The mod_objcg_state() function is also modified to make sure that memcg
and lruvec stat updates are done with interrupts disabled.

The downside of this change is that more data is stored in the local
object stocks without being reflected in the charge counter and the
vmstat arrays.  However, this is a small price to pay for better
performance.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
---
 mm/memcontrol.c | 74 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 59 insertions(+), 15 deletions(-)

Comments

Masayoshi Mizuma April 15, 2021, 3:28 a.m. UTC | #1
On Tue, Apr 13, 2021 at 09:20:27PM -0400, Waiman Long wrote:
> Most kmem_cache_alloc() calls are from user context. With instrumentation
> enabled, the measured amount of kmem_cache_alloc() calls from non-task
> context was about 0.01% of the total.
> 
> The irq disable/enable sequence used in this case to access content
> from object stock is slow.  To optimize for user context access, there
> are now two object stocks for task context and interrupt context access
> respectively.
> 
> The task context object stock can be accessed after disabling preemption
> which is cheap in non-preempt kernel. The interrupt context object stock
> can only be accessed after disabling interrupt. User context code can
> access interrupt object stock, but not vice versa.
> 
> The mod_objcg_state() function is also modified to make sure that memcg
> and lruvec stat updates are done with interrupted disabled.
> 
> The downside of this change is that there are more data stored in local
> object stocks and not reflected in the charge counter and the vmstat
> arrays.  However, this is a small price to pay for better performance.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> Acked-by: Roman Gushchin <guro@fb.com>
> Reviewed-by: Shakeel Butt <shakeelb@google.com>
> ---
>  mm/memcontrol.c | 74 +++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 59 insertions(+), 15 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 69f728383efe..8875e896e52b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2229,7 +2229,8 @@ struct obj_stock {
>  struct memcg_stock_pcp {
>  	struct mem_cgroup *cached; /* this never be root cgroup */
>  	unsigned int nr_pages;
> -	struct obj_stock obj;
> +	struct obj_stock task_obj;
> +	struct obj_stock irq_obj;
>  
>  	struct work_struct work;
>  	unsigned long flags;
> @@ -2254,11 +2255,48 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
>  }
>  #endif
>  
> +/*
> + * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
> + * sequence used in this case to access content from object stock is slow.
> + * To optimize for user context access, there are now two object stocks for
> + * task context and interrupt context access respectively.
> + *
> + * The task context object stock can be accessed by disabling preemption only
> + * which is cheap in non-preempt kernel. The interrupt context object stock
> + * can only be accessed after disabling interrupt. User context code can
> + * access interrupt object stock, but not vice versa.
> + */
>  static inline struct obj_stock *current_obj_stock(void)
>  {
>  	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
>  
> -	return &stock->obj;
> +	return in_task() ? &stock->task_obj : &stock->irq_obj;
> +}
> +
> +#define get_obj_stock(flags)				\
> +({							\
> +	struct memcg_stock_pcp *stock;			\
> +	struct obj_stock *obj_stock;			\
> +							\
> +	if (in_task()) {				\
> +		preempt_disable();			\
> +		(flags) = -1L;				\
> +		stock = this_cpu_ptr(&memcg_stock);	\
> +		obj_stock = &stock->task_obj;		\
> +	} else {					\
> +		local_irq_save(flags);			\
> +		stock = this_cpu_ptr(&memcg_stock);	\
> +		obj_stock = &stock->irq_obj;		\
> +	}						\
> +	obj_stock;					\
> +})
> +
> +static inline void put_obj_stock(unsigned long flags)
> +{
> +	if (flags == -1L)
> +		preempt_enable();
> +	else
> +		local_irq_restore(flags);
>  }
>  
>  /**
> @@ -2327,7 +2365,9 @@ static void drain_local_stock(struct work_struct *dummy)
>  	local_irq_save(flags);
>  
>  	stock = this_cpu_ptr(&memcg_stock);
> -	drain_obj_stock(&stock->obj);
> +	drain_obj_stock(&stock->irq_obj);
> +	if (in_task())
> +		drain_obj_stock(&stock->task_obj);
>  	drain_stock(stock);
>  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
>  
> @@ -3183,7 +3223,7 @@ static inline void mod_objcg_state(struct obj_cgroup *objcg,
>  	memcg = obj_cgroup_memcg(objcg);
>  	if (pgdat)
>  		lruvec = mem_cgroup_lruvec(memcg, pgdat);
> -	__mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
> +	mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
>  	rcu_read_unlock();
>  }
>  
> @@ -3193,15 +3233,14 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
>  	unsigned long flags;
>  	bool ret = false;
>  
> -	local_irq_save(flags);
> +	stock = get_obj_stock(flags);
>  
> -	stock = current_obj_stock();
>  	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
>  		stock->nr_bytes -= nr_bytes;
>  		ret = true;
>  	}
>  
> -	local_irq_restore(flags);
> +	put_obj_stock(flags);
>  
>  	return ret;
>  }
> @@ -3254,8 +3293,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
>  {
>  	struct mem_cgroup *memcg;
>  
> -	if (stock->obj.cached_objcg) {
> -		memcg = obj_cgroup_memcg(stock->obj.cached_objcg);
> +	if (in_task() && stock->task_obj.cached_objcg) {
> +		memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
> +		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
> +			return true;
> +	}
> +	if (stock->irq_obj.cached_objcg) {
> +		memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
>  		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
>  			return true;
>  	}
> @@ -3283,9 +3327,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
>  {
>  	unsigned long flags;
>  
> -	local_irq_save(flags);
> +	get_obj_stock(flags);
>  	__refill_obj_stock(objcg, nr_bytes);
> -	local_irq_restore(flags);
> +	put_obj_stock(flags);
>  }
>  
>  static void __mod_obj_stock_state(struct obj_cgroup *objcg,
> @@ -3325,9 +3369,9 @@ void mod_obj_stock_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
>  {
>  	unsigned long flags;
>  
> -	local_irq_save(flags);
> +	get_obj_stock(flags);
>  	__mod_obj_stock_state(objcg, pgdat, idx, nr);
> -	local_irq_restore(flags);
> +	put_obj_stock(flags);
>  }
>  
>  int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
> @@ -3380,10 +3424,10 @@ void obj_cgroup_uncharge_mod_state(struct obj_cgroup *objcg, size_t size,
>  {
>  	unsigned long flags;
>  
> -	local_irq_save(flags);
> +	get_obj_stock(flags);
>  	__refill_obj_stock(objcg, size);
>  	__mod_obj_stock_state(objcg, pgdat, idx, -(int)size);
> -	local_irq_restore(flags);
> +	put_obj_stock(flags);
>  }
>  
>  #endif /* CONFIG_MEMCG_KMEM */
> -- 
> 2.18.1
> 

Please feel free to add:

Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>

Thanks!
Masa
Christoph Lameter April 15, 2021, 9:44 a.m. UTC | #2
On Wed, 14 Apr 2021, Masayoshi Mizuma wrote:

> Please feel free to add:
>
> Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
>
> Thanks!
> Masa
>

Would you please stop quoting the whole patch when you have nothing to say
about the details? It is enough to just respond without quoting. I was
looking through this trying to find something you said about individual
sections of code but there was nothing.
Masayoshi Mizuma April 15, 2021, 12:16 p.m. UTC | #3
On Thu, Apr 15, 2021 at 11:44:55AM +0200, Christoph Lameter wrote:
> Would you please stop quoting the whole patch when you have nothing to say
> about the details? It is enough to just respond without quoting. I was
> looking through this trying to find something you said about individual
> sections of code but there was nothing.

Thank you for pointing it out and sorry about that.
I'll do that next time.

- Masa
Johannes Weiner April 15, 2021, 5:53 p.m. UTC | #4
On Tue, Apr 13, 2021 at 09:20:27PM -0400, Waiman Long wrote:
> Most kmem_cache_alloc() calls are from user context. With instrumentation
> enabled, the measured amount of kmem_cache_alloc() calls from non-task
> context was about 0.01% of the total.
> 
> The irq disable/enable sequence used in this case to access content
> from object stock is slow.  To optimize for user context access, there
> are now two object stocks for task context and interrupt context access
> respectively.
> 
> The task context object stock can be accessed after disabling preemption
> which is cheap in non-preempt kernel. The interrupt context object stock
> can only be accessed after disabling interrupt. User context code can
> access interrupt object stock, but not vice versa.
> 
> The mod_objcg_state() function is also modified to make sure that memcg
> and lruvec stat updates are done with interrupted disabled.
> 
> The downside of this change is that there are more data stored in local
> object stocks and not reflected in the charge counter and the vmstat
> arrays.  However, this is a small price to pay for better performance.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> Acked-by: Roman Gushchin <guro@fb.com>
> Reviewed-by: Shakeel Butt <shakeelb@google.com>

This makes sense, and also explains the previous patch a bit
better. But please merge those two.

> @@ -2229,7 +2229,8 @@ struct obj_stock {
>  struct memcg_stock_pcp {
>  	struct mem_cgroup *cached; /* this never be root cgroup */
>  	unsigned int nr_pages;
> -	struct obj_stock obj;
> +	struct obj_stock task_obj;
> +	struct obj_stock irq_obj;
>  
>  	struct work_struct work;
>  	unsigned long flags;
> @@ -2254,11 +2255,48 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
>  }
>  #endif
>  
> +/*
> + * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
> + * sequence used in this case to access content from object stock is slow.
> + * To optimize for user context access, there are now two object stocks for
> + * task context and interrupt context access respectively.
> + *
> + * The task context object stock can be accessed by disabling preemption only
> + * which is cheap in non-preempt kernel. The interrupt context object stock
> + * can only be accessed after disabling interrupt. User context code can
> + * access interrupt object stock, but not vice versa.
> + */
>  static inline struct obj_stock *current_obj_stock(void)
>  {
>  	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
>  
> -	return &stock->obj;
> +	return in_task() ? &stock->task_obj : &stock->irq_obj;
> +}
> +
> +#define get_obj_stock(flags)				\
> +({							\
> +	struct memcg_stock_pcp *stock;			\
> +	struct obj_stock *obj_stock;			\
> +							\
> +	if (in_task()) {				\
> +		preempt_disable();			\
> +		(flags) = -1L;				\
> +		stock = this_cpu_ptr(&memcg_stock);	\
> +		obj_stock = &stock->task_obj;		\
> +	} else {					\
> +		local_irq_save(flags);			\
> +		stock = this_cpu_ptr(&memcg_stock);	\
> +		obj_stock = &stock->irq_obj;		\
> +	}						\
> +	obj_stock;					\
> +})
> +
> +static inline void put_obj_stock(unsigned long flags)
> +{
> +	if (flags == -1L)
> +		preempt_enable();
> +	else
> +		local_irq_restore(flags);
>  }

Please make them both functions and use 'unsigned long *flags'.

Also, I'm not sure doing in_task() twice would actually be more
expensive than the == -1 special case, and it would be easier to
understand.
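
Completely untested, but roughly something along these lines, i.e. both
helpers as inline functions, with in_task() checked again on the put
side instead of the flags == -1 special case:

static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
{
	struct memcg_stock_pcp *stock;

	if (in_task()) {
		*pflags = 0UL;	/* not used on this path */
		preempt_disable();
		stock = this_cpu_ptr(&memcg_stock);
		return &stock->task_obj;
	}

	local_irq_save(*pflags);
	stock = this_cpu_ptr(&memcg_stock);
	return &stock->irq_obj;
}

static inline void put_obj_stock(unsigned long flags)
{
	if (in_task())
		preempt_enable();
	else
		local_irq_restore(flags);
}

Callers would then pass &flags to get_obj_stock() and keep
put_obj_stock(flags) unchanged.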

> @@ -2327,7 +2365,9 @@ static void drain_local_stock(struct work_struct *dummy)
>  	local_irq_save(flags);
>  
>  	stock = this_cpu_ptr(&memcg_stock);
> -	drain_obj_stock(&stock->obj);
> +	drain_obj_stock(&stock->irq_obj);
> +	if (in_task())
> +		drain_obj_stock(&stock->task_obj);
>  	drain_stock(stock);
>  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
>  
> @@ -3183,7 +3223,7 @@ static inline void mod_objcg_state(struct obj_cgroup *objcg,
>  	memcg = obj_cgroup_memcg(objcg);
>  	if (pgdat)
>  		lruvec = mem_cgroup_lruvec(memcg, pgdat);
> -	__mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
> +	mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
>  	rcu_read_unlock();

This is actually a bug introduced in the earlier patch, isn't it?
Calling __mod_memcg_lruvec_state() without irqs disabled...
Waiman Long April 15, 2021, 6:16 p.m. UTC | #5
On 4/15/21 1:53 PM, Johannes Weiner wrote:
> On Tue, Apr 13, 2021 at 09:20:27PM -0400, Waiman Long wrote:
>> Most kmem_cache_alloc() calls are from user context. With instrumentation
>> enabled, the measured amount of kmem_cache_alloc() calls from non-task
>> context was about 0.01% of the total.
>>
>> The irq disable/enable sequence used in this case to access content
>> from object stock is slow.  To optimize for user context access, there
>> are now two object stocks for task context and interrupt context access
>> respectively.
>>
>> The task context object stock can be accessed after disabling preemption
>> which is cheap in non-preempt kernel. The interrupt context object stock
>> can only be accessed after disabling interrupt. User context code can
>> access interrupt object stock, but not vice versa.
>>
>> The mod_objcg_state() function is also modified to make sure that memcg
>> and lruvec stat updates are done with interrupted disabled.
>>
>> The downside of this change is that there are more data stored in local
>> object stocks and not reflected in the charge counter and the vmstat
>> arrays.  However, this is a small price to pay for better performance.
>>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> Acked-by: Roman Gushchin <guro@fb.com>
>> Reviewed-by: Shakeel Butt <shakeelb@google.com>
> This makes sense, and also explains the previous patch a bit
> better. But please merge those two.
The reason I broke it into two is so that the patches are individually 
easier to review. I prefer to update the commit log of patch 4 to 
explain why the obj_stock structure is introduced instead of merging the 
two.
>
>> @@ -2229,7 +2229,8 @@ struct obj_stock {
>>   struct memcg_stock_pcp {
>>   	struct mem_cgroup *cached; /* this never be root cgroup */
>>   	unsigned int nr_pages;
>> -	struct obj_stock obj;
>> +	struct obj_stock task_obj;
>> +	struct obj_stock irq_obj;
>>   
>>   	struct work_struct work;
>>   	unsigned long flags;
>> @@ -2254,11 +2255,48 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
>>   }
>>   #endif
>>   
>> +/*
>> + * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
>> + * sequence used in this case to access content from object stock is slow.
>> + * To optimize for user context access, there are now two object stocks for
>> + * task context and interrupt context access respectively.
>> + *
>> + * The task context object stock can be accessed by disabling preemption only
>> + * which is cheap in non-preempt kernel. The interrupt context object stock
>> + * can only be accessed after disabling interrupt. User context code can
>> + * access interrupt object stock, but not vice versa.
>> + */
>>   static inline struct obj_stock *current_obj_stock(void)
>>   {
>>   	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
>>   
>> -	return &stock->obj;
>> +	return in_task() ? &stock->task_obj : &stock->irq_obj;
>> +}
>> +
>> +#define get_obj_stock(flags)				\
>> +({							\
>> +	struct memcg_stock_pcp *stock;			\
>> +	struct obj_stock *obj_stock;			\
>> +							\
>> +	if (in_task()) {				\
>> +		preempt_disable();			\
>> +		(flags) = -1L;				\
>> +		stock = this_cpu_ptr(&memcg_stock);	\
>> +		obj_stock = &stock->task_obj;		\
>> +	} else {					\
>> +		local_irq_save(flags);			\
>> +		stock = this_cpu_ptr(&memcg_stock);	\
>> +		obj_stock = &stock->irq_obj;		\
>> +	}						\
>> +	obj_stock;					\
>> +})
>> +
>> +static inline void put_obj_stock(unsigned long flags)
>> +{
>> +	if (flags == -1L)
>> +		preempt_enable();
>> +	else
>> +		local_irq_restore(flags);
>>   }
> Please make them both functions and use 'unsigned long *flags'.
Sure, I can do that.
>
> Also I'm not sure doing in_task() twice would actually be more
> expensive than the == -1 special case, and easier to understand.
I can make that change too. Either way is fine with me.
>
>> @@ -2327,7 +2365,9 @@ static void drain_local_stock(struct work_struct *dummy)
>>   	local_irq_save(flags);
>>   
>>   	stock = this_cpu_ptr(&memcg_stock);
>> -	drain_obj_stock(&stock->obj);
>> +	drain_obj_stock(&stock->irq_obj);
>> +	if (in_task())
>> +		drain_obj_stock(&stock->task_obj);
>>   	drain_stock(stock);
>>   	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
>>   
>> @@ -3183,7 +3223,7 @@ static inline void mod_objcg_state(struct obj_cgroup *objcg,
>>   	memcg = obj_cgroup_memcg(objcg);
>>   	if (pgdat)
>>   		lruvec = mem_cgroup_lruvec(memcg, pgdat);
>> -	__mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
>> +	mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
>>   	rcu_read_unlock();
> This is actually a bug introduced in the earlier patch, isn't it?
> Calling __mod_memcg_lruvec_state() without irqs disabled...
>
Not really. In patch 3, mod_objcg_state() is called only in the stock
update context, where interrupts had already been disabled. But now that
is no longer the case, which is why I need to update mod_objcg_state() to
make sure irqs are disabled before updating the vmstat data array.

Cheers,
Longman
Johannes Weiner April 15, 2021, 6:53 p.m. UTC | #6
On Thu, Apr 15, 2021 at 02:16:17PM -0400, Waiman Long wrote:
> On 4/15/21 1:53 PM, Johannes Weiner wrote:
> > On Tue, Apr 13, 2021 at 09:20:27PM -0400, Waiman Long wrote:
> > > Most kmem_cache_alloc() calls are from user context. With instrumentation
> > > enabled, the measured amount of kmem_cache_alloc() calls from non-task
> > > context was about 0.01% of the total.
> > > 
> > > The irq disable/enable sequence used in this case to access content
> > > from object stock is slow.  To optimize for user context access, there
> > > are now two object stocks for task context and interrupt context access
> > > respectively.
> > > 
> > > The task context object stock can be accessed after disabling preemption
> > > which is cheap in non-preempt kernel. The interrupt context object stock
> > > can only be accessed after disabling interrupt. User context code can
> > > access interrupt object stock, but not vice versa.
> > > 
> > > The mod_objcg_state() function is also modified to make sure that memcg
> > > and lruvec stat updates are done with interrupted disabled.
> > > 
> > > The downside of this change is that there are more data stored in local
> > > object stocks and not reflected in the charge counter and the vmstat
> > > arrays.  However, this is a small price to pay for better performance.
> > > 
> > > Signed-off-by: Waiman Long <longman@redhat.com>
> > > Acked-by: Roman Gushchin <guro@fb.com>
> > > Reviewed-by: Shakeel Butt <shakeelb@google.com>
> > This makes sense, and also explains the previous patch a bit
> > better. But please merge those two.
> The reason I broke it into two is so that the patches are individually
> easier to review. I prefer to update the commit log of patch 4 to explain
> why the obj_stock structure is introduced instead of merging the two.

Well I did not find them easier to review separately.

> > > @@ -2327,7 +2365,9 @@ static void drain_local_stock(struct work_struct *dummy)
> > >   	local_irq_save(flags);
> > >   	stock = this_cpu_ptr(&memcg_stock);
> > > -	drain_obj_stock(&stock->obj);
> > > +	drain_obj_stock(&stock->irq_obj);
> > > +	if (in_task())
> > > +		drain_obj_stock(&stock->task_obj);
> > >   	drain_stock(stock);
> > >   	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
> > > @@ -3183,7 +3223,7 @@ static inline void mod_objcg_state(struct obj_cgroup *objcg,
> > >   	memcg = obj_cgroup_memcg(objcg);
> > >   	if (pgdat)
> > >   		lruvec = mem_cgroup_lruvec(memcg, pgdat);
> > > -	__mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
> > > +	mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
> > >   	rcu_read_unlock();
> > This is actually a bug introduced in the earlier patch, isn't it?
> > Calling __mod_memcg_lruvec_state() without irqs disabled...
> > 
> Not really, in patch 3, mod_objcg_state() is called only in the stock update
> context where interrupt had already been disabled. But now, that is no
> longer the case, that is why i need to update mod_objcg_state() to make sure
> irq is disabled before updating vmstat data array.

Oh, I see it now. Man, that's subtle. We've had several very hard to
track down preemption bugs in those stats, because they manifest as
counter imbalances and you have no idea if there is a leak somewhere.

The convention for these functions is that the __ prefix indicates
that preemption has been suitably disabled. Please always follow this
convention, even if the semantic change is temporary.
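
To illustrate the shape I mean (the names here are made up, not the
real memcg functions):

static DEFINE_PER_CPU(long, foo_stat);

/* __ prefix: the caller has already disabled preemption/irqs */
static void __mod_foo_stat(long val)
{
	__this_cpu_add(foo_stat, val);
}

/* no prefix: callable from any context, does its own protection */
static void mod_foo_stat(long val)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_foo_stat(val);
	local_irq_restore(flags);
}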

Btw, is there a reason why the stock caching isn't just part of
mod_objcg_state()? Why does the user need to choose if they want the
caching or not? It's not like we ask for this when charging, either.
Waiman Long April 15, 2021, 7:06 p.m. UTC | #7
On 4/15/21 2:53 PM, Johannes Weiner wrote:
> On Thu, Apr 15, 2021 at 02:16:17PM -0400, Waiman Long wrote:
>> On 4/15/21 1:53 PM, Johannes Weiner wrote:
>>> On Tue, Apr 13, 2021 at 09:20:27PM -0400, Waiman Long wrote:
>>>> Most kmem_cache_alloc() calls are from user context. With instrumentation
>>>> enabled, the measured amount of kmem_cache_alloc() calls from non-task
>>>> context was about 0.01% of the total.
>>>>
>>>> The irq disable/enable sequence used in this case to access content
>>>> from object stock is slow.  To optimize for user context access, there
>>>> are now two object stocks for task context and interrupt context access
>>>> respectively.
>>>>
>>>> The task context object stock can be accessed after disabling preemption
>>>> which is cheap in non-preempt kernel. The interrupt context object stock
>>>> can only be accessed after disabling interrupt. User context code can
>>>> access interrupt object stock, but not vice versa.
>>>>
>>>> The mod_objcg_state() function is also modified to make sure that memcg
>>>> and lruvec stat updates are done with interrupted disabled.
>>>>
>>>> The downside of this change is that there are more data stored in local
>>>> object stocks and not reflected in the charge counter and the vmstat
>>>> arrays.  However, this is a small price to pay for better performance.
>>>>
>>>> Signed-off-by: Waiman Long <longman@redhat.com>
>>>> Acked-by: Roman Gushchin <guro@fb.com>
>>>> Reviewed-by: Shakeel Butt <shakeelb@google.com>
>>> This makes sense, and also explains the previous patch a bit
>>> better. But please merge those two.
>> The reason I broke it into two is so that the patches are individually
>> easier to review. I prefer to update the commit log of patch 4 to explain
>> why the obj_stock structure is introduced instead of merging the two.
> Well I did not find them easier to review separately.
>
>>>> @@ -2327,7 +2365,9 @@ static void drain_local_stock(struct work_struct *dummy)
>>>>    	local_irq_save(flags);
>>>>    	stock = this_cpu_ptr(&memcg_stock);
>>>> -	drain_obj_stock(&stock->obj);
>>>> +	drain_obj_stock(&stock->irq_obj);
>>>> +	if (in_task())
>>>> +		drain_obj_stock(&stock->task_obj);
>>>>    	drain_stock(stock);
>>>>    	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
>>>> @@ -3183,7 +3223,7 @@ static inline void mod_objcg_state(struct obj_cgroup *objcg,
>>>>    	memcg = obj_cgroup_memcg(objcg);
>>>>    	if (pgdat)
>>>>    		lruvec = mem_cgroup_lruvec(memcg, pgdat);
>>>> -	__mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
>>>> +	mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
>>>>    	rcu_read_unlock();
>>> This is actually a bug introduced in the earlier patch, isn't it?
>>> Calling __mod_memcg_lruvec_state() without irqs disabled...
>>>
>> Not really, in patch 3, mod_objcg_state() is called only in the stock update
>> context where interrupt had already been disabled. But now, that is no
>> longer the case, that is why i need to update mod_objcg_state() to make sure
>> irq is disabled before updating vmstat data array.
> Oh, I see it now. Man, that's subtle. We've had several very hard to
> track down preemption bugs in those stats, because they manifest as
> counter imbalances and you have no idea if there is a leak somewhere.
>
> The convention for these functions is that the __ prefix indicates
> that preemption has been suitably disabled. Please always follow this
> convention, even if the semantic change is temporary.
I see. I will fix that in the next version.
>
> Btw, is there a reason why the stock caching isn't just part of
> mod_objcg_state()? Why does the user need to choose if they want the
> caching or not? It's not like we ask for this when charging, either.
>
Yes, I can revert it to make mod_objcg_state() call
mod_obj_stock_state() internally instead of the other way around.

Cheers,
Longman

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 69f728383efe..8875e896e52b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2229,7 +2229,8 @@  struct obj_stock {
 struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
-	struct obj_stock obj;
+	struct obj_stock task_obj;
+	struct obj_stock irq_obj;
 
 	struct work_struct work;
 	unsigned long flags;
@@ -2254,11 +2255,48 @@  static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 }
 #endif
 
+/*
+ * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
+ * sequence used in this case to access content from object stock is slow.
+ * To optimize for user context access, there are now two object stocks for
+ * task context and interrupt context access respectively.
+ *
+ * The task context object stock can be accessed by disabling preemption only
+ * which is cheap in non-preempt kernel. The interrupt context object stock
+ * can only be accessed after disabling interrupt. User context code can
+ * access interrupt object stock, but not vice versa.
+ */
 static inline struct obj_stock *current_obj_stock(void)
 {
 	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
 
-	return &stock->obj;
+	return in_task() ? &stock->task_obj : &stock->irq_obj;
+}
+
+#define get_obj_stock(flags)				\
+({							\
+	struct memcg_stock_pcp *stock;			\
+	struct obj_stock *obj_stock;			\
+							\
+	if (in_task()) {				\
+		preempt_disable();			\
+		(flags) = -1L;				\
+		stock = this_cpu_ptr(&memcg_stock);	\
+		obj_stock = &stock->task_obj;		\
+	} else {					\
+		local_irq_save(flags);			\
+		stock = this_cpu_ptr(&memcg_stock);	\
+		obj_stock = &stock->irq_obj;		\
+	}						\
+	obj_stock;					\
+})
+
+static inline void put_obj_stock(unsigned long flags)
+{
+	if (flags == -1L)
+		preempt_enable();
+	else
+		local_irq_restore(flags);
 }
 
 /**
@@ -2327,7 +2365,9 @@  static void drain_local_stock(struct work_struct *dummy)
 	local_irq_save(flags);
 
 	stock = this_cpu_ptr(&memcg_stock);
-	drain_obj_stock(&stock->obj);
+	drain_obj_stock(&stock->irq_obj);
+	if (in_task())
+		drain_obj_stock(&stock->task_obj);
 	drain_stock(stock);
 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
@@ -3183,7 +3223,7 @@  static inline void mod_objcg_state(struct obj_cgroup *objcg,
 	memcg = obj_cgroup_memcg(objcg);
 	if (pgdat)
 		lruvec = mem_cgroup_lruvec(memcg, pgdat);
-	__mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
+	mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
 	rcu_read_unlock();
 }
 
@@ -3193,15 +3233,14 @@  static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
 	unsigned long flags;
 	bool ret = false;
 
-	local_irq_save(flags);
+	stock = get_obj_stock(flags);
 
-	stock = current_obj_stock();
 	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
 		stock->nr_bytes -= nr_bytes;
 		ret = true;
 	}
 
-	local_irq_restore(flags);
+	put_obj_stock(flags);
 
 	return ret;
 }
@@ -3254,8 +3293,13 @@  static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 {
 	struct mem_cgroup *memcg;
 
-	if (stock->obj.cached_objcg) {
-		memcg = obj_cgroup_memcg(stock->obj.cached_objcg);
+	if (in_task() && stock->task_obj.cached_objcg) {
+		memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
+		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+			return true;
+	}
+	if (stock->irq_obj.cached_objcg) {
+		memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
 		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
 			return true;
 	}
@@ -3283,9 +3327,9 @@  static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	get_obj_stock(flags);
 	__refill_obj_stock(objcg, nr_bytes);
-	local_irq_restore(flags);
+	put_obj_stock(flags);
 }
 
 static void __mod_obj_stock_state(struct obj_cgroup *objcg,
@@ -3325,9 +3369,9 @@  void mod_obj_stock_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	get_obj_stock(flags);
 	__mod_obj_stock_state(objcg, pgdat, idx, nr);
-	local_irq_restore(flags);
+	put_obj_stock(flags);
 }
 
 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
@@ -3380,10 +3424,10 @@  void obj_cgroup_uncharge_mod_state(struct obj_cgroup *objcg, size_t size,
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	get_obj_stock(flags);
 	__refill_obj_stock(objcg, size);
 	__mod_obj_stock_state(objcg, pgdat, idx, -(int)size);
-	local_irq_restore(flags);
+	put_obj_stock(flags);
 }
 
 #endif /* CONFIG_MEMCG_KMEM */