
[v3,04/19] mm: slub: implement SLUB version of obj_to_index()

Message ID 20200422204708.2176080-5-guro@fb.com (mailing list archive)
State New, archived
Series The new cgroup slab memory controller

Commit Message

Roman Gushchin April 22, 2020, 8:46 p.m. UTC
This commit implements SLUB version of the obj_to_index() function,
which will be required to calculate the offset of obj_cgroup in the
obj_cgroups vector to store/obtain the objcg ownership data.

To make it faster, let's repeat the SLAB's trick introduced by
commit 6a2d7a955d8d ("[PATCH] SLAB: use a multiply instead of a
divide in obj_to_index()") and avoid an expensive division.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/slub_def.h | 9 +++++++++
 mm/slub.c                | 1 +
 2 files changed, 10 insertions(+)
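
For context, the trick from commit 6a2d7a955d8d relies on lib/reciprocal_div.h:
reciprocal_value() precomputes a fixed-point inverse of the object size once at
cache creation, and reciprocal_divide() then turns every per-object division
into a multiply plus shifts. A minimal sketch of the intended usage (it simply
mirrors the patch below, nothing here is extra code from the series):

	/* once, when the cache geometry is finalized (calculate_sizes()) */
	s->reciprocal_size = reciprocal_value(s->size);

	/* hot path: object index within its page, without a division */
	unsigned int idx = reciprocal_divide(obj - page_address(page),
					     s->reciprocal_size);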

Comments

Christoph Lameter (Ampere) April 22, 2020, 11:52 p.m. UTC | #1
On Wed, 22 Apr 2020, Roman Gushchin wrote:

>  enum stat_item {
>  	ALLOC_FASTPATH,		/* Allocation from cpu slab */
> @@ -86,6 +87,7 @@ struct kmem_cache {
>  	unsigned long min_partial;
>  	unsigned int size;	/* The size of an object including metadata */
>  	unsigned int object_size;/* The size of an object without metadata */
> +	struct reciprocal_value reciprocal_size;


This needs to be moved further back since it is not an item that needs to
be cache hot for the hotpaths. Place it with "align", inuse etc?

Hmmm. the same applies to min_partial maybe?
Roman Gushchin April 23, 2020, 12:05 a.m. UTC | #2
On Wed, Apr 22, 2020 at 11:52:13PM +0000, Christoph Lameter wrote:
> On Wed, 22 Apr 2020, Roman Gushchin wrote:
> 
> >  enum stat_item {
> >  	ALLOC_FASTPATH,		/* Allocation from cpu slab */
> > @@ -86,6 +87,7 @@ struct kmem_cache {
> >  	unsigned long min_partial;
> >  	unsigned int size;	/* The size of an object including metadata */
> >  	unsigned int object_size;/* The size of an object without metadata */
> > +	struct reciprocal_value reciprocal_size;
> 
> 
> This needs to be moved further back since it is not an item that needs to
> be cache hot for the hotpaths.

It could be relatively hot, because it's accessed for reading on every
accounted allocation.

> Place it with "align", inuse etc?
> 
> Hmmm. the same applies to min_partial maybe?
> 
> 

And min_partial should be much colder.

So maybe a patch on top of the series which moves both fields can work?

Thanks!
Roman Gushchin April 23, 2020, 9:01 p.m. UTC | #3
On Wed, Apr 22, 2020 at 11:52:13PM +0000, Christoph Lameter wrote:
> On Wed, 22 Apr 2020, Roman Gushchin wrote:
> 
> >  enum stat_item {
> >  	ALLOC_FASTPATH,		/* Allocation from cpu slab */
> > @@ -86,6 +87,7 @@ struct kmem_cache {
> >  	unsigned long min_partial;
> >  	unsigned int size;	/* The size of an object including metadata */
> >  	unsigned int object_size;/* The size of an object without metadata */
> > +	struct reciprocal_value reciprocal_size;
> 
> 
> This needs to be moved further back since it is not an item that needs to
> be cache hot for the hotpaths. Place it with "align", inuse etc?
> 
> Hmmm. the same applies to min_partial maybe?
> 
>

Something like this?

Thanks!

--

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index cdf4f299c982..6246a3c65cd5 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -84,10 +84,8 @@ struct kmem_cache {
        struct kmem_cache_cpu __percpu *cpu_slab;
        /* Used for retrieving partial slabs, etc. */
        slab_flags_t flags;
-       unsigned long min_partial;
        unsigned int size;      /* The size of an object including metadata */
        unsigned int object_size;/* The size of an object without metadata */
-       struct reciprocal_value reciprocal_size;
        unsigned int offset;    /* Free pointer offset */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
        /* Number of per cpu partial objects to keep around */
@@ -103,6 +101,8 @@ struct kmem_cache {
        void (*ctor)(void *);
        unsigned int inuse;             /* Offset to metadata */
        unsigned int align;             /* Alignment */
+       unsigned long min_partial;
+       struct reciprocal_value reciprocal_size;
        unsigned int red_left_pad;      /* Left redzone padding size */
        const char *name;       /* Name (only for display!) */
        struct list_head list;  /* List of slab caches */
Christoph Lameter (Ampere) April 25, 2020, 2:10 a.m. UTC | #4
On Wed, 22 Apr 2020, Roman Gushchin wrote:

> On Wed, Apr 22, 2020 at 11:52:13PM +0000, Christoph Lameter wrote:
> > On Wed, 22 Apr 2020, Roman Gushchin wrote:
> >
> > >  enum stat_item {
> > >  	ALLOC_FASTPATH,		/* Allocation from cpu slab */
> > > @@ -86,6 +87,7 @@ struct kmem_cache {
> > >  	unsigned long min_partial;
> > >  	unsigned int size;	/* The size of an object including metadata */
> > >  	unsigned int object_size;/* The size of an object without metadata */
> > > +	struct reciprocal_value reciprocal_size;
> >
> >
> > This needs to be moved further back since it is not an item that needs to
> > be cache hot for the hotpaths.
>
> It could be relatively hot, because it's accessed for reading on every
> accounted allocation.

The patch seems to only use it for setup and debugging? It is used for
every "accounted" allocation???? Where? And what is an "accounted"
allocation?
Christoph Lameter (Ampere) April 25, 2020, 2:10 a.m. UTC | #5
On Thu, 23 Apr 2020, Roman Gushchin wrote:

> Something like this?

Yup.
Roman Gushchin April 25, 2020, 2:46 a.m. UTC | #6
On Sat, Apr 25, 2020 at 02:10:24AM +0000, Christoph Lameter wrote:
> On Wed, 22 Apr 2020, Roman Gushchin wrote:
> 
> > On Wed, Apr 22, 2020 at 11:52:13PM +0000, Christoph Lameter wrote:
> > > On Wed, 22 Apr 2020, Roman Gushchin wrote:
> > >
> > > >  enum stat_item {
> > > >  	ALLOC_FASTPATH,		/* Allocation from cpu slab */
> > > > @@ -86,6 +87,7 @@ struct kmem_cache {
> > > >  	unsigned long min_partial;
> > > >  	unsigned int size;	/* The size of an object including metadata */
> > > >  	unsigned int object_size;/* The size of an object without metadata */
> > > > +	struct reciprocal_value reciprocal_size;
> > >
> > >
> > > This needs to be moved further back since it is not an item that needs to
> > > be cache hot for the hotpaths.
> >
> > It could be relatively hot, because it's accessed for reading on every
> > accounted allocation.
> 
> The patch seems to only use it for setup and debugging? It is used for
> every "accounted" allocation???? Where? And what is an "accounted"
> allocation?
> 
>

Please, take a look at the whole series:
https://lore.kernel.org/linux-mm/20200422204708.2176080-1-guro@fb.com/T/#t

I'm sorry, I had to cc you directly for the whole thing. Your feedback
will be highly appreciated.

It's used to calculate the offset of the memcg pointer for every slab
object which is charged to a memory cgroup. So it must be quite hot.

Thanks!

Roman
Christoph Lameter (Ampere) April 27, 2020, 4:21 p.m. UTC | #7
On Fri, 24 Apr 2020, Roman Gushchin wrote:

> > The patch seems to only use it for setup and debugging? It is used for
> > every "accounted" allocation???? Where? And what is an "accounted"
> > allocation?
> >
> >
>
> Please, take a look at the whole series:
> https://lore.kernel.org/linux-mm/20200422204708.2176080-1-guro@fb.com/T/#t
>
> I'm sorry, I had to cc you directly for the whole thing. Your feedback
> will be highly appreciated.
>
> It's used to calculate the offset of the memcg pointer for every slab
> object which is charged to a memory cgroup. So it must be quite hot.


Ahh... Thanks. I just looked at it.

You need this because you have a separate structure attached to a page
that tracks membership of the slab object to the cgroup. This is used to
calculate the offset into that array....
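
For reference, that offset calculation looks roughly like this in the series
(see the mm/slab.h hunks quoted later in this thread):

	off = obj_to_index(s, page, p);        /* which object on the page */
	objcg = page_obj_cgroups(page)[off];   /* its obj_cgroup pointer   */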

Why do you need this? Just slap a pointer to the cgroup as additional
metadata onto the slab object. Is that not much simpler, safer and faster?
Roman Gushchin April 27, 2020, 4:46 p.m. UTC | #8
On Mon, Apr 27, 2020 at 04:21:01PM +0000, Christoph Lameter wrote:
> On Fri, 24 Apr 2020, Roman Gushchin wrote:
> 
> > > The patch seems to only use it for setup and debugging? It is used for
> > > every "accounted" allocation???? Where? And what is an "accounted"
> > > allocation?
> > >
> > >
> >
> > Please, take a look at the whole series:
> > https://lore.kernel.org/linux-mm/20200422204708.2176080-1-guro@fb.com/T/#t
> >
> > I'm sorry, I had to cc you directly for the whole thing. Your feedback
> > will be highly appreciated.
> >
> > It's used to calculate the offset of the memcg pointer for every slab
> > object which is charged to a memory cgroup. So it must be quite hot.
> 
> 
> Ahh... Thanks. I just looked at it.
> 
> You need this because you have a separate structure attached to a page
> that tracks membership of the slab object to the cgroup. This is used to
> calculate the offset into that array....
> 
> Why do you need this? Just slap a pointer to the cgroup as additional
> metadata onto the slab object. Is that not much simpler, safer and faster?
> 

So, the problem is that not all slab objects are accounted, and sometimes
we don't know in advance if they are accounted or not (with the current semantics
of __GFP_ACCOUNT and SLAB_ACCOUNT flags). So we either have to increase
the size of ALL slab objects, or create a pair of slab caches for each size.

The first option is not that cheap in terms of the memory overhead. Especially
for those who disable cgroups using a boot-time option.
The second should be fine, but it will be less simple in terms of the code complexity
(in comparison to the final result of the current proposal).

I'm not strictly against either approach, but I'd look for a broader consensus
on what's the best approach here.

Thanks!
Roman Gushchin April 28, 2020, 5:06 p.m. UTC | #9
On Mon, Apr 27, 2020 at 09:46:38AM -0700, Roman Gushchin wrote:
> On Mon, Apr 27, 2020 at 04:21:01PM +0000, Christoph Lameter wrote:
> > On Fri, 24 Apr 2020, Roman Gushchin wrote:
> > 
> > > > The patch seems to only use it for setup and debugging? It is used for
> > > > every "accounted" allocation???? Where? And what is an "accounted"
> > > > allocation?
> > > >
> > > >
> > >
> > > Please, take a look at the whole series:
> > > https://lore.kernel.org/linux-mm/20200422204708.2176080-1-guro@fb.com/T/#t
> > >
> > > I'm sorry, I had to cc you directly for the whole thing. Your feedback
> > > will be highly appreciated.
> > >
> > > It's used to calculate the offset of the memcg pointer for every slab
> > > object which is charged to a memory cgroup. So it must be quite hot.
> > 
> > 
> > Ahh... Thanks. I just looked at it.
> > 
> > You need this because you have a separate structure attached to a page
> > that tracks membership of the slab object to the cgroup. This is used to
> > calculate the offset into that array....
> > 
> > Why do you need this? Just slap a pointer to the cgroup as additional
> > metadata onto the slab object. Is that not much simpler, safer and faster?
> > 
> 
> So, the problem is that not all slab objects are accounted, and sometimes
> we don't know in advance if they are accounted or not (with the current semantics
> of __GFP_ACCOUNT and SLAB_ACCOUNT flags). So we either have to increase
> the size of ALL slab objects, or create a pair of slab caches for each size.
> 
> The first option is not that cheap in terms of the memory overhead. Especially
> for those who disable cgroups using a boot-time option.
> The second should be fine, but it will be less simple in terms of the code complexity
> (in comparison to the final result of the current proposal).
> 
> I'm not strictly against either approach, but I'd look for a broader consensus
> on what's the best approach here.

To be more clear here: in my original version (prior to v3) I had two sets
of kmem_caches: one for root and other non-accounted allocations, and the
other was shared by all non-root memory cgroups. With this approach it's
easy to switch to your suggestion and put the memcg pointer next to the object.

Johannes persistently pushed for the design with a single set of kmem_caches,
shared by *all* allocations. I've implemented this approach as a separate patch
on top of the series and added it to v3. It dramatically simplifies the code
and removes ~0.5k SLOC, but with this approach it's not easy to implement what
you're suggesting without increasing the size of *all* slab objects, which is
sub-optimal.

So it looks like there are two options:
1) switch back to separate root and memcg sets of kmem_caches, put the memcg pointer
   just behind the slab object
2) stick with what we have in v3

I guess the first option might be better from the performance POV, while the second
is simpler/cleaner in terms of the code. So I'm ok to switch to 1) if there is
a consensus on what's better.

Thanks!
Johannes Weiner April 28, 2020, 5:45 p.m. UTC | #10
On Mon, Apr 27, 2020 at 09:46:38AM -0700, Roman Gushchin wrote:
> On Mon, Apr 27, 2020 at 04:21:01PM +0000, Christoph Lameter wrote:
> > On Fri, 24 Apr 2020, Roman Gushchin wrote:
> > 
> > > > The patch seems to only use it for setup and debugging? It is used for
> > > > every "accounted" allocation???? Where? And what is an "accounted"
> > > > allocation?
> > > >
> > > >
> > >
> > > Please, take a look at the whole series:
> > > https://lore.kernel.org/linux-mm/20200422204708.2176080-1-guro@fb.com/T/#t
> > >
> > > I'm sorry, I had to cc you directly for the whole thing. Your feedback
> > > will be highly appreciated.
> > >
> > > It's used to calculate the offset of the memcg pointer for every slab
> > > object which is charged to a memory cgroup. So it must be quite hot.
> > 
> > 
> > Ahh... Thanks. I just looked at it.
> > 
> > You need this because you have a separate structure attached to a page
> > that tracks membership of the slab object to the cgroup. This is used to
> > calculate the offset into that array....
> > 
> > Why do you need this? Just slap a pointer to the cgroup as additional
> > metadata onto the slab object. Is that not much simpler, safer and faster?
> > 
> 
> So, the problem is that not all slab objects are accounted, and sometimes
> we don't know in advance if they are accounted or not (with the current semantics
> of __GFP_ACCOUNT and SLAB_ACCOUNT flags). So we either have to increase
> the size of ALL slab objects, or create a pair of slab caches for each size.

Both options seem completely disproportionate in their memory cost,
and the latter one in terms of code complexity, to avoid the offset
calculation. As a share of the total object accounting cost, I'd
expect this to be minimal.

Does the mult stand out in an annotated perf profile?

Is it enough to bring back 500+ lines of code, an additional branch on
accounted allocations, and the memory fragmentation of split caches?

I highly doubt it.
Christoph Lameter (Ampere) April 30, 2020, 4:29 p.m. UTC | #11
On Mon, 27 Apr 2020, Roman Gushchin wrote:

> > Why do you need this? Just slap a pointer to the cgroup as additional
> > metadata onto the slab object. Is that not much simpler, safer and faster?
> >
>
> So, the problem is that not all slab objects are accounted, and sometimes
> we don't know in advance if they are accounted or not (with the current semantics
> of __GFP_ACCOUNT and SLAB_ACCOUNT flags). So we either have to increase
> the size of ALL slab objects, or create a pair of slab caches for each size.

>
> The first option is not that cheap in terms of the memory overhead. Especially
> for those who disable cgroups using a boot-time option.


If cgroups are disabled at boot time then you can switch back to the
compact version. Otherwise just add a pointer to each object. It will make
it consistent and there is not much memory wastage.

The problem comes about with the power of 2 caches in the kmalloc array.
If one keeps the "natural alignment" instead of going for the normal
alignment of slab caches then the alignment will cause a lot of memory
wastage and thus the scheme of off slab metadata is likely going to be
unavoidable.
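
A rough worked example of that wastage (my numbers, purely illustrative): a
kmalloc-512 object has to stay 512-byte aligned under the natural-alignment
rule, so appending an 8-byte memcg pointer pushes the per-object stride from
512 to ALIGN(512 + 8, 512) = 1024 bytes, wasting roughly half of every slab
page, which is why off-slab metadata looks unavoidable for these caches.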

But I think we are just stacking one bad idea onto another here making
things much more complex than they could be. Well at least this justifies
all our jobs .... (not mine I am out of work... hehehe)
Roman Gushchin April 30, 2020, 5:15 p.m. UTC | #12
On Thu, Apr 30, 2020 at 04:29:50PM +0000, Christoph Lameter wrote:
> On Mon, 27 Apr 2020, Roman Gushchin wrote:
> 
> > > Why do you need this? Just slap a pointer to the cgroup as additional
> > > metadata onto the slab object. Is that not much simpler, safer and faster?
> > >
> >
> > So, the problem is that not all slab objects are accounted, and sometimes
> > we don't know in advance if they are accounted or not (with the current semantics
> > of __GFP_ACCOUNT and SLAB_ACCOUNT flags). So we either have to increase
> > the size of ALL slab objects, or create a pair of slab caches for each size.
> 
> >
> > The first option is not that cheap in terms of the memory overhead. Especially
> > for those who disable cgroups using a boot-time option.
> 
> 
> If cgroups are disabled at boot time then you can switch back to the
> compact version. Otherwise just add a pointer to each object. It will make
> it consistent and there is not much memory wastage.
> 
> The problem comes about with the power of 2 caches in the kmalloc array.

It's a very good point, and it's an argument to stick with the current design
(an external vector of memcg pointers).

> If one keeps the "natural alignment" instead of going for the normal
> alignment of slab caches then the alignment will cause a lot of memory
> wastage and thus the scheme of off slab metadata is likely going to be
> unavoidable.
> 
> But I think we are just stacking one bad idea onto another here making
> things much more complex than they could be. Well at least this justifies
> all our jobs .... (not mine I am out of work... hehehe)

Sorry, but what exactly do you mean?
I don't think reducing the kernel memory footprint by almost half
is such a bad idea.

Thanks!
Christoph Lameter (Ampere) May 2, 2020, 11:54 p.m. UTC | #13
On Thu, 30 Apr 2020, Roman Gushchin wrote:

> Sorry, but what exactly do you mean?

I think the right approach is to add a pointer to each slab object for
memcg support.
Roman Gushchin May 4, 2020, 6:29 p.m. UTC | #14
On Sat, May 02, 2020 at 11:54:09PM +0000, Christoph Lameter wrote:
> On Thu, 30 Apr 2020, Roman Gushchin wrote:
> 
> > Sorry, but what exactly do you mean?
> 
> I think the right approach is to add a pointer to each slab object for
> memcg support.
>

As I understand, embedding the memcg pointer will hopefully make allocations
cheaper in terms of CPU, but will require more memory. And you think that
it's worth it. Is it a correct understanding?

Can you, please, describe a bit more detailed how it should be done
from your point of view?
I mean where to store the pointer, should it be SLAB/SLUB-specific code
or generic code, what to do with kmalloc alignments, should we
merge slabs which had a different size before and now have the same
because of the memcg pointer and alignment, etc.

I'm happy to follow your advice and perform some tests to get an idea of
how significant the memory overhead is and how big the CPU savings are.
I guess with these numbers it will be easy to make a decision.

Thanks!
Christoph Lameter (Ampere) May 8, 2020, 9:35 p.m. UTC | #15
On Mon, 4 May 2020, Roman Gushchin wrote:

> On Sat, May 02, 2020 at 11:54:09PM +0000, Christoph Lameter wrote:
> > On Thu, 30 Apr 2020, Roman Gushchin wrote:
> >
> > > Sorry, but what exactly do you mean?
> >
> > I think the right approach is to add a pointer to each slab object for
> > memcg support.
> >
>
> As I understand, embedding the memcg pointer will hopefully make allocations
> cheaper in terms of CPU, but will require more memory. And you think that
> it's worth it. Is it a correct understanding?

It definitely makes the code less complex. The additional memory is
minimal. In many cases you have already some space wasted at the end of
the object that could be used for the pointer.

> Can you, please, describe a bit more detailed how it should be done
> from your point of view?

Add it to the metadata at the end of the object. Like the debugging
information or the pointer for RCU freeing.
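
Concretely, the resulting object layout would be roughly the following (a
sketch only; the exact ordering of the metadata words depends on the cache's
ctor and debug options):

	/*
	 * [ payload: object_size bytes ]                  <- returned to the caller
	 * [ free pointer / red zone / alloc-free tracks ] <- existing metadata
	 * [ struct obj_cgroup *objcg ]                    <- the proposed extra word
	 * [ padding up to s->size ]
	 */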

> I mean where to store the pointer, should it be SLAB/SLUB-specific code
> or generic code, what to do with kmalloc alignments, should we
> merge slabs which had a different size before and now have the same
> because of the memcg pointer and alignment, etc.

Both SLAB and SLUB have the same capabilities there. Slabs that had
different sizes before will now have different sizes as well. So the
merging does not change.

> I'm happy to follow your advice and perform some tests to get an idea of
> how significant the memory overhead is and how big the CPU savings are.
> I guess with these numbers it will be easy to make a decision.

Sure. The main issue is the power-of-two kmalloc caches and how to add
the pointer to these caches in order not to waste memory. SLAB has done
this in the past by creating additional structures in a page frame.
Roman Gushchin May 13, 2020, 12:57 a.m. UTC | #16
On Fri, May 08, 2020 at 09:35:54PM +0000, Christoph Lameter wrote:
> On Mon, 4 May 2020, Roman Gushchin wrote:
> 
> > On Sat, May 02, 2020 at 11:54:09PM +0000, Christoph Lameter wrote:
> > > On Thu, 30 Apr 2020, Roman Gushchin wrote:
> > >
> > > > Sorry, but what exactly do you mean?
> > >
> > > I think the right approach is to add a pointer to each slab object for
> > > memcg support.
> > >
> >
> > As I understand, embedding the memcg pointer will hopefully make allocations
> > cheaper in terms of CPU, but will require more memory. And you think that
> > it's worth it. Is it a correct understanding?
> 
> It definitely makes the code less complex. The additional memory is
> minimal. In many cases you have already some space wasted at the end of
> the object that could be used for the pointer.
> 
> > Can you, please, describe a bit more detailed how it should be done
> > from your point of view?
> 
> Add it to the metadata at the end of the object. Like the debugging
> information or the pointer for RCU freeing.

Enabling debugging metadata currently disables the cache merging.
I doubt that it's acceptable to sacrifice the cache merging in order
to embed the memcg pointer?

> 
> > I mean where to store the pointer, should it be SLAB/SLUB-specific code
> > or generic code, what to do with kmalloc alignments, should we
> > merge slabs which had a different size before and now have the same
> > because of the memcg pointer and alignment, etc.
> 
> Both SLAB and SLUB have the same capabilities there. Slabs that had
> different sizes before will now have different sizes as well. So the
> merging does not change.

See above. Or should I add it to the object itself before the metadata?

> 
> > I'm happy to follow your advice and perform some tests to get an idea of
> > how significant the memory overhead is and how big the CPU savings are.
> > I guess with these numbers it will be easy to make a decision.
> 
> Sure. The main issue is the power-of-two kmalloc caches and how to add
> the pointer to these caches in order not to waste memory. SLAB has done
> this in the past by creating additional structures in a page frame.

But isn't it then similar to what I'm doing now?

Btw, I'm trying to build up a prototype with an embedded memcg pointer,
but it seems to be way more tricky than I thought. It requires changes to
shrinkers (as they rely on getting the memcg pointer by an arbitrary
kernel address, not necessarily aligned to the head of slab allocation),
figuring out cache merging, adding SLAB support, natural alignment of
kmallocs etc.

Figuring out all these details will likely take several weeks, so the whole
thing will be delayed for one or two major releases (in the best case). Given that
the current implementation saves ~40% of slab memory, I think there is some value
in delivering it as it is. So I wonder if the idea of embedding the pointer
should be considered a blocker, or whether it can be implemented on top of the proposed
code (given it's not a user-facing API or something like this)?

Thanks!
Roman Gushchin May 15, 2020, 8:02 p.m. UTC | #17
On Fri, May 08, 2020 at 09:35:54PM +0000, Christoph Lameter wrote:
> On Mon, 4 May 2020, Roman Gushchin wrote:
> 
> > On Sat, May 02, 2020 at 11:54:09PM +0000, Christoph Lameter wrote:
> > > On Thu, 30 Apr 2020, Roman Gushchin wrote:
> > >
> > > > Sorry, but what exactly do you mean?
> > >
> > > I think the right approach is to add a pointer to each slab object for
> > > memcg support.
> > >
> >
> > As I understand, embedding the memcg pointer will hopefully make allocations
> > cheaper in terms of CPU, but will require more memory. And you think that
> > it's worth it. Is it a correct understanding?
> 
> It definitely makes the code less complex. The additional memory is
> minimal. In many cases you have already some space wasted at the end of
> the object that could be used for the pointer.
> 
> > Can you, please, describe a bit more detailed how it should be done
> > from your point of view?
> 
> Add it to the metadata at the end of the object. Like the debugging
> information or the pointer for RCU freeing.

I've tried to make a prototype, but realized that I don't know how to do
it the right way with SLUB (without disabling cache merging, etc.)
and ended up debugging various memory corruptions.

memcg/kmem changes required to switch between different ways of storing
the memcg pointer are pretty minimal (diff below).

There are two functions which SLAB/SLUB should provide:

void set_obj_cgroup(struct kmem_cache *s, void *ptr, struct obj_cgroup *objcg);
struct obj_cgroup *obtain_obj_cgroup(struct kmem_cache *s, void *ptr);

Ideally, obtain_obj_cgroup should work with an arbitrary kernel pointer, e.g.
a pointer to some field in the structure allocated using kmem_cache_alloc().

Christopher, will you be able to help with the SLUB implementation?
It will be highly appreciated.

Thanks!

--

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4af95739ccb6..398a714874d8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2815,15 +2815,11 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
 
 	/*
 	 * Slab objects are accounted individually, not per-page.
-	 * Memcg membership data for each individual object is saved in
-	 * the page->obj_cgroups.
 	 */
-	if (page_has_obj_cgroups(page)) {
+	if (PageSlab(page)) {
 		struct obj_cgroup *objcg;
-		unsigned int off;
 
-		off = obj_to_index(page->slab_cache, page, p);
-		objcg = page_obj_cgroups(page)[off];
+		objcg = obtain_obj_cgroup(page->slab_cache, p);
 		if (objcg)
 			return obj_cgroup_memcg(objcg);
 
diff --git a/mm/slab.h b/mm/slab.h
index 13fadf33be5c..617ce017bc68 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -210,40 +210,15 @@ static inline int cache_vmstat_idx(struct kmem_cache *s)
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
+static inline void set_obj_cgroup(struct kmem_cache *s, void *ptr,
+				  struct obj_cgroup *objcg)
 {
-	/*
-	 * page->mem_cgroup and page->obj_cgroups are sharing the same
-	 * space. To distinguish between them in case we don't know for sure
-	 * that the page is a slab page (e.g. page_cgroup_ino()), let's
-	 * always set the lowest bit of obj_cgroups.
-	 */
-	return (struct obj_cgroup **)
-		((unsigned long)page->obj_cgroups & ~0x1UL);
-}
-
-static inline bool page_has_obj_cgroups(struct page *page)
-{
-	return ((unsigned long)page->obj_cgroups & 0x1UL);
-}
-
-static inline int memcg_alloc_page_obj_cgroups(struct page *page, gfp_t gfp,
-					       unsigned int objects)
-{
-	void *vec;
-
-	vec = kcalloc(objects, sizeof(struct obj_cgroup *), gfp);
-	if (!vec)
-		return -ENOMEM;
-
-	page->obj_cgroups = (struct obj_cgroup **) ((unsigned long)vec | 0x1UL);
-	return 0;
+	// TODO
 }
-
-static inline void memcg_free_page_obj_cgroups(struct page *page)
+static inline struct obj_cgroup *obtain_obj_cgroup(struct kmem_cache *s, void *ptr)
 {
-	kfree(page_obj_cgroups(page));
-	page->obj_cgroups = NULL;
+	// TODO
+	return NULL;
 }
 
 static inline size_t obj_full_size(struct kmem_cache *s)
@@ -296,7 +271,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 					      void **p)
 {
 	struct page *page;
-	unsigned long off;
 	size_t i;
 
 	if (!objcg)
@@ -306,17 +280,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 	for (i = 0; i < size; i++) {
 		if (likely(p[i])) {
 			page = virt_to_head_page(p[i]);
-
-			if (!page_has_obj_cgroups(page) &&
-			    memcg_alloc_page_obj_cgroups(page, flags,
-							 objs_per_slab(s))) {
-				obj_cgroup_uncharge(objcg, obj_full_size(s));
-				continue;
-			}
-
-			off = obj_to_index(s, page, p[i]);
 			obj_cgroup_get(objcg);
-			page_obj_cgroups(page)[off] = objcg;
+			set_obj_cgroup(s, p[i], objcg);
 			mod_objcg_state(objcg, page_pgdat(page),
 					cache_vmstat_idx(s), obj_full_size(s));
 		} else {
@@ -330,21 +295,17 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
 					void *p)
 {
 	struct obj_cgroup *objcg;
-	unsigned int off;
 
 	if (!memcg_kmem_enabled())
 		return;
 
-	if (!page_has_obj_cgroups(page))
-		return;
-
-	off = obj_to_index(s, page, p);
-	objcg = page_obj_cgroups(page)[off];
-	page_obj_cgroups(page)[off] = NULL;
+	objcg = obtain_obj_cgroup(s, p);
 
 	if (!objcg)
 		return;
 
+	set_obj_cgroup(s, p, NULL);
+
 	obj_cgroup_uncharge(objcg, obj_full_size(s));
 	mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
 			-obj_full_size(s));
@@ -363,16 +324,6 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
 	return NULL;
 }
 
-static inline int memcg_alloc_page_obj_cgroups(struct page *page, gfp_t gfp,
-					       unsigned int objects)
-{
-	return 0;
-}
-
-static inline void memcg_free_page_obj_cgroups(struct page *page)
-{
-}
-
 static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
 							   size_t objects,
 							   gfp_t flags)
@@ -415,8 +366,6 @@ static __always_inline void charge_slab_page(struct page *page,
 static __always_inline void uncharge_slab_page(struct page *page, int order,
 					       struct kmem_cache *s)
 {
-	memcg_free_page_obj_cgroups(page);
-
 	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
 			    -(PAGE_SIZE << order));
 }
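
For the embedded-pointer variant, one possible shape of the SLUB side of these
two hooks could be something like the sketch below. This is only an
illustration: obj_cgroup_slot() and the memcg_offset field are hypothetical
(nothing in the patchset defines them), and the !PageSlab/NULL cases are
ignored. The idea is that obj_to_index() already rounds an arbitrary inner
pointer down to its object, which matches the arbitrary-pointer requirement above.

static inline struct obj_cgroup **obj_cgroup_slot(struct kmem_cache *s,
						  struct page *page, void *ptr)
{
	void *addr = page_address(page);
	/* round the (possibly inner) pointer down to the object start */
	void *obj = addr + obj_to_index(s, page, ptr) * s->size;

	/* memcg_offset would locate the extra metadata word in the object */
	return (struct obj_cgroup **)(obj + s->memcg_offset);
}

static inline void set_obj_cgroup(struct kmem_cache *s, void *ptr,
				  struct obj_cgroup *objcg)
{
	*obj_cgroup_slot(s, virt_to_head_page(ptr), ptr) = objcg;
}

static inline struct obj_cgroup *obtain_obj_cgroup(struct kmem_cache *s, void *ptr)
{
	return *obj_cgroup_slot(s, virt_to_head_page(ptr), ptr);
}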
Christoph Lameter (Ampere) May 15, 2020, 9:45 p.m. UTC | #18
On Tue, 12 May 2020, Roman Gushchin wrote:

> > Add it to the metadata at the end of the object. Like the debugging
> > information or the pointer for RCU freeing.
>
> Enabling debugging metadata currently disables the cache merging.
> I doubt that it's acceptable to sacrifice the cache merging in order
> to embed the memcg pointer?

Well then keep the merging even if you have a memcg pointer.

The disabling for debugging is only to simplify debugging. You don't have
to deal with multiple caches actually using the same storage structures.

> Figuring out all these details will likely take several weeks, so the whole
> thing will be delayed for one or two major releases (in the best case). Given that
> the current implementation saves ~40% of slab memory, I think there is some value
> in delivering it as it is. So I wonder if the idea of embedding the pointer
> should be considered a blocker, or whether it can be implemented on top of the proposed
> code (given it's not a user-facing API or something like this)?

Sorry no idea from my end here.
Roman Gushchin May 15, 2020, 10:12 p.m. UTC | #19
On Fri, May 15, 2020 at 09:45:30PM +0000, Christoph Lameter wrote:
> On Tue, 12 May 2020, Roman Gushchin wrote:
> 
> > > Add it to the metadata at the end of the object. Like the debugging
> > > information or the pointer for RCU freeing.
> >
> > Enabling debugging metadata currently disables the cache merging.
> > I doubt that it's acceptable to sacrifice the cache merging in order
> > to embed the memcg pointer?
> 
> Well then keep the merging even if you have a memcg pointer.
> 
> The disabling for debugging is only to simplify debugging. You dont have
> to deal with multiple caches actually using the same storage structures.
> 
> > Figuring out all these details will likely take several weeks, so the whole
> > thing will be delayed for one or two major releases (in the best case). Given that
> > the current implementation saves ~40% of slab memory, I think there is some value
> > in delivering it as it is. So I wonder if the idea of embedding the pointer
> > should be considered a blocker, or whether it can be implemented on top of the proposed
> > code (given it's not a user-facing API or something like this)?
> 
> Sorry no idea from my end here.

Ok, then I'll continue working on embedding the pointer as an enhancement
*on top* of the current patchset. As I showed in my other e-mail, switching
to a different way of obj_cgroup storage is fairly trivial and doesn't change
much in the rest of the patchset.

Please, let me know if you're not ok with it.

Thanks!
Vlastimil Babka May 20, 2020, 9:51 a.m. UTC | #20
On 5/13/20 2:57 AM, Roman Gushchin wrote:
> 
> Btw, I'm trying to build up a prototype with an embedded memcg pointer,
> but it seems to be way more tricky than I thought. It requires changes to
> shrinkers (as they rely on getting the memcg pointer by an arbitrary
> kernel address, not necessarily aligned to the head of slab allocation),
> figuring out cache merging, adding SLAB support, natural alignment of
> kmallocs etc.

Is the natural alignment of kmallocs a problem right now? As kmalloc()
allocations are AFAIK not kmemcg-accounted? Or does your implementation add
memcg awareness to everything, even if non-__GFP_ACCOUNT allocations just get a
root memcg pointer?

> Figuring out all these details will likely take several weeks, so the whole
> thing will be delayed for one or two major releases (in the best case). Given that
> the current implementation saves ~40% of slab memory, I think there is some value
> in delivering it as it is. So I wonder if the idea of embedding the pointer
> should be considered a blocker, or whether it can be implemented on top of the proposed
> code (given it's not a user-facing API or something like this)?
> 
> Thanks!
>
Vlastimil Babka May 20, 2020, 1:51 p.m. UTC | #21
On 4/22/20 10:46 PM, Roman Gushchin wrote:
> This commit implements SLUB version of the obj_to_index() function,
> which will be required to calculate the offset of obj_cgroup in the
> obj_cgroups vector to store/obtain the objcg ownership data.
> 
> To make it faster, let's repeat the SLAB's trick introduced by
> commit 6a2d7a955d8d ("[PATCH] SLAB: use a multiply instead of a
> divide in obj_to_index()") and avoid an expensive division.
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>
> Acked-by: Christoph Lameter <cl@linux.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>

There's already a slab_index() doing the same without the trick, with only
SLUB_DEBUG callers. Maybe just improve it and perhaps rename? (obj_to_index()
seems more descriptive). The difference is that it takes the result of
page_address() instead of doing that, as it's being called in a loop on objects
from a single page, so you'd have to perhaps split to obj_to_index(page) and
__obj_to_index(addr) or something.

> ---
>  include/linux/slub_def.h | 9 +++++++++
>  mm/slub.c                | 1 +
>  2 files changed, 10 insertions(+)
> 
> diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> index d2153789bd9f..200ea292f250 100644
> --- a/include/linux/slub_def.h
> +++ b/include/linux/slub_def.h
> @@ -8,6 +8,7 @@
>   * (C) 2007 SGI, Christoph Lameter
>   */
>  #include <linux/kobject.h>
> +#include <linux/reciprocal_div.h>
>  
>  enum stat_item {
>  	ALLOC_FASTPATH,		/* Allocation from cpu slab */
> @@ -86,6 +87,7 @@ struct kmem_cache {
>  	unsigned long min_partial;
>  	unsigned int size;	/* The size of an object including metadata */
>  	unsigned int object_size;/* The size of an object without metadata */
> +	struct reciprocal_value reciprocal_size;
>  	unsigned int offset;	/* Free pointer offset */
>  #ifdef CONFIG_SLUB_CPU_PARTIAL
>  	/* Number of per cpu partial objects to keep around */
> @@ -182,4 +184,11 @@ static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
>  	return result;
>  }
>  
> +static inline unsigned int obj_to_index(const struct kmem_cache *cache,
> +					const struct page *page, void *obj)
> +{
> +	return reciprocal_divide(kasan_reset_tag(obj) - page_address(page),
> +				 cache->reciprocal_size);
> +}
> +
>  #endif /* _LINUX_SLUB_DEF_H */
> diff --git a/mm/slub.c b/mm/slub.c
> index 03071ae5ff07..8d16babe1829 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3660,6 +3660,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
>  	 */
>  	size = ALIGN(size, s->align);
>  	s->size = size;
> +	s->reciprocal_size = reciprocal_value(size);
>  	if (forced_order >= 0)
>  		order = forced_order;
>  	else
>
Roman Gushchin May 20, 2020, 8:57 p.m. UTC | #22
On Wed, May 20, 2020 at 11:51:51AM +0200, Vlastimil Babka wrote:
> On 5/13/20 2:57 AM, Roman Gushchin wrote:
> > 
> > Btw, I'm trying to build up a prototype with an embedded memcg pointer,
> > but it seems to be way more tricky than I thought. It requires changes to
> > shrinkers (as they rely on getting the memcg pointer by an arbitrary
> > kernel address, not necessarily aligned to the head of slab allocation),
> > figuring out cache merging, adding SLAB support, natural alignment of
> > kmallocs etc.
> 
> Is the natural alignment of kmallocs a problem right now? As kmalloc()
> allocations are AFAIK not kmemcg-accounted? Or does your implementation add
> memcg awareness to everything, even if non-__GFP_ACCOUNT allocations just get a
> root memcg pointer?

There are at least a dozen accounted kmallocs as of now; please search for kmalloc
with GFP_KERNEL_ACCOUNT.

Natural alignment is not an issue with the proposed implementation, but it becomes
a problem as soon as we try to embed the memcg pointer into the object
(as Christopher is suggesting). I'm not actually opposed to his suggestion; I just
want to settle the memcg part first, and then discuss the best way
to store the memcg metadata. As I've shown, the required changes to switch
between different ways of storing the data are minimal and do not affect
the rest of the patchset.

Thanks!
Roman Gushchin May 20, 2020, 9 p.m. UTC | #23
On Wed, May 20, 2020 at 03:51:45PM +0200, Vlastimil Babka wrote:
> On 4/22/20 10:46 PM, Roman Gushchin wrote:
> > This commit implements SLUB version of the obj_to_index() function,
> > which will be required to calculate the offset of obj_cgroup in the
> > obj_cgroups vector to store/obtain the objcg ownership data.
> > 
> > To make it faster, let's repeat the SLAB's trick introduced by
> > commit 6a2d7a955d8d ("[PATCH] SLAB: use a multiply instead of a
> > divide in obj_to_index()") and avoid an expensive division.
> > 
> > Signed-off-by: Roman Gushchin <guro@fb.com>
> > Acked-by: Christoph Lameter <cl@linux.com>
> > Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> 
> There's already a slab_index() doing the same without the trick, with only
> SLUB_DEBUG callers. Maybe just improve it and perhaps rename? (obj_to_index()
> seems more descriptive). The difference is that it takes the result of
> page_addr() instead of doing that, as it's being called in a loop on objects
> from a single page, so you'd have to perhaps split to obj_to_index(page) and
> __obj_to_index(addr) or something.

Good point! How about this one?

--

From beeaecdac85c3a395dcfb99944dc8c858b541cbf Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 29 Jul 2019 18:18:42 -0700
Subject: [PATCH v3.2 04/19] mm: slub: implement SLUB version of obj_to_index()

This commit implements SLUB version of the obj_to_index() function,
which will be required to calculate the offset of obj_cgroup in the
obj_cgroups vector to store/obtain the objcg ownership data.

To make it faster, let's repeat the SLAB's trick introduced by
commit 6a2d7a955d8d ("[PATCH] SLAB: use a multiply instead of a
divide in obj_to_index()") and avoid an expensive division.

Vlastimil Babka noticed that SLUB already has a similar
function called slab_index(), which is defined only if SLUB_DEBUG
is enabled. The function does similar math, but with a division,
and it also takes a page address instead of a page pointer.

Let's remove slab_index() and replace it with the new helper
__obj_to_index(), which takes a page address. obj_to_index()
will be a simple wrapper taking a page pointer and passing
page_address(page) into __obj_to_index().

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 include/linux/slub_def.h | 16 ++++++++++++++++
 mm/slub.c                | 15 +++++----------
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index d2153789bd9f..30e91c83d401 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -8,6 +8,7 @@
  * (C) 2007 SGI, Christoph Lameter
  */
 #include <linux/kobject.h>
+#include <linux/reciprocal_div.h>
 
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
@@ -86,6 +87,7 @@ struct kmem_cache {
 	unsigned long min_partial;
 	unsigned int size;	/* The size of an object including metadata */
 	unsigned int object_size;/* The size of an object without metadata */
+	struct reciprocal_value reciprocal_size;
 	unsigned int offset;	/* Free pointer offset */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	/* Number of per cpu partial objects to keep around */
@@ -182,4 +184,18 @@ static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
 	return result;
 }
 
+/* Determine object index from a given position */
+static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
+					  void *addr, void *obj)
+{
+	return reciprocal_divide(kasan_reset_tag(obj) - addr,
+				 cache->reciprocal_size);
+}
+
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+					const struct page *page, void *obj)
+{
+	return __obj_to_index(cache, page_address(page), obj);
+}
+
 #endif /* _LINUX_SLUB_DEF_H */
diff --git a/mm/slub.c b/mm/slub.c
index 2df4d4a420d1..d605d18b3c1b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -313,12 +313,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 		__p < (__addr) + (__objects) * (__s)->size; \
 		__p += (__s)->size)
 
-/* Determine object index from a given position */
-static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
-{
-	return (kasan_reset_tag(p) - addr) / s->size;
-}
-
 static inline unsigned int order_objects(unsigned int order, unsigned int size)
 {
 	return ((unsigned int)PAGE_SIZE << order) / size;
@@ -461,7 +455,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
 	bitmap_zero(object_map, page->objects);
 
 	for (p = page->freelist; p; p = get_freepointer(s, p))
-		set_bit(slab_index(p, s, addr), object_map);
+		set_bit(__obj_to_index(s, addr, p), object_map);
 
 	return object_map;
 }
@@ -3682,6 +3676,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	 */
 	size = ALIGN(size, s->align);
 	s->size = size;
+	s->reciprocal_size = reciprocal_value(size);
 	if (forced_order >= 0)
 		order = forced_order;
 	else
@@ -3788,7 +3783,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 	map = get_map(s, page);
 	for_each_object(p, s, addr, page->objects) {
 
-		if (!test_bit(slab_index(p, s, addr), map)) {
+		if (!test_bit(__obj_to_index(s, addr, p), map)) {
 			pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
 			print_tracking(s, p);
 		}
@@ -4513,7 +4508,7 @@ static void validate_slab(struct kmem_cache *s, struct page *page)
 	/* Now we know that a valid freelist exists */
 	map = get_map(s, page);
 	for_each_object(p, s, addr, page->objects) {
-		u8 val = test_bit(slab_index(p, s, addr), map) ?
+		u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
 			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
 
 		if (!check_object(s, page, p, val))
@@ -4704,7 +4699,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
 
 	map = get_map(s, page);
 	for_each_object(p, s, addr, page->objects)
-		if (!test_bit(slab_index(p, s, addr), map))
+		if (!test_bit(__obj_to_index(s, addr, p), map))
 			add_location(t, s, get_track(s, p, alloc));
 	put_map(map);
 }
Vlastimil Babka May 21, 2020, 11:01 a.m. UTC | #24
On 5/20/20 11:00 PM, Roman Gushchin wrote:
> 
> From beeaecdac85c3a395dcfb99944dc8c858b541cbf Mon Sep 17 00:00:00 2001
> From: Roman Gushchin <guro@fb.com>
> Date: Mon, 29 Jul 2019 18:18:42 -0700
> Subject: [PATCH v3.2 04/19] mm: slub: implement SLUB version of obj_to_index()
> 
> This commit implements SLUB version of the obj_to_index() function,
> which will be required to calculate the offset of obj_cgroup in the
> obj_cgroups vector to store/obtain the objcg ownership data.
> 
> To make it faster, let's repeat the SLAB's trick introduced by
> commit 6a2d7a955d8d ("[PATCH] SLAB: use a multiply instead of a
> divide in obj_to_index()") and avoid an expensive division.
> 
> Vlastimil Babka noticed that SLUB already has a similar
> function called slab_index(), which is defined only if SLUB_DEBUG
> is enabled. The function does similar math, but with a division,
> and it also takes a page address instead of a page pointer.
> 
> Let's remove slab_index() and replace it with the new helper
> __obj_to_index(), which takes a page address. obj_to_index()
> will be a simple wrapper taking a page pointer and passing
> page_address(page) into __obj_to_index().
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>

Looks good!

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>  include/linux/slub_def.h | 16 ++++++++++++++++
>  mm/slub.c                | 15 +++++----------
>  2 files changed, 21 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> index d2153789bd9f..30e91c83d401 100644
> --- a/include/linux/slub_def.h
> +++ b/include/linux/slub_def.h
> @@ -8,6 +8,7 @@
>   * (C) 2007 SGI, Christoph Lameter
>   */
>  #include <linux/kobject.h>
> +#include <linux/reciprocal_div.h>
>  
>  enum stat_item {
>  	ALLOC_FASTPATH,		/* Allocation from cpu slab */
> @@ -86,6 +87,7 @@ struct kmem_cache {
>  	unsigned long min_partial;
>  	unsigned int size;	/* The size of an object including metadata */
>  	unsigned int object_size;/* The size of an object without metadata */
> +	struct reciprocal_value reciprocal_size;
>  	unsigned int offset;	/* Free pointer offset */
>  #ifdef CONFIG_SLUB_CPU_PARTIAL
>  	/* Number of per cpu partial objects to keep around */
> @@ -182,4 +184,18 @@ static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
>  	return result;
>  }
>  
> +/* Determine object index from a given position */
> +static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
> +					  void *addr, void *obj)
> +{
> +	return reciprocal_divide(kasan_reset_tag(obj) - addr,
> +				 cache->reciprocal_size);
> +}
> +
> +static inline unsigned int obj_to_index(const struct kmem_cache *cache,
> +					const struct page *page, void *obj)
> +{
> +	return __obj_to_index(cache, page_address(page), obj);
> +}
> +
>  #endif /* _LINUX_SLUB_DEF_H */
> diff --git a/mm/slub.c b/mm/slub.c
> index 2df4d4a420d1..d605d18b3c1b 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -313,12 +313,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
>  		__p < (__addr) + (__objects) * (__s)->size; \
>  		__p += (__s)->size)
>  
> -/* Determine object index from a given position */
> -static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
> -{
> -	return (kasan_reset_tag(p) - addr) / s->size;
> -}
> -
>  static inline unsigned int order_objects(unsigned int order, unsigned int size)
>  {
>  	return ((unsigned int)PAGE_SIZE << order) / size;
> @@ -461,7 +455,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
>  	bitmap_zero(object_map, page->objects);
>  
>  	for (p = page->freelist; p; p = get_freepointer(s, p))
> -		set_bit(slab_index(p, s, addr), object_map);
> +		set_bit(__obj_to_index(s, addr, p), object_map);
>  
>  	return object_map;
>  }
> @@ -3682,6 +3676,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
>  	 */
>  	size = ALIGN(size, s->align);
>  	s->size = size;
> +	s->reciprocal_size = reciprocal_value(size);
>  	if (forced_order >= 0)
>  		order = forced_order;
>  	else
> @@ -3788,7 +3783,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
>  	map = get_map(s, page);
>  	for_each_object(p, s, addr, page->objects) {
>  
> -		if (!test_bit(slab_index(p, s, addr), map)) {
> +		if (!test_bit(__obj_to_index(s, addr, p), map)) {
>  			pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
>  			print_tracking(s, p);
>  		}
> @@ -4513,7 +4508,7 @@ static void validate_slab(struct kmem_cache *s, struct page *page)
>  	/* Now we know that a valid freelist exists */
>  	map = get_map(s, page);
>  	for_each_object(p, s, addr, page->objects) {
> -		u8 val = test_bit(slab_index(p, s, addr), map) ?
> +		u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
>  			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
>  
>  		if (!check_object(s, page, p, val))
> @@ -4704,7 +4699,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
>  
>  	map = get_map(s, page);
>  	for_each_object(p, s, addr, page->objects)
> -		if (!test_bit(slab_index(p, s, addr), map))
> +		if (!test_bit(__obj_to_index(s, addr, p), map))
>  			add_location(t, s, get_track(s, p, alloc));
>  	put_map(map);
>  }
>
Roman Gushchin May 21, 2020, 9:06 p.m. UTC | #25
On Thu, May 21, 2020 at 01:01:38PM +0200, Vlastimil Babka wrote:
> On 5/20/20 11:00 PM, Roman Gushchin wrote:
> > 
> > From beeaecdac85c3a395dcfb99944dc8c858b541cbf Mon Sep 17 00:00:00 2001
> > From: Roman Gushchin <guro@fb.com>
> > Date: Mon, 29 Jul 2019 18:18:42 -0700
> > Subject: [PATCH v3.2 04/19] mm: slub: implement SLUB version of obj_to_index()
> > 
> > This commit implements SLUB version of the obj_to_index() function,
> > which will be required to calculate the offset of obj_cgroup in the
> > obj_cgroups vector to store/obtain the objcg ownership data.
> > 
> > To make it faster, let's repeat the SLAB's trick introduced by
> > commit 6a2d7a955d8d ("[PATCH] SLAB: use a multiply instead of a
> > divide in obj_to_index()") and avoid an expensive division.
> > 
> > Vlastimil Babka noticed that SLUB already has a similar
> > function called slab_index(), which is defined only if SLUB_DEBUG
> > is enabled. The function does similar math, but with a division,
> > and it also takes a page address instead of a page pointer.
> > 
> > Let's remove slab_index() and replace it with the new helper
> > __obj_to_index(), which takes a page address. obj_to_index()
> > will be a simple wrapper taking a page pointer and passing
> > page_address(page) into __obj_to_index().
> > 
> > Signed-off-by: Roman Gushchin <guro@fb.com>
> 
> Looks good!
> 
> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

Thanks!

Patch

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index d2153789bd9f..200ea292f250 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -8,6 +8,7 @@ 
  * (C) 2007 SGI, Christoph Lameter
  */
 #include <linux/kobject.h>
+#include <linux/reciprocal_div.h>
 
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
@@ -86,6 +87,7 @@  struct kmem_cache {
 	unsigned long min_partial;
 	unsigned int size;	/* The size of an object including metadata */
 	unsigned int object_size;/* The size of an object without metadata */
+	struct reciprocal_value reciprocal_size;
 	unsigned int offset;	/* Free pointer offset */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	/* Number of per cpu partial objects to keep around */
@@ -182,4 +184,11 @@  static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
 	return result;
 }
 
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+					const struct page *page, void *obj)
+{
+	return reciprocal_divide(kasan_reset_tag(obj) - page_address(page),
+				 cache->reciprocal_size);
+}
+
 #endif /* _LINUX_SLUB_DEF_H */
diff --git a/mm/slub.c b/mm/slub.c
index 03071ae5ff07..8d16babe1829 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3660,6 +3660,7 @@  static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	 */
 	size = ALIGN(size, s->align);
 	s->size = size;
+	s->reciprocal_size = reciprocal_value(size);
 	if (forced_order >= 0)
 		order = forced_order;
 	else