
[v4,2/2] mm: avoid slub allocation while holding list_lock

Message ID 20191108193958.205102-2-yuzhao@google.com (mailing list archive)
State New, archived
Series [v4,1/2] mm: clean up validate_slab()

Commit Message

Yu Zhao Nov. 8, 2019, 7:39 p.m. UTC
If we are already under list_lock, don't call kmalloc(). Otherwise we
will run into a deadlock because kmalloc() also tries to grab the same
lock.

Fix the problem by using a static bitmap instead.

  WARNING: possible recursive locking detected
  --------------------------------------------
  mount-encrypted/4921 is trying to acquire lock:
  (&(&n->list_lock)->rlock){-.-.}, at: ___slab_alloc+0x104/0x437

  but task is already holding lock:
  (&(&n->list_lock)->rlock){-.-.}, at: __kmem_cache_shutdown+0x81/0x3cb

  other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(&(&n->list_lock)->rlock);
    lock(&(&n->list_lock)->rlock);

   *** DEADLOCK ***

Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 mm/slub.c | 88 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 47 insertions(+), 41 deletions(-)
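
For context, the pre-patch code behind this splat can be sketched from the lines
the diff at the bottom of this page removes (a simplified reconstruction, not a
verbatim excerpt of mm/slub.c):

static void list_slab_objects(struct kmem_cache *s, struct page *page,
			      const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
	void *addr = page_address(page);
	void *p;
	/*
	 * free_partial() calls this with n->list_lock already held;
	 * bitmap_zalloc() is a kmalloc() and can recurse into
	 * ___slab_alloc(), which takes the same n->list_lock.
	 */
	unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);

	if (!map)
		return;
	/* ... walk the slab and print the objects still in use ... */
#endif
}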

Comments

Christoph Lameter (Ampere) Nov. 9, 2019, 8:52 p.m. UTC | #1
On Fri, 8 Nov 2019, Yu Zhao wrote:

> If we are already under list_lock, don't call kmalloc(). Otherwise we
> will run into deadlock because kmalloc() also tries to grab the same
> lock.

How did this happen? The kmalloc needs to be always done before the
list_lock is taken.

> Fixing the problem by using a static bitmap instead.
>
>   WARNING: possible recursive locking detected
>   --------------------------------------------
>   mount-encrypted/4921 is trying to acquire lock:
>   (&(&n->list_lock)->rlock){-.-.}, at: ___slab_alloc+0x104/0x437
>
>   but task is already holding lock:
>   (&(&n->list_lock)->rlock){-.-.}, at: __kmem_cache_shutdown+0x81/0x3cb
>
>   other info that might help us debug this:
>    Possible unsafe locking scenario:
>
>          CPU0
>          ----
>     lock(&(&n->list_lock)->rlock);
>     lock(&(&n->list_lock)->rlock);
>
>    *** DEADLOCK ***


Ahh. list_slab_objects() in shutdown?

There is a much easier fix for this:



[FIX] slub: Remove kmalloc under list_lock from list_slab_objects()

list_slab_objects() is called when a slab is destroyed and objects are still left
over, in order to list those objects in the syslog. This is a pretty rare event.

And there, it seems, we take the list_lock and call kmalloc() while holding that lock.

Perform the allocation in free_partial() before the list_lock is taken.

Fixes: bbd7d57bfe852d9788bae5fb171c7edb4021d8ac ("slub: Potential stack overflow")
Signed-off-by: Christoph Lameter <cl@linux.com>

Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c	2019-10-15 13:54:57.032655296 +0000
+++ linux/mm/slub.c	2019-11-09 20:43:52.374187381 +0000
@@ -3690,14 +3690,11 @@ error:
 }

 static void list_slab_objects(struct kmem_cache *s, struct page *page,
-							const char *text)
+					const char *text, unsigned long *map)
 {
 #ifdef CONFIG_SLUB_DEBUG
 	void *addr = page_address(page);
 	void *p;
-	unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
-	if (!map)
-		return;
 	slab_err(s, page, text, s->name);
 	slab_lock(page);

@@ -3723,6 +3720,10 @@ static void free_partial(struct kmem_cac
 {
 	LIST_HEAD(discard);
 	struct page *page, *h;
+	unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
+
+	if (!map)
+		return;

 	BUG_ON(irqs_disabled());
 	spin_lock_irq(&n->list_lock);
@@ -3732,7 +3733,8 @@ static void free_partial(struct kmem_cac
 			list_add(&page->slab_list, &discard);
 		} else {
 			list_slab_objects(s, page,
-			"Objects remaining in %s on __kmem_cache_shutdown()");
+			"Objects remaining in %s on __kmem_cache_shutdown()",
+			map);
 		}
 	}
 	spin_unlock_irq(&n->list_lock);
Yu Zhao Nov. 9, 2019, 11:01 p.m. UTC | #2
On Sat, Nov 09, 2019 at 08:52:29PM +0000, Christopher Lameter wrote:
> On Fri, 8 Nov 2019, Yu Zhao wrote:
> 
> > If we are already under list_lock, don't call kmalloc(). Otherwise we
> > will run into deadlock because kmalloc() also tries to grab the same
> > lock.
> 
> How did this happen? The kmalloc needs to be always done before the
> list_lock is taken.
> 
> > Fixing the problem by using a static bitmap instead.
> >
> >   WARNING: possible recursive locking detected
> >   --------------------------------------------
> >   mount-encrypted/4921 is trying to acquire lock:
> >   (&(&n->list_lock)->rlock){-.-.}, at: ___slab_alloc+0x104/0x437
> >
> >   but task is already holding lock:
> >   (&(&n->list_lock)->rlock){-.-.}, at: __kmem_cache_shutdown+0x81/0x3cb
> >
> >   other info that might help us debug this:
> >    Possible unsafe locking scenario:
> >
> >          CPU0
> >          ----
> >     lock(&(&n->list_lock)->rlock);
> >     lock(&(&n->list_lock)->rlock);
> >
> >    *** DEADLOCK ***
> 
> 
> Ahh. list_slab_objects() in shutdown?
> 
> There is a much easier fix for this:
> 
> 
> 
> [FIX] slub: Remove kmalloc under list_lock from list_slab_objects()
> 
> list_slab_objects() is called when a slab is destroyed and there are objects still left
> to list the objects in the syslog. This is a pretty rare event.
> 
> And there it seems we take the list_lock and call kmalloc while holding that lock.
> 
> Perform the allocation in free_partial() before the list_lock is taken.
> 
> Fixes: bbd7d57bfe852d9788bae5fb171c7edb4021d8ac ("slub: Potential stack overflow")
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c	2019-10-15 13:54:57.032655296 +0000
> +++ linux/mm/slub.c	2019-11-09 20:43:52.374187381 +0000
> @@ -3690,14 +3690,11 @@ error:
>  }
> 
>  static void list_slab_objects(struct kmem_cache *s, struct page *page,
> -							const char *text)
> +					const char *text, unsigned long *map)
>  {
>  #ifdef CONFIG_SLUB_DEBUG
>  	void *addr = page_address(page);
>  	void *p;
> -	unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
> -	if (!map)
> -		return;
>  	slab_err(s, page, text, s->name);
>  	slab_lock(page);
> 
> @@ -3723,6 +3720,10 @@ static void free_partial(struct kmem_cac
>  {
>  	LIST_HEAD(discard);
>  	struct page *page, *h;
> +	unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
> +
> +	if (!map)
> +		return;

What would happen if we are trying to allocate from the slab that is
being shut down? And shouldn't the allocation be conditional (i.e.,
only when CONFIG_SLUB_DEBUG=y)?
Christoph Lameter (Ampere) Nov. 9, 2019, 11:16 p.m. UTC | #3
On Sat, 9 Nov 2019, Yu Zhao wrote:

> >  	struct page *page, *h;
> > +	unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
> > +
> > +	if (!map)
> > +		return;
>
> What would happen if we are trying to allocate from the slab that is
> being shut down? And shouldn't the allocation be conditional (i.e.,
> only when CONFIG_SLUB_DEBUG=y)?

Kmalloc slabs are never shut down.

The allocation does not hurt and CONFIG_SLUB_DEBUG is on in most
configurations.
Yu Zhao Nov. 10, 2019, 6:47 p.m. UTC | #4
On Sat, Nov 09, 2019 at 11:16:28PM +0000, Christopher Lameter wrote:
> On Sat, 9 Nov 2019, Yu Zhao wrote:
> 
> > >  	struct page *page, *h;
> > > +	unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
> > > +
> > > +	if (!map)
> > > +		return;
> >
> > What would happen if we are trying to allocate from the slab that is
> > being shut down? And shouldn't the allocation be conditional (i.e.,
> > only when CONFIG_SLUB_DEBUG=y)?
> 
> Kmalloc slabs are never shut down.

Maybe I'm not thinking straight -- isn't it what caused the deadlock in
the first place?

Kmalloc slabs can be shut down when memcg is on.
Christoph Lameter (Ampere) Nov. 11, 2019, 3:47 p.m. UTC | #5
On Sun, 10 Nov 2019, Yu Zhao wrote:

> On Sat, Nov 09, 2019 at 11:16:28PM +0000, Christopher Lameter wrote:
> > On Sat, 9 Nov 2019, Yu Zhao wrote:
> >
> > > >  	struct page *page, *h;
> > > > +	unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
> > > > +
> > > > +	if (!map)
> > > > +		return;
> > >
> > > What would happen if we are trying to allocate from the slab that is
> > > being shut down? And shouldn't the allocation be conditional (i.e.,
> > > only when CONFIG_SLUB_DEBUG=y)?
> >
> > Kmalloc slabs are never shut down.
>
> Maybe I'm not thinking straight -- isn't it what caused the deadlock in
> the first place?

Well if kmalloc allocations become a problem then we have numerous
issues all over the kernel to fix.

> Kmalloc slabs can be shut down when memcg is on.

Kmalloc needs to work even during shutdown of a memcg.

Maybe we need to fix memcg to not allocate from the current memcg during
shutdown?
Shakeel Butt Nov. 11, 2019, 6:15 p.m. UTC | #6
+Roman Gushchin

On Mon, Nov 11, 2019 at 7:47 AM Christopher Lameter <cl@linux.com> wrote:
>
> On Sun, 10 Nov 2019, Yu Zhao wrote:
>
> > On Sat, Nov 09, 2019 at 11:16:28PM +0000, Christopher Lameter wrote:
> > > On Sat, 9 Nov 2019, Yu Zhao wrote:
> > >
> > > > >         struct page *page, *h;
> > > > > +       unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
> > > > > +
> > > > > +       if (!map)
> > > > > +               return;
> > > >
> > > > What would happen if we are trying to allocate from the slab that is
> > > > being shut down? And shouldn't the allocation be conditional (i.e.,
> > > > only when CONFIG_SLUB_DEBUG=y)?
> > >
> > > Kmalloc slabs are never shut down.
> >
> > Maybe I'm not thinking straight -- isn't it what caused the deadlock in
> > the first place?
>
> Well if kmalloc allocations become a problem then we have numerous
> issues all over the kernel to fix.
>
> > Kmalloc slabs can be shut down when memcg is on.
>
> Kmalloc needs to work even during shutdown of a memcg.
>
> Maybe we need to fix memcg to not allocate from the current memcg during
> shutdown?
>
>

Roman recently added reparenting of memcg kmem caches on memcg offline
and can comment in more detail, but we don't shut down a kmem cache
until all the in-flight memcg allocations are resolved. Also, the
allocation here does not look like a __GFP_ACCOUNT allocation.

Patch
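
The patch below converts the SLUB debug walkers (list_slab_objects(),
validate_slab(), process_slab()) to the following pattern; this is a condensed
sketch of the hunks that follow, not a verbatim excerpt, with error handling and
unrelated details omitted:

	unsigned long *map;
	void *p;

	/*
	 * The caller already holds n->list_lock with IRQs disabled, so no
	 * allocation may happen here.  Instead, get_map() hands back one
	 * statically sized bitmap (object_map) and serializes its users
	 * with the new object_map_lock spinlock.
	 */
	map = get_map(s, page);
	for_each_object(p, s, addr, page->objects) {
		if (!test_bit(slab_index(p, s, addr), map)) {
			/* this object is in use (not on the freelist) */
		}
	}
	put_map(map);	/* drops object_map_lock */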

diff --git a/mm/slub.c b/mm/slub.c
index 6930c3febad7..7a4ec3c4b4d9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -441,19 +441,38 @@  static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 }
 
 #ifdef CONFIG_SLUB_DEBUG
+static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
+static DEFINE_SPINLOCK(object_map_lock);
+
 /*
  * Determine a map of object in use on a page.
  *
  * Node listlock must be held to guarantee that the page does
  * not vanish from under us.
  */
-static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
+static unsigned long *get_map(struct kmem_cache *s, struct page *page)
 {
 	void *p;
 	void *addr = page_address(page);
 
+	VM_BUG_ON(!irqs_disabled());
+
+	spin_lock(&object_map_lock);
+
+	bitmap_zero(object_map, page->objects);
+
 	for (p = page->freelist; p; p = get_freepointer(s, p))
-		set_bit(slab_index(p, s, addr), map);
+		set_bit(slab_index(p, s, addr), object_map);
+
+	return object_map;
+}
+
+static void put_map(unsigned long *map)
+{
+	VM_BUG_ON(map != object_map);
+	lockdep_assert_held(&object_map_lock);
+
+	spin_unlock(&object_map_lock);
 }
 
 static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -3695,13 +3714,12 @@  static void list_slab_objects(struct kmem_cache *s, struct page *page,
 #ifdef CONFIG_SLUB_DEBUG
 	void *addr = page_address(page);
 	void *p;
-	unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
-	if (!map)
-		return;
+	unsigned long *map;
+
 	slab_err(s, page, text, s->name);
 	slab_lock(page);
 
-	get_map(s, page, map);
+	map = get_map(s, page);
 	for_each_object(p, s, addr, page->objects) {
 
 		if (!test_bit(slab_index(p, s, addr), map)) {
@@ -3709,8 +3727,9 @@  static void list_slab_objects(struct kmem_cache *s, struct page *page,
 			print_tracking(s, p);
 		}
 	}
+	put_map(map);
+
 	slab_unlock(page);
-	bitmap_free(map);
 #endif
 }
 
@@ -4404,19 +4423,19 @@  static int count_total(struct page *page)
 #endif
 
 #ifdef CONFIG_SLUB_DEBUG
-static void validate_slab(struct kmem_cache *s, struct page *page,
-						unsigned long *map)
+static void validate_slab(struct kmem_cache *s, struct page *page)
 {
 	void *p;
 	void *addr = page_address(page);
+	unsigned long *map;
+
+	slab_lock(page);
 
 	if (!check_slab(s, page) || !on_freelist(s, page, NULL))
-		return;
+		goto unlock;
 
 	/* Now we know that a valid freelist exists */
-	bitmap_zero(map, page->objects);
-
-	get_map(s, page, map);
+	map = get_map(s, page);
 	for_each_object(p, s, addr, page->objects) {
 		u8 val = test_bit(slab_index(p, s, addr), map) ?
 			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
@@ -4424,18 +4443,13 @@  static void validate_slab(struct kmem_cache *s, struct page *page,
 		if (!check_object(s, page, p, val))
 			break;
 	}
-}
-
-static void validate_slab_slab(struct kmem_cache *s, struct page *page,
-						unsigned long *map)
-{
-	slab_lock(page);
-	validate_slab(s, page, map);
+	put_map(map);
+unlock:
 	slab_unlock(page);
 }
 
 static int validate_slab_node(struct kmem_cache *s,
-		struct kmem_cache_node *n, unsigned long *map)
+		struct kmem_cache_node *n)
 {
 	unsigned long count = 0;
 	struct page *page;
@@ -4444,7 +4458,7 @@  static int validate_slab_node(struct kmem_cache *s,
 	spin_lock_irqsave(&n->list_lock, flags);
 
 	list_for_each_entry(page, &n->partial, slab_list) {
-		validate_slab_slab(s, page, map);
+		validate_slab(s, page);
 		count++;
 	}
 	if (count != n->nr_partial)
@@ -4455,7 +4469,7 @@  static int validate_slab_node(struct kmem_cache *s,
 		goto out;
 
 	list_for_each_entry(page, &n->full, slab_list) {
-		validate_slab_slab(s, page, map);
+		validate_slab(s, page);
 		count++;
 	}
 	if (count != atomic_long_read(&n->nr_slabs))
@@ -4472,15 +4486,11 @@  static long validate_slab_cache(struct kmem_cache *s)
 	int node;
 	unsigned long count = 0;
 	struct kmem_cache_node *n;
-	unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
-
-	if (!map)
-		return -ENOMEM;
 
 	flush_all(s);
 	for_each_kmem_cache_node(s, node, n)
-		count += validate_slab_node(s, n, map);
-	bitmap_free(map);
+		count += validate_slab_node(s, n);
+
 	return count;
 }
 /*
@@ -4610,18 +4620,17 @@  static int add_location(struct loc_track *t, struct kmem_cache *s,
 }
 
 static void process_slab(struct loc_track *t, struct kmem_cache *s,
-		struct page *page, enum track_item alloc,
-		unsigned long *map)
+		struct page *page, enum track_item alloc)
 {
 	void *addr = page_address(page);
 	void *p;
+	unsigned long *map;
 
-	bitmap_zero(map, page->objects);
-	get_map(s, page, map);
-
+	map = get_map(s, page);
 	for_each_object(p, s, addr, page->objects)
 		if (!test_bit(slab_index(p, s, addr), map))
 			add_location(t, s, get_track(s, p, alloc));
+	put_map(map);
 }
 
 static int list_locations(struct kmem_cache *s, char *buf,
@@ -4632,11 +4641,9 @@  static int list_locations(struct kmem_cache *s, char *buf,
 	struct loc_track t = { 0, 0, NULL };
 	int node;
 	struct kmem_cache_node *n;
-	unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
 
-	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
-				     GFP_KERNEL)) {
-		bitmap_free(map);
+	if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+			     GFP_KERNEL)) {
 		return sprintf(buf, "Out of memory\n");
 	}
 	/* Push back cpu slabs */
@@ -4651,9 +4658,9 @@  static int list_locations(struct kmem_cache *s, char *buf,
 
 		spin_lock_irqsave(&n->list_lock, flags);
 		list_for_each_entry(page, &n->partial, slab_list)
-			process_slab(&t, s, page, alloc, map);
+			process_slab(&t, s, page, alloc);
 		list_for_each_entry(page, &n->full, slab_list)
-			process_slab(&t, s, page, alloc, map);
+			process_slab(&t, s, page, alloc);
 		spin_unlock_irqrestore(&n->list_lock, flags);
 	}
 
@@ -4702,7 +4709,6 @@  static int list_locations(struct kmem_cache *s, char *buf,
 	}
 
 	free_loc_track(&t);
-	bitmap_free(map);
 	if (!t.count)
 		len += sprintf(buf, "No data\n");
 	return len;