diff mbox series

slub: Introduce CONFIG_SLUB_RCU_DEBUG

Message ID 20230825211426.3798691-1-jannh@google.com (mailing list archive)
State Handled Elsewhere
Headers show
Series slub: Introduce CONFIG_SLUB_RCU_DEBUG | expand

Commit Message

Jann Horn Aug. 25, 2023, 9:14 p.m. UTC
Currently, KASAN is unable to catch use-after-free in SLAB_TYPESAFE_BY_RCU
slabs because use-after-free is allowed within the RCU grace period by
design.

Add a SLUB debugging feature which RCU-delays every individual
kmem_cache_free() before either actually freeing the object or handing it
off to KASAN, and change KASAN to poison freed objects as normal when this
option is enabled.

Note that this creates a 16-byte unpoisoned area in the middle of the
slab metadata area, which kinda sucks but seems to be necessary in order
to be able to store an rcu_head in there without triggering an ASAN
splat during RCU callback processing.

For now I've configured Kconfig.kasan to always enable this feature in the
GENERIC and SW_TAGS modes; I'm not forcibly enabling it in HW_TAGS mode
because I'm not sure if it might have unwanted performance degradation
effects there.

Signed-off-by: Jann Horn <jannh@google.com>
---
can I get a review from the KASAN folks of this?
I have been running it on my laptop for a bit and it seems to be working
fine.

Notes:
    With this patch, a UAF on a TYPESAFE_BY_RCU will splat with an error
    like this (tested by reverting a security bugfix).
    Note that, in the ASAN memory state dump, we can see the little
    unpoisoned 16-byte areas storing the rcu_head.
    
    BUG: KASAN: slab-use-after-free in folio_lock_anon_vma_read+0x129/0x4c0
    Read of size 8 at addr ffff888004e85b00 by task forkforkfork/592
    
    CPU: 0 PID: 592 Comm: forkforkfork Not tainted 6.5.0-rc7-00105-gae70c1e1f6f5-dirty #334
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
    Call Trace:
     <TASK>
     dump_stack_lvl+0x4a/0x80
     print_report+0xcf/0x660
     kasan_report+0xd4/0x110
     folio_lock_anon_vma_read+0x129/0x4c0
     rmap_walk_anon+0x1cc/0x290
     folio_referenced+0x277/0x2a0
     shrink_folio_list+0xb8c/0x1680
     reclaim_folio_list+0xdc/0x1f0
     reclaim_pages+0x211/0x280
     madvise_cold_or_pageout_pte_range+0x812/0xb70
     walk_pgd_range+0x70b/0xce0
     __walk_page_range+0x343/0x360
     walk_page_range+0x227/0x280
     madvise_pageout+0x1cd/0x2d0
     do_madvise+0x552/0x15a0
     __x64_sys_madvise+0x62/0x70
     do_syscall_64+0x3b/0x90
     entry_SYSCALL_64_after_hwframe+0x6e/0xd8
    [...]
     </TASK>
    
    Allocated by task 574:
     kasan_save_stack+0x33/0x60
     kasan_set_track+0x25/0x30
     __kasan_slab_alloc+0x6e/0x70
     kmem_cache_alloc+0xfd/0x2b0
     anon_vma_fork+0x88/0x270
     dup_mmap+0x87c/0xc10
     copy_process+0x3399/0x3590
     kernel_clone+0x10e/0x480
     __do_sys_clone+0xa1/0xe0
     do_syscall_64+0x3b/0x90
     entry_SYSCALL_64_after_hwframe+0x6e/0xd8
    
    Freed by task 0:
     kasan_save_stack+0x33/0x60
     kasan_set_track+0x25/0x30
     kasan_save_free_info+0x2b/0x50
     __kasan_slab_free+0xfe/0x180
     slab_free_after_rcu_debug+0xad/0x200
     rcu_core+0x638/0x1620
     __do_softirq+0x14c/0x581
    
    Last potentially related work creation:
     kasan_save_stack+0x33/0x60
     __kasan_record_aux_stack+0x94/0xa0
     __call_rcu_common.constprop.0+0x47/0x730
     __put_anon_vma+0x6e/0x150
     unlink_anon_vmas+0x277/0x2e0
     vma_complete+0x341/0x580
     vma_merge+0x613/0xff0
     mprotect_fixup+0x1c0/0x510
     do_mprotect_pkey+0x5a7/0x710
     __x64_sys_mprotect+0x47/0x60
     do_syscall_64+0x3b/0x90
     entry_SYSCALL_64_after_hwframe+0x6e/0xd8
    
    Second to last potentially related work creation:
    [...]
    
    The buggy address belongs to the object at ffff888004e85b00
     which belongs to the cache anon_vma of size 192
    The buggy address is located 0 bytes inside of
     freed 192-byte region [ffff888004e85b00, ffff888004e85bc0)
    
    The buggy address belongs to the physical page:
    [...]
    
    Memory state around the buggy address:
     ffff888004e85a00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
     ffff888004e85a80: 00 00 00 00 00 00 00 00 fc 00 00 fc fc fc fc fc
    >ffff888004e85b00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                       ^
     ffff888004e85b80: fb fb fb fb fb fb fb fb fc 00 00 fc fc fc fc fc
     ffff888004e85c00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

 include/linux/kasan.h    |  6 ++++
 include/linux/slub_def.h |  3 ++
 lib/Kconfig.kasan        |  2 ++
 mm/Kconfig.debug         | 21 +++++++++++++
 mm/kasan/common.c        | 15 ++++++++-
 mm/slub.c                | 66 +++++++++++++++++++++++++++++++++++++---
 6 files changed, 107 insertions(+), 6 deletions(-)


base-commit: 4f9e7fabf8643003afefc172e62dd276686f016e

Comments

Dmitry Vyukov Aug. 26, 2023, 3:32 a.m. UTC | #1
On Fri, 25 Aug 2023 at 23:15, Jann Horn <jannh@google.com> wrote:
>
> Currently, KASAN is unable to catch use-after-free in SLAB_TYPESAFE_BY_RCU
> slabs because use-after-free is allowed within the RCU grace period by
> design.
>
> Add a SLUB debugging feature which RCU-delays every individual
> kmem_cache_free() before either actually freeing the object or handing it
> off to KASAN, and change KASAN to poison freed objects as normal when this
> option is enabled.
>
> Note that this creates a 16-byte unpoisoned area in the middle of the
> slab metadata area, which kinda sucks but seems to be necessary in order
> to be able to store an rcu_head in there without triggering an ASAN
> splat during RCU callback processing.

Nice!

Can't we unpoision this rcu_head right before call_rcu() and repoison
after receiving the callback?

What happens on cache destruction?
Currently we purge quarantine on cache destruction to be able to
safely destroy the cache. I suspect we may need to somehow purge rcu
callbacks as well, or do something else.



> For now I've configured Kconfig.kasan to always enable this feature in the
> GENERIC and SW_TAGS modes; I'm not forcibly enabling it in HW_TAGS mode
> because I'm not sure if it might have unwanted performance degradation
> effects there.
>
> Signed-off-by: Jann Horn <jannh@google.com>
> ---
> can I get a review from the KASAN folks of this?
> I have been running it on my laptop for a bit and it seems to be working
> fine.
>
> Notes:
>     With this patch, a UAF on a TYPESAFE_BY_RCU will splat with an error
>     like this (tested by reverting a security bugfix).
>     Note that, in the ASAN memory state dump, we can see the little
>     unpoisoned 16-byte areas storing the rcu_head.
>
>     BUG: KASAN: slab-use-after-free in folio_lock_anon_vma_read+0x129/0x4c0
>     Read of size 8 at addr ffff888004e85b00 by task forkforkfork/592
>
>     CPU: 0 PID: 592 Comm: forkforkfork Not tainted 6.5.0-rc7-00105-gae70c1e1f6f5-dirty #334
>     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
>     Call Trace:
>      <TASK>
>      dump_stack_lvl+0x4a/0x80
>      print_report+0xcf/0x660
>      kasan_report+0xd4/0x110
>      folio_lock_anon_vma_read+0x129/0x4c0
>      rmap_walk_anon+0x1cc/0x290
>      folio_referenced+0x277/0x2a0
>      shrink_folio_list+0xb8c/0x1680
>      reclaim_folio_list+0xdc/0x1f0
>      reclaim_pages+0x211/0x280
>      madvise_cold_or_pageout_pte_range+0x812/0xb70
>      walk_pgd_range+0x70b/0xce0
>      __walk_page_range+0x343/0x360
>      walk_page_range+0x227/0x280
>      madvise_pageout+0x1cd/0x2d0
>      do_madvise+0x552/0x15a0
>      __x64_sys_madvise+0x62/0x70
>      do_syscall_64+0x3b/0x90
>      entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>     [...]
>      </TASK>
>
>     Allocated by task 574:
>      kasan_save_stack+0x33/0x60
>      kasan_set_track+0x25/0x30
>      __kasan_slab_alloc+0x6e/0x70
>      kmem_cache_alloc+0xfd/0x2b0
>      anon_vma_fork+0x88/0x270
>      dup_mmap+0x87c/0xc10
>      copy_process+0x3399/0x3590
>      kernel_clone+0x10e/0x480
>      __do_sys_clone+0xa1/0xe0
>      do_syscall_64+0x3b/0x90
>      entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>
>     Freed by task 0:
>      kasan_save_stack+0x33/0x60
>      kasan_set_track+0x25/0x30
>      kasan_save_free_info+0x2b/0x50
>      __kasan_slab_free+0xfe/0x180
>      slab_free_after_rcu_debug+0xad/0x200
>      rcu_core+0x638/0x1620
>      __do_softirq+0x14c/0x581
>
>     Last potentially related work creation:
>      kasan_save_stack+0x33/0x60
>      __kasan_record_aux_stack+0x94/0xa0
>      __call_rcu_common.constprop.0+0x47/0x730
>      __put_anon_vma+0x6e/0x150
>      unlink_anon_vmas+0x277/0x2e0
>      vma_complete+0x341/0x580
>      vma_merge+0x613/0xff0
>      mprotect_fixup+0x1c0/0x510
>      do_mprotect_pkey+0x5a7/0x710
>      __x64_sys_mprotect+0x47/0x60
>      do_syscall_64+0x3b/0x90
>      entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>
>     Second to last potentially related work creation:
>     [...]
>
>     The buggy address belongs to the object at ffff888004e85b00
>      which belongs to the cache anon_vma of size 192
>     The buggy address is located 0 bytes inside of
>      freed 192-byte region [ffff888004e85b00, ffff888004e85bc0)
>
>     The buggy address belongs to the physical page:
>     [...]
>
>     Memory state around the buggy address:
>      ffff888004e85a00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>      ffff888004e85a80: 00 00 00 00 00 00 00 00 fc 00 00 fc fc fc fc fc
>     >ffff888004e85b00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>                        ^
>      ffff888004e85b80: fb fb fb fb fb fb fb fb fc 00 00 fc fc fc fc fc
>      ffff888004e85c00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>
>  include/linux/kasan.h    |  6 ++++
>  include/linux/slub_def.h |  3 ++
>  lib/Kconfig.kasan        |  2 ++
>  mm/Kconfig.debug         | 21 +++++++++++++
>  mm/kasan/common.c        | 15 ++++++++-
>  mm/slub.c                | 66 +++++++++++++++++++++++++++++++++++++---
>  6 files changed, 107 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> index 819b6bc8ac08..45e07caf4704 100644
> --- a/include/linux/kasan.h
> +++ b/include/linux/kasan.h
> @@ -229,6 +229,8 @@ static __always_inline bool kasan_check_byte(const void *addr)
>         return true;
>  }
>
> +size_t kasan_align(size_t size);
> +
>  #else /* CONFIG_KASAN */
>
>  static inline void kasan_unpoison_range(const void *address, size_t size) {}
> @@ -278,6 +280,10 @@ static inline bool kasan_check_byte(const void *address)
>  {
>         return true;
>  }
> +static inline size_t kasan_align(size_t size)
> +{
> +       return size;
> +}
>
>  #endif /* CONFIG_KASAN */
>
> diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> index deb90cf4bffb..b87be8fce64a 100644
> --- a/include/linux/slub_def.h
> +++ b/include/linux/slub_def.h
> @@ -120,6 +120,9 @@ struct kmem_cache {
>         int refcount;           /* Refcount for slab cache destroy */
>         void (*ctor)(void *);
>         unsigned int inuse;             /* Offset to metadata */
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       unsigned int debug_rcu_head_offset;
> +#endif
>         unsigned int align;             /* Alignment */
>         unsigned int red_left_pad;      /* Left redzone padding size */
>         const char *name;       /* Name (only for display!) */
> diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
> index fdca89c05745..7ff7de96c6e4 100644
> --- a/lib/Kconfig.kasan
> +++ b/lib/Kconfig.kasan
> @@ -79,6 +79,7 @@ config KASAN_GENERIC
>         depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC
>         depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS
>         select SLUB_DEBUG if SLUB
> +       select SLUB_RCU_DEBUG if SLUB_DEBUG
>         select CONSTRUCTORS
>         help
>           Enables Generic KASAN.
> @@ -96,6 +97,7 @@ config KASAN_SW_TAGS
>         depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS
>         depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS
>         select SLUB_DEBUG if SLUB
> +       select SLUB_RCU_DEBUG if SLUB_DEBUG
>         select CONSTRUCTORS
>         help
>           Enables Software Tag-Based KASAN.
> diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
> index 018a5bd2f576..99cce7f0fbef 100644
> --- a/mm/Kconfig.debug
> +++ b/mm/Kconfig.debug
> @@ -78,6 +78,27 @@ config SLUB_DEBUG_ON
>           off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
>           "slub_debug=-".
>
> +config SLUB_RCU_DEBUG
> +       bool "Make use-after-free detection possible in TYPESAFE_BY_RCU caches"
> +       depends on SLUB && SLUB_DEBUG
> +       default n
> +       help
> +         Make SLAB_TYPESAFE_BY_RCU caches behave approximately as if the cache
> +         was not marked as SLAB_TYPESAFE_BY_RCU and every caller used
> +         kfree_rcu() instead.
> +
> +         This is intended for use in combination with KASAN, to enable KASAN to
> +         detect use-after-free accesses in such caches.
> +         (KFENCE is able to do that independent of this flag.)
> +
> +         This might degrade performance.
> +
> +         If you're using this for testing bugs / fuzzing and care about
> +         catching all the bugs WAY more than performance, you might want to
> +         also turn on CONFIG_RCU_STRICT_GRACE_PERIOD.
> +
> +         If unsure, say N.
> +
>  config PAGE_OWNER
>         bool "Track page owner"
>         depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
> diff --git a/mm/kasan/common.c b/mm/kasan/common.c
> index 256930da578a..b4a3504f9f5e 100644
> --- a/mm/kasan/common.c
> +++ b/mm/kasan/common.c
> @@ -191,6 +191,13 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
>         if (kasan_requires_meta())
>                 kasan_init_object_meta(cache, object);
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       if (cache->flags & SLAB_TYPESAFE_BY_RCU) {
> +               kasan_unpoison(object + cache->debug_rcu_head_offset,
> +                              sizeof(struct rcu_head), false);
> +       }
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
> +
>         /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
>         object = set_tag(object, assign_tag(cache, object, true));
>
> @@ -218,7 +225,8 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
>         }
>
>         /* RCU slabs could be legally used after free within the RCU period */
> -       if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
> +       if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU) &&
> +           !IS_ENABLED(CONFIG_SLUB_RCU_DEBUG))
>                 return false;
>
>         if (!kasan_byte_accessible(tagged_object)) {
> @@ -450,3 +458,8 @@ bool __kasan_check_byte(const void *address, unsigned long ip)
>         }
>         return true;
>  }
> +
> +size_t kasan_align(size_t size)
> +{
> +       return round_up(size, KASAN_GRANULE_SIZE);
> +}
> diff --git a/mm/slub.c b/mm/slub.c
> index e3b5d5c0eb3a..bae6c2bc1e5f 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1108,7 +1108,8 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
>   *     A. Free pointer (if we cannot overwrite object on free)
>   *     B. Tracking data for SLAB_STORE_USER
>   *     C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
> - *     D. Padding to reach required alignment boundary or at minimum
> + *     D. RCU head for CONFIG_SLUB_RCU_DEBUG (with padding around it)
> + *     E. Padding to reach required alignment boundary or at minimum
>   *             one word if debugging is on to be able to detect writes
>   *             before the word boundary.
>   *
> @@ -1134,6 +1135,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
>                         off += sizeof(unsigned int);
>         }
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       if (s->flags & SLAB_TYPESAFE_BY_RCU)
> +               off = kasan_align(s->debug_rcu_head_offset + sizeof(struct rcu_head));
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
> +
>         off += kasan_metadata_size(s, false);
>
>         if (size_from_object(s) == off)
> @@ -1751,12 +1757,17 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
>  #endif
>  #endif /* CONFIG_SLUB_DEBUG */
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
> +#endif
> +
>  /*
>   * Hooks for other subsystems that check memory allocations. In a typical
>   * production configuration these hooks all should produce no code at all.
>   */
>  static __always_inline bool slab_free_hook(struct kmem_cache *s,
> -                                               void *x, bool init)
> +                                               void *x, bool init,
> +                                               bool after_rcu_delay)
>  {
>         kmemleak_free_recursive(x, s->flags);
>         kmsan_slab_free(s, x);
> @@ -1766,8 +1777,18 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s,
>         if (!(s->flags & SLAB_DEBUG_OBJECTS))
>                 debug_check_no_obj_freed(x, s->object_size);
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       /* kfence does its own RCU delay */
> +       if ((s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay &&
> +           !is_kfence_address(x)) {
> +               call_rcu(kasan_reset_tag(x) + s->debug_rcu_head_offset,
> +                        slab_free_after_rcu_debug);
> +               return true;
> +       }
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
> +
>         /* Use KCSAN to help debug racy use-after-free. */
> -       if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
> +       if (!(s->flags & SLAB_TYPESAFE_BY_RCU) || after_rcu_delay)
>                 __kcsan_check_access(x, s->object_size,
>                                      KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
>
> @@ -1802,7 +1823,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
>         void *old_tail = *tail ? *tail : *head;
>
>         if (is_kfence_address(next)) {
> -               slab_free_hook(s, next, false);
> +               slab_free_hook(s, next, false, false);
>                 return true;
>         }
>
> @@ -1815,7 +1836,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
>                 next = get_freepointer(s, object);
>
>                 /* If object's reuse doesn't have to be delayed */
> -               if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
> +               if (!slab_free_hook(s, object, slab_want_init_on_free(s), false)) {
>                         /* Move object to the new freelist */
>                         set_freepointer(s, object, *head);
>                         *head = object;
> @@ -3802,6 +3823,31 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
>                 do_slab_free(s, slab, head, tail, cnt, addr);
>  }
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
> +{
> +       struct slab *slab = virt_to_slab(rcu_head);
> +       struct kmem_cache *s;
> +       void *object;
> +
> +       if (WARN_ON(is_kfence_address(rcu_head)))
> +               return;
> +
> +       /* find the object and the cache again */
> +       if (WARN_ON(!slab))
> +               return;
> +       s = slab->slab_cache;
> +       if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
> +               return;
> +       object = (void *)rcu_head - s->debug_rcu_head_offset;
> +
> +       /* resume freeing */
> +       if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
> +               return;
> +       do_slab_free(s, slab, object, NULL, 1, _THIS_IP_);
> +}
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
> +
>  #ifdef CONFIG_KASAN_GENERIC
>  void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
>  {
> @@ -4443,6 +4489,16 @@ static int calculate_sizes(struct kmem_cache *s)
>                 if (flags & SLAB_KMALLOC)
>                         size += sizeof(unsigned int);
>         }
> +
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       if (flags & SLAB_TYPESAFE_BY_RCU) {
> +               size = kasan_align(size);
> +               size = ALIGN(size, __alignof__(struct rcu_head));
> +               s->debug_rcu_head_offset = size;
> +               size += sizeof(struct rcu_head);
> +               size = kasan_align(size);
> +       }
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
>  #endif
>
>         kasan_cache_create(s, &size, &s->flags);
>
> base-commit: 4f9e7fabf8643003afefc172e62dd276686f016e
> --
> 2.42.0.rc1.204.g551eb34607-goog
>
Jann Horn Aug. 28, 2023, 2:39 p.m. UTC | #2
On Sat, Aug 26, 2023 at 5:32 AM Dmitry Vyukov <dvyukov@google.com> wrote:
> On Fri, 25 Aug 2023 at 23:15, Jann Horn <jannh@google.com> wrote:
> > Currently, KASAN is unable to catch use-after-free in SLAB_TYPESAFE_BY_RCU
> > slabs because use-after-free is allowed within the RCU grace period by
> > design.
> >
> > Add a SLUB debugging feature which RCU-delays every individual
> > kmem_cache_free() before either actually freeing the object or handing it
> > off to KASAN, and change KASAN to poison freed objects as normal when this
> > option is enabled.
> >
> > Note that this creates a 16-byte unpoisoned area in the middle of the
> > slab metadata area, which kinda sucks but seems to be necessary in order
> > to be able to store an rcu_head in there without triggering an ASAN
> > splat during RCU callback processing.
>
> Nice!
>
> Can't we unpoision this rcu_head right before call_rcu() and repoison
> after receiving the callback?

Yeah, I think that should work. It looks like currently
kasan_unpoison() is exposed in include/linux/kasan.h but
kasan_poison() is not, and its inline definition probably means I
can't just move it out of mm/kasan/kasan.h into include/linux/kasan.h;
do you have a preference for how I should handle this? Hmm, and it
also looks like code outside of mm/kasan/ anyway wouldn't know what
are valid values for the "value" argument to kasan_poison().
I also have another feature idea that would also benefit from having
something like kasan_poison() available in include/linux/kasan.h, so I
would prefer that over adding another special-case function inside
KASAN for poisoning this piece of slab metadata...

I guess I could define a wrapper around kasan_poison() in
mm/kasan/generic.c that uses a new poison value for "some other part
of the kernel told us to poison this area", and then expose that
wrapper with a declaration in include/mm/kasan.h? Something like:

void kasan_poison_outline(const void *addr, size_t size, bool init)
{
  kasan_poison(addr, size, KASAN_CUSTOM, init);
}

> What happens on cache destruction?
> Currently we purge quarantine on cache destruction to be able to
> safely destroy the cache. I suspect we may need to somehow purge rcu
> callbacks as well, or do something else.

Ooh, good point, I hadn't thought about that... currently
shutdown_cache() assumes that all the objects have already been freed,
then puts the kmem_cache on a list for
slab_caches_to_rcu_destroy_workfn(), which then waits with an
rcu_barrier() until the slab's pages are all gone.

Luckily kmem_cache_destroy() is already a sleepable operation, so
maybe I should just slap another rcu_barrier() in there for builds
with this config option enabled... I think that should be fine for an
option mostly intended for debugging.
kernel test robot Sept. 5, 2023, 7:57 a.m. UTC | #3
Hello,

kernel test robot noticed "BUG_bio-#(Tainted:G_S):Objects_remaining_in_bio-#on__kmem_cache_shutdown()" on:

commit: d0dc03872a9d13afc16b9f12e69cb0dc60437198 ("[PATCH] slub: Introduce CONFIG_SLUB_RCU_DEBUG")
url: https://github.com/intel-lab-lkp/linux/commits/Jann-Horn/slub-Introduce-CONFIG_SLUB_RCU_DEBUG/20230826-051804
patch link: https://lore.kernel.org/all/20230825211426.3798691-1-jannh@google.com/
patch subject: [PATCH] slub: Introduce CONFIG_SLUB_RCU_DEBUG

in testcase: mdadm-selftests
version: mdadm-selftests-x86_64-5f41845-1_20220826
with following parameters:

	disk: 1HDD
	test_prefix: 01replace



compiler: gcc-12
test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-4790T CPU @ 2.70GHz (Haswell) with 16G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202309051537.999262cc-oliver.sang@intel.com


kern  :info  : [  120.278340] md: recovery of RAID array md0
kern  :info  : [  124.955979] md: md0: recovery done.
kern  :info  : [  125.769059] md: recovery of RAID array md0
kern  :info  : [  127.326184] md: md0: recovery done.
kern  :err   : [  133.408242] =============================================================================
kern  :err   : [  133.417160] BUG bio-144 (Tainted: G S                ): Objects remaining in bio-144 on __kmem_cache_shutdown()
kern  :err   : [  133.427951] -----------------------------------------------------------------------------

kern  :err   : [  133.439000] Slab 0x00000000419abc96 objects=32 used=2 fp=0x000000008e0d14fb flags=0x17ffffc0010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff)
kern  :warn  : [  133.452568] CPU: 3 PID: 1314 Comm: mdadm Tainted: G S                 6.5.0-rc7-00105-gd0dc03872a9d #1
kern  :warn  : [  133.462585] Hardware name: Gigabyte Technology Co., Ltd. Z97X-UD5H/Z97X-UD5H, BIOS F9 04/21/2015
kern  :warn  : [  133.472076] Call Trace:
kern  :warn  : [  133.475236]  <TASK>
kern :warn : [  133.478051] dump_stack_lvl (kbuild/src/consumer/lib/dump_stack.c:107 (discriminator 1)) 
kern :warn : [  133.482427] slab_err (kbuild/src/consumer/mm/slub.c:1016) 
kern :warn : [  133.486370] ? _raw_spin_lock_irqsave (kbuild/src/consumer/arch/x86/include/asm/atomic.h:115 kbuild/src/consumer/include/linux/atomic/atomic-arch-fallback.h:2155 kbuild/src/consumer/include/linux/atomic/atomic-instrumented.h:1296 kbuild/src/consumer/include/asm-generic/qspinlock.h:111 kbuild/src/consumer/include/linux/spinlock.h:187 kbuild/src/consumer/include/linux/spinlock_api_smp.h:111 kbuild/src/consumer/kernel/locking/spinlock.c:162) 
kern :warn : [  133.491615] __kmem_cache_shutdown (kbuild/src/consumer/include/linux/spinlock.h:351 kbuild/src/consumer/mm/slub.c:4625 kbuild/src/consumer/mm/slub.c:4656 kbuild/src/consumer/mm/slub.c:4688) 


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20230905/202309051537.999262cc-oliver.sang@intel.com
Dmitry Vyukov Sept. 11, 2023, 9:50 a.m. UTC | #4
On Mon, 28 Aug 2023 at 16:40, Jann Horn <jannh@google.com> wrote:
>
> On Sat, Aug 26, 2023 at 5:32 AM Dmitry Vyukov <dvyukov@google.com> wrote:
> > On Fri, 25 Aug 2023 at 23:15, Jann Horn <jannh@google.com> wrote:
> > > Currently, KASAN is unable to catch use-after-free in SLAB_TYPESAFE_BY_RCU
> > > slabs because use-after-free is allowed within the RCU grace period by
> > > design.
> > >
> > > Add a SLUB debugging feature which RCU-delays every individual
> > > kmem_cache_free() before either actually freeing the object or handing it
> > > off to KASAN, and change KASAN to poison freed objects as normal when this
> > > option is enabled.
> > >
> > > Note that this creates a 16-byte unpoisoned area in the middle of the
> > > slab metadata area, which kinda sucks but seems to be necessary in order
> > > to be able to store an rcu_head in there without triggering an ASAN
> > > splat during RCU callback processing.
> >
> > Nice!
> >
> > Can't we unpoision this rcu_head right before call_rcu() and repoison
> > after receiving the callback?
>
> Yeah, I think that should work. It looks like currently
> kasan_unpoison() is exposed in include/linux/kasan.h but
> kasan_poison() is not, and its inline definition probably means I
> can't just move it out of mm/kasan/kasan.h into include/linux/kasan.h;
> do you have a preference for how I should handle this? Hmm, and it
> also looks like code outside of mm/kasan/ anyway wouldn't know what
> are valid values for the "value" argument to kasan_poison().
> I also have another feature idea that would also benefit from having
> something like kasan_poison() available in include/linux/kasan.h, so I
> would prefer that over adding another special-case function inside
> KASAN for poisoning this piece of slab metadata...
>
> I guess I could define a wrapper around kasan_poison() in
> mm/kasan/generic.c that uses a new poison value for "some other part
> of the kernel told us to poison this area", and then expose that
> wrapper with a declaration in include/mm/kasan.h? Something like:
>
> void kasan_poison_outline(const void *addr, size_t size, bool init)
> {
>   kasan_poison(addr, size, KASAN_CUSTOM, init);
> }

Looks reasonable.

> > What happens on cache destruction?
> > Currently we purge quarantine on cache destruction to be able to
> > safely destroy the cache. I suspect we may need to somehow purge rcu
> > callbacks as well, or do something else.
>
> Ooh, good point, I hadn't thought about that... currently
> shutdown_cache() assumes that all the objects have already been freed,
> then puts the kmem_cache on a list for
> slab_caches_to_rcu_destroy_workfn(), which then waits with an
> rcu_barrier() until the slab's pages are all gone.

I guess this is what the test robot found as well.

> Luckily kmem_cache_destroy() is already a sleepable operation, so
> maybe I should just slap another rcu_barrier() in there for builds
> with this config option enabled... I think that should be fine for an
> option mostly intended for debugging.

This is definitely the simplest option.
I am a bit concerned about performance if massive cache destruction
happens (e.g. maybe during destruction of a set of namespaces for a
container). Net namespace is slow to destroy for this reason IIRC,
there were some optimizations to batch rcu synchronization. And now we
are adding more.
But I don't see any reasonable faster option as well.
So I guess let's do this now and optimize later (or not).
Marco Elver Sept. 11, 2023, 11:51 a.m. UTC | #5
On Fri, 25 Aug 2023 at 23:15, 'Jann Horn' via kasan-dev
<kasan-dev@googlegroups.com> wrote:
>
> Currently, KASAN is unable to catch use-after-free in SLAB_TYPESAFE_BY_RCU
> slabs because use-after-free is allowed within the RCU grace period by
> design.
>
> Add a SLUB debugging feature which RCU-delays every individual
> kmem_cache_free() before either actually freeing the object or handing it
> off to KASAN, and change KASAN to poison freed objects as normal when this
> option is enabled.
>
> Note that this creates a 16-byte unpoisoned area in the middle of the
> slab metadata area, which kinda sucks but seems to be necessary in order
> to be able to store an rcu_head in there without triggering an ASAN
> splat during RCU callback processing.
>
> For now I've configured Kconfig.kasan to always enable this feature in the
> GENERIC and SW_TAGS modes; I'm not forcibly enabling it in HW_TAGS mode
> because I'm not sure if it might have unwanted performance degradation
> effects there.
>
> Signed-off-by: Jann Horn <jannh@google.com>
> ---
> can I get a review from the KASAN folks of this?
> I have been running it on my laptop for a bit and it seems to be working
> fine.
>
> Notes:
>     With this patch, a UAF on a TYPESAFE_BY_RCU will splat with an error
>     like this (tested by reverting a security bugfix).
>     Note that, in the ASAN memory state dump, we can see the little
>     unpoisoned 16-byte areas storing the rcu_head.
>
>     BUG: KASAN: slab-use-after-free in folio_lock_anon_vma_read+0x129/0x4c0
>     Read of size 8 at addr ffff888004e85b00 by task forkforkfork/592
>
>     CPU: 0 PID: 592 Comm: forkforkfork Not tainted 6.5.0-rc7-00105-gae70c1e1f6f5-dirty #334
>     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
>     Call Trace:
>      <TASK>
>      dump_stack_lvl+0x4a/0x80
>      print_report+0xcf/0x660
>      kasan_report+0xd4/0x110
>      folio_lock_anon_vma_read+0x129/0x4c0
>      rmap_walk_anon+0x1cc/0x290
>      folio_referenced+0x277/0x2a0
>      shrink_folio_list+0xb8c/0x1680
>      reclaim_folio_list+0xdc/0x1f0
>      reclaim_pages+0x211/0x280
>      madvise_cold_or_pageout_pte_range+0x812/0xb70
>      walk_pgd_range+0x70b/0xce0
>      __walk_page_range+0x343/0x360
>      walk_page_range+0x227/0x280
>      madvise_pageout+0x1cd/0x2d0
>      do_madvise+0x552/0x15a0
>      __x64_sys_madvise+0x62/0x70
>      do_syscall_64+0x3b/0x90
>      entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>     [...]
>      </TASK>
>
>     Allocated by task 574:
>      kasan_save_stack+0x33/0x60
>      kasan_set_track+0x25/0x30
>      __kasan_slab_alloc+0x6e/0x70
>      kmem_cache_alloc+0xfd/0x2b0
>      anon_vma_fork+0x88/0x270
>      dup_mmap+0x87c/0xc10
>      copy_process+0x3399/0x3590
>      kernel_clone+0x10e/0x480
>      __do_sys_clone+0xa1/0xe0
>      do_syscall_64+0x3b/0x90
>      entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>
>     Freed by task 0:
>      kasan_save_stack+0x33/0x60
>      kasan_set_track+0x25/0x30
>      kasan_save_free_info+0x2b/0x50
>      __kasan_slab_free+0xfe/0x180
>      slab_free_after_rcu_debug+0xad/0x200
>      rcu_core+0x638/0x1620
>      __do_softirq+0x14c/0x581
>
>     Last potentially related work creation:
>      kasan_save_stack+0x33/0x60
>      __kasan_record_aux_stack+0x94/0xa0
>      __call_rcu_common.constprop.0+0x47/0x730
>      __put_anon_vma+0x6e/0x150
>      unlink_anon_vmas+0x277/0x2e0
>      vma_complete+0x341/0x580
>      vma_merge+0x613/0xff0
>      mprotect_fixup+0x1c0/0x510
>      do_mprotect_pkey+0x5a7/0x710
>      __x64_sys_mprotect+0x47/0x60
>      do_syscall_64+0x3b/0x90
>      entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>
>     Second to last potentially related work creation:
>     [...]
>
>     The buggy address belongs to the object at ffff888004e85b00
>      which belongs to the cache anon_vma of size 192
>     The buggy address is located 0 bytes inside of
>      freed 192-byte region [ffff888004e85b00, ffff888004e85bc0)
>
>     The buggy address belongs to the physical page:
>     [...]
>
>     Memory state around the buggy address:
>      ffff888004e85a00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>      ffff888004e85a80: 00 00 00 00 00 00 00 00 fc 00 00 fc fc fc fc fc
>     >ffff888004e85b00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>                        ^
>      ffff888004e85b80: fb fb fb fb fb fb fb fb fc 00 00 fc fc fc fc fc
>      ffff888004e85c00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>
>  include/linux/kasan.h    |  6 ++++
>  include/linux/slub_def.h |  3 ++
>  lib/Kconfig.kasan        |  2 ++
>  mm/Kconfig.debug         | 21 +++++++++++++
>  mm/kasan/common.c        | 15 ++++++++-
>  mm/slub.c                | 66 +++++++++++++++++++++++++++++++++++++---

Nice!

It'd be good to add a test case to lib/test_kasan module. I think you
could just copy/adjust the test case "test_memcache_typesafe_by_rcu"
from the KFENCE KUnit test suite.

>  6 files changed, 107 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> index 819b6bc8ac08..45e07caf4704 100644
> --- a/include/linux/kasan.h
> +++ b/include/linux/kasan.h
> @@ -229,6 +229,8 @@ static __always_inline bool kasan_check_byte(const void *addr)
>         return true;
>  }
>
> +size_t kasan_align(size_t size);
> +
>  #else /* CONFIG_KASAN */
>
>  static inline void kasan_unpoison_range(const void *address, size_t size) {}
> @@ -278,6 +280,10 @@ static inline bool kasan_check_byte(const void *address)
>  {
>         return true;
>  }
> +static inline size_t kasan_align(size_t size)
> +{
> +       return size;
> +}
>
>  #endif /* CONFIG_KASAN */
>
> diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> index deb90cf4bffb..b87be8fce64a 100644
> --- a/include/linux/slub_def.h
> +++ b/include/linux/slub_def.h
> @@ -120,6 +120,9 @@ struct kmem_cache {
>         int refcount;           /* Refcount for slab cache destroy */
>         void (*ctor)(void *);
>         unsigned int inuse;             /* Offset to metadata */
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       unsigned int debug_rcu_head_offset;
> +#endif
>         unsigned int align;             /* Alignment */
>         unsigned int red_left_pad;      /* Left redzone padding size */
>         const char *name;       /* Name (only for display!) */
> diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
> index fdca89c05745..7ff7de96c6e4 100644
> --- a/lib/Kconfig.kasan
> +++ b/lib/Kconfig.kasan
> @@ -79,6 +79,7 @@ config KASAN_GENERIC
>         depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC
>         depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS
>         select SLUB_DEBUG if SLUB
> +       select SLUB_RCU_DEBUG if SLUB_DEBUG
>         select CONSTRUCTORS
>         help
>           Enables Generic KASAN.
> @@ -96,6 +97,7 @@ config KASAN_SW_TAGS
>         depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS
>         depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS
>         select SLUB_DEBUG if SLUB
> +       select SLUB_RCU_DEBUG if SLUB_DEBUG
>         select CONSTRUCTORS
>         help
>           Enables Software Tag-Based KASAN.
> diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
> index 018a5bd2f576..99cce7f0fbef 100644
> --- a/mm/Kconfig.debug
> +++ b/mm/Kconfig.debug
> @@ -78,6 +78,27 @@ config SLUB_DEBUG_ON
>           off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
>           "slub_debug=-".
>
> +config SLUB_RCU_DEBUG
> +       bool "Make use-after-free detection possible in TYPESAFE_BY_RCU caches"
> +       depends on SLUB && SLUB_DEBUG
> +       default n
> +       help
> +         Make SLAB_TYPESAFE_BY_RCU caches behave approximately as if the cache
> +         was not marked as SLAB_TYPESAFE_BY_RCU and every caller used
> +         kfree_rcu() instead.
> +
> +         This is intended for use in combination with KASAN, to enable KASAN to
> +         detect use-after-free accesses in such caches.
> +         (KFENCE is able to do that independent of this flag.)
> +
> +         This might degrade performance.
> +
> +         If you're using this for testing bugs / fuzzing and care about
> +         catching all the bugs WAY more than performance, you might want to
> +         also turn on CONFIG_RCU_STRICT_GRACE_PERIOD.
> +
> +         If unsure, say N.
> +
>  config PAGE_OWNER
>         bool "Track page owner"
>         depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
> diff --git a/mm/kasan/common.c b/mm/kasan/common.c
> index 256930da578a..b4a3504f9f5e 100644
> --- a/mm/kasan/common.c
> +++ b/mm/kasan/common.c
> @@ -191,6 +191,13 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
>         if (kasan_requires_meta())
>                 kasan_init_object_meta(cache, object);
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       if (cache->flags & SLAB_TYPESAFE_BY_RCU) {
> +               kasan_unpoison(object + cache->debug_rcu_head_offset,
> +                              sizeof(struct rcu_head), false);
> +       }
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
> +
>         /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
>         object = set_tag(object, assign_tag(cache, object, true));
>
> @@ -218,7 +225,8 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
>         }
>
>         /* RCU slabs could be legally used after free within the RCU period */
> -       if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
> +       if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU) &&
> +           !IS_ENABLED(CONFIG_SLUB_RCU_DEBUG))
>                 return false;
>
>         if (!kasan_byte_accessible(tagged_object)) {
> @@ -450,3 +458,8 @@ bool __kasan_check_byte(const void *address, unsigned long ip)
>         }
>         return true;
>  }
> +
> +size_t kasan_align(size_t size)
> +{
> +       return round_up(size, KASAN_GRANULE_SIZE);
> +}
> diff --git a/mm/slub.c b/mm/slub.c
> index e3b5d5c0eb3a..bae6c2bc1e5f 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1108,7 +1108,8 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
>   *     A. Free pointer (if we cannot overwrite object on free)
>   *     B. Tracking data for SLAB_STORE_USER
>   *     C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
> - *     D. Padding to reach required alignment boundary or at minimum
> + *     D. RCU head for CONFIG_SLUB_RCU_DEBUG (with padding around it)
> + *     E. Padding to reach required alignment boundary or at minimum
>   *             one word if debugging is on to be able to detect writes
>   *             before the word boundary.
>   *
> @@ -1134,6 +1135,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
>                         off += sizeof(unsigned int);
>         }
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       if (s->flags & SLAB_TYPESAFE_BY_RCU)
> +               off = kasan_align(s->debug_rcu_head_offset + sizeof(struct rcu_head));
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
> +
>         off += kasan_metadata_size(s, false);
>
>         if (size_from_object(s) == off)
> @@ -1751,12 +1757,17 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
>  #endif
>  #endif /* CONFIG_SLUB_DEBUG */
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
> +#endif
> +
>  /*
>   * Hooks for other subsystems that check memory allocations. In a typical
>   * production configuration these hooks all should produce no code at all.
>   */
>  static __always_inline bool slab_free_hook(struct kmem_cache *s,
> -                                               void *x, bool init)
> +                                               void *x, bool init,
> +                                               bool after_rcu_delay)
>  {
>         kmemleak_free_recursive(x, s->flags);
>         kmsan_slab_free(s, x);
> @@ -1766,8 +1777,18 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s,
>         if (!(s->flags & SLAB_DEBUG_OBJECTS))
>                 debug_check_no_obj_freed(x, s->object_size);
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       /* kfence does its own RCU delay */
> +       if ((s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay &&
> +           !is_kfence_address(x)) {
> +               call_rcu(kasan_reset_tag(x) + s->debug_rcu_head_offset,
> +                        slab_free_after_rcu_debug);
> +               return true;
> +       }
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
> +
>         /* Use KCSAN to help debug racy use-after-free. */
> -       if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
> +       if (!(s->flags & SLAB_TYPESAFE_BY_RCU) || after_rcu_delay)
>                 __kcsan_check_access(x, s->object_size,
>                                      KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
>
> @@ -1802,7 +1823,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
>         void *old_tail = *tail ? *tail : *head;
>
>         if (is_kfence_address(next)) {
> -               slab_free_hook(s, next, false);
> +               slab_free_hook(s, next, false, false);
>                 return true;
>         }
>
> @@ -1815,7 +1836,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
>                 next = get_freepointer(s, object);
>
>                 /* If object's reuse doesn't have to be delayed */
> -               if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
> +               if (!slab_free_hook(s, object, slab_want_init_on_free(s), false)) {
>                         /* Move object to the new freelist */
>                         set_freepointer(s, object, *head);
>                         *head = object;
> @@ -3802,6 +3823,31 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
>                 do_slab_free(s, slab, head, tail, cnt, addr);
>  }
>
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
> +{
> +       struct slab *slab = virt_to_slab(rcu_head);
> +       struct kmem_cache *s;
> +       void *object;
> +
> +       if (WARN_ON(is_kfence_address(rcu_head)))
> +               return;
> +
> +       /* find the object and the cache again */
> +       if (WARN_ON(!slab))
> +               return;
> +       s = slab->slab_cache;
> +       if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
> +               return;
> +       object = (void *)rcu_head - s->debug_rcu_head_offset;
> +
> +       /* resume freeing */
> +       if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
> +               return;
> +       do_slab_free(s, slab, object, NULL, 1, _THIS_IP_);
> +}
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
> +
>  #ifdef CONFIG_KASAN_GENERIC
>  void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
>  {
> @@ -4443,6 +4489,16 @@ static int calculate_sizes(struct kmem_cache *s)
>                 if (flags & SLAB_KMALLOC)
>                         size += sizeof(unsigned int);
>         }
> +
> +#ifdef CONFIG_SLUB_RCU_DEBUG
> +       if (flags & SLAB_TYPESAFE_BY_RCU) {
> +               size = kasan_align(size);
> +               size = ALIGN(size, __alignof__(struct rcu_head));
> +               s->debug_rcu_head_offset = size;
> +               size += sizeof(struct rcu_head);
> +               size = kasan_align(size);
> +       }
> +#endif /* CONFIG_SLUB_RCU_DEBUG */
>  #endif
>
>         kasan_cache_create(s, &size, &s->flags);
>
> base-commit: 4f9e7fabf8643003afefc172e62dd276686f016e
> --
> 2.42.0.rc1.204.g551eb34607-goog
>
> --
> You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to kasan-dev+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/20230825211426.3798691-1-jannh%40google.com.
Andrey Konovalov Sept. 16, 2023, 4:18 p.m. UTC | #6
On Mon, Aug 28, 2023 at 4:40 PM Jann Horn <jannh@google.com> wrote:
>
> > Can't we unpoision this rcu_head right before call_rcu() and repoison
> > after receiving the callback?
>
> Yeah, I think that should work. It looks like currently
> kasan_unpoison() is exposed in include/linux/kasan.h but
> kasan_poison() is not, and its inline definition probably means I
> can't just move it out of mm/kasan/kasan.h into include/linux/kasan.h;
> do you have a preference for how I should handle this? Hmm, and it
> also looks like code outside of mm/kasan/ anyway wouldn't know what
> are valid values for the "value" argument to kasan_poison().
> I also have another feature idea that would also benefit from having
> something like kasan_poison() available in include/linux/kasan.h, so I
> would prefer that over adding another special-case function inside
> KASAN for poisoning this piece of slab metadata...

This is a problem only for the Generic mode, right? You already call
kasan_reset_tag on the rcu_head, which should suppress the reporting
for the tag-based modes.

If so, would it be possible to reuse metadata_access_enable/disable?
They are used for accessing slub_debug metadata and seem to fit nicely
with this case as well.

I also second Macro's comment to add a test for the new functionality.

Thanks for working on this!
diff mbox series

Patch

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 819b6bc8ac08..45e07caf4704 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -229,6 +229,8 @@  static __always_inline bool kasan_check_byte(const void *addr)
 	return true;
 }
 
+size_t kasan_align(size_t size);
+
 #else /* CONFIG_KASAN */
 
 static inline void kasan_unpoison_range(const void *address, size_t size) {}
@@ -278,6 +280,10 @@  static inline bool kasan_check_byte(const void *address)
 {
 	return true;
 }
+static inline size_t kasan_align(size_t size)
+{
+	return size;
+}
 
 #endif /* CONFIG_KASAN */
 
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index deb90cf4bffb..b87be8fce64a 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -120,6 +120,9 @@  struct kmem_cache {
 	int refcount;		/* Refcount for slab cache destroy */
 	void (*ctor)(void *);
 	unsigned int inuse;		/* Offset to metadata */
+#ifdef CONFIG_SLUB_RCU_DEBUG
+	unsigned int debug_rcu_head_offset;
+#endif
 	unsigned int align;		/* Alignment */
 	unsigned int red_left_pad;	/* Left redzone padding size */
 	const char *name;	/* Name (only for display!) */
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index fdca89c05745..7ff7de96c6e4 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -79,6 +79,7 @@  config KASAN_GENERIC
 	depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC
 	depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS
 	select SLUB_DEBUG if SLUB
+	select SLUB_RCU_DEBUG if SLUB_DEBUG
 	select CONSTRUCTORS
 	help
 	  Enables Generic KASAN.
@@ -96,6 +97,7 @@  config KASAN_SW_TAGS
 	depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS
 	depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS
 	select SLUB_DEBUG if SLUB
+	select SLUB_RCU_DEBUG if SLUB_DEBUG
 	select CONSTRUCTORS
 	help
 	  Enables Software Tag-Based KASAN.
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 018a5bd2f576..99cce7f0fbef 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -78,6 +78,27 @@  config SLUB_DEBUG_ON
 	  off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
 	  "slub_debug=-".
 
+config SLUB_RCU_DEBUG
+	bool "Make use-after-free detection possible in TYPESAFE_BY_RCU caches"
+	depends on SLUB && SLUB_DEBUG
+	default n
+	help
+	  Make SLAB_TYPESAFE_BY_RCU caches behave approximately as if the cache
+	  was not marked as SLAB_TYPESAFE_BY_RCU and every caller used
+	  kfree_rcu() instead.
+
+	  This is intended for use in combination with KASAN, to enable KASAN to
+	  detect use-after-free accesses in such caches.
+	  (KFENCE is able to do that independent of this flag.)
+
+	  This might degrade performance.
+
+	  If you're using this for testing bugs / fuzzing and care about
+	  catching all the bugs WAY more than performance, you might want to
+	  also turn on CONFIG_RCU_STRICT_GRACE_PERIOD.
+
+	  If unsure, say N.
+
 config PAGE_OWNER
 	bool "Track page owner"
 	depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 256930da578a..b4a3504f9f5e 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -191,6 +191,13 @@  void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
 	if (kasan_requires_meta())
 		kasan_init_object_meta(cache, object);
 
+#ifdef CONFIG_SLUB_RCU_DEBUG
+	if (cache->flags & SLAB_TYPESAFE_BY_RCU) {
+		kasan_unpoison(object + cache->debug_rcu_head_offset,
+			       sizeof(struct rcu_head), false);
+	}
+#endif /* CONFIG_SLUB_RCU_DEBUG */
+
 	/* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
 	object = set_tag(object, assign_tag(cache, object, true));
 
@@ -218,7 +225,8 @@  static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
 	}
 
 	/* RCU slabs could be legally used after free within the RCU period */
-	if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
+	if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU) &&
+	    !IS_ENABLED(CONFIG_SLUB_RCU_DEBUG))
 		return false;
 
 	if (!kasan_byte_accessible(tagged_object)) {
@@ -450,3 +458,8 @@  bool __kasan_check_byte(const void *address, unsigned long ip)
 	}
 	return true;
 }
+
+size_t kasan_align(size_t size)
+{
+	return round_up(size, KASAN_GRANULE_SIZE);
+}
diff --git a/mm/slub.c b/mm/slub.c
index e3b5d5c0eb3a..bae6c2bc1e5f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1108,7 +1108,8 @@  static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
  * 	A. Free pointer (if we cannot overwrite object on free)
  * 	B. Tracking data for SLAB_STORE_USER
  *	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
- *	D. Padding to reach required alignment boundary or at minimum
+ *	D. RCU head for CONFIG_SLUB_RCU_DEBUG (with padding around it)
+ *	E. Padding to reach required alignment boundary or at minimum
  * 		one word if debugging is on to be able to detect writes
  * 		before the word boundary.
  *
@@ -1134,6 +1135,11 @@  static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
 			off += sizeof(unsigned int);
 	}
 
+#ifdef CONFIG_SLUB_RCU_DEBUG
+	if (s->flags & SLAB_TYPESAFE_BY_RCU)
+		off = kasan_align(s->debug_rcu_head_offset + sizeof(struct rcu_head));
+#endif /* CONFIG_SLUB_RCU_DEBUG */
+
 	off += kasan_metadata_size(s, false);
 
 	if (size_from_object(s) == off)
@@ -1751,12 +1757,17 @@  static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
 #endif
 #endif /* CONFIG_SLUB_DEBUG */
 
+#ifdef CONFIG_SLUB_RCU_DEBUG
+static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
+#endif
+
 /*
  * Hooks for other subsystems that check memory allocations. In a typical
  * production configuration these hooks all should produce no code at all.
  */
 static __always_inline bool slab_free_hook(struct kmem_cache *s,
-						void *x, bool init)
+						void *x, bool init,
+						bool after_rcu_delay)
 {
 	kmemleak_free_recursive(x, s->flags);
 	kmsan_slab_free(s, x);
@@ -1766,8 +1777,18 @@  static __always_inline bool slab_free_hook(struct kmem_cache *s,
 	if (!(s->flags & SLAB_DEBUG_OBJECTS))
 		debug_check_no_obj_freed(x, s->object_size);
 
+#ifdef CONFIG_SLUB_RCU_DEBUG
+	/* kfence does its own RCU delay */
+	if ((s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay &&
+	    !is_kfence_address(x)) {
+		call_rcu(kasan_reset_tag(x) + s->debug_rcu_head_offset,
+			 slab_free_after_rcu_debug);
+		return true;
+	}
+#endif /* CONFIG_SLUB_RCU_DEBUG */
+
 	/* Use KCSAN to help debug racy use-after-free. */
-	if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
+	if (!(s->flags & SLAB_TYPESAFE_BY_RCU) || after_rcu_delay)
 		__kcsan_check_access(x, s->object_size,
 				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
 
@@ -1802,7 +1823,7 @@  static inline bool slab_free_freelist_hook(struct kmem_cache *s,
 	void *old_tail = *tail ? *tail : *head;
 
 	if (is_kfence_address(next)) {
-		slab_free_hook(s, next, false);
+		slab_free_hook(s, next, false, false);
 		return true;
 	}
 
@@ -1815,7 +1836,7 @@  static inline bool slab_free_freelist_hook(struct kmem_cache *s,
 		next = get_freepointer(s, object);
 
 		/* If object's reuse doesn't have to be delayed */
-		if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
+		if (!slab_free_hook(s, object, slab_want_init_on_free(s), false)) {
 			/* Move object to the new freelist */
 			set_freepointer(s, object, *head);
 			*head = object;
@@ -3802,6 +3823,31 @@  static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
 		do_slab_free(s, slab, head, tail, cnt, addr);
 }
 
+#ifdef CONFIG_SLUB_RCU_DEBUG
+static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
+{
+	struct slab *slab = virt_to_slab(rcu_head);
+	struct kmem_cache *s;
+	void *object;
+
+	if (WARN_ON(is_kfence_address(rcu_head)))
+		return;
+
+	/* find the object and the cache again */
+	if (WARN_ON(!slab))
+		return;
+	s = slab->slab_cache;
+	if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
+		return;
+	object = (void *)rcu_head - s->debug_rcu_head_offset;
+
+	/* resume freeing */
+	if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
+		return;
+	do_slab_free(s, slab, object, NULL, 1, _THIS_IP_);
+}
+#endif /* CONFIG_SLUB_RCU_DEBUG */
+
 #ifdef CONFIG_KASAN_GENERIC
 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
 {
@@ -4443,6 +4489,16 @@  static int calculate_sizes(struct kmem_cache *s)
 		if (flags & SLAB_KMALLOC)
 			size += sizeof(unsigned int);
 	}
+
+#ifdef CONFIG_SLUB_RCU_DEBUG
+	if (flags & SLAB_TYPESAFE_BY_RCU) {
+		size = kasan_align(size);
+		size = ALIGN(size, __alignof__(struct rcu_head));
+		s->debug_rcu_head_offset = size;
+		size += sizeof(struct rcu_head);
+		size = kasan_align(size);
+	}
+#endif /* CONFIG_SLUB_RCU_DEBUG */
 #endif
 
 	kasan_cache_create(s, &size, &s->flags);