diff mbox

[09/38] usercopy: Mark kmalloc caches as usercopy caches

Message ID 1515636190-24061-10-git-send-email-keescook@chromium.org (mailing list archive)
State Accepted, archived
Headers show

Commit Message

Kees Cook Jan. 11, 2018, 2:02 a.m. UTC
From: David Windsor <dave@nullcore.net>

Mark the kmalloc slab caches as entirely whitelisted. These caches
are frequently used to fulfill kernel allocations that contain data
to be copied to/from userspace. Internal-only uses are also common,
but are scattered in the kernel. For now, mark all the kmalloc caches
as whitelisted.

This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
whitelisting code in the last public patch of grsecurity/PaX based on my
understanding of the code. Changes or omissions from the original code are
mine and don't reflect the original grsecurity/PaX code.

Signed-off-by: David Windsor <dave@nullcore.net>
[kees: merged in moved kmalloc hunks, adjust commit log]
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Cc: linux-xfs@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Christoph Lameter <cl@linux.com>
---
 mm/slab.c        |  3 ++-
 mm/slab.h        |  3 ++-
 mm/slab_common.c | 10 ++++++----
 3 files changed, 10 insertions(+), 6 deletions(-)

Comments

Jiri Slaby Nov. 12, 2019, 7:17 a.m. UTC | #1
On 11. 01. 18, 3:02, Kees Cook wrote:
> From: David Windsor <dave@nullcore.net>
> 
> Mark the kmalloc slab caches as entirely whitelisted. These caches
> are frequently used to fulfill kernel allocations that contain data
> to be copied to/from userspace. Internal-only uses are also common,
> but are scattered in the kernel. For now, mark all the kmalloc caches
> as whitelisted.
> 
> This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
> whitelisting code in the last public patch of grsecurity/PaX based on my
> understanding of the code. Changes or omissions from the original code are
> mine and don't reflect the original grsecurity/PaX code.
> 
> Signed-off-by: David Windsor <dave@nullcore.net>
> [kees: merged in moved kmalloc hunks, adjust commit log]
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: linux-mm@kvack.org
> Cc: linux-xfs@vger.kernel.org
> Signed-off-by: Kees Cook <keescook@chromium.org>
> Acked-by: Christoph Lameter <cl@linux.com>
> ---
>  mm/slab.c        |  3 ++-
>  mm/slab.h        |  3 ++-
>  mm/slab_common.c | 10 ++++++----
>  3 files changed, 10 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/slab.c b/mm/slab.c
> index b9b0df620bb9..dd367fe17a4e 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
...
> @@ -1098,7 +1099,8 @@ void __init setup_kmalloc_cache_index_table(void)
>  static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
>  {
>  	kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
> -					kmalloc_info[idx].size, flags);
> +					kmalloc_info[idx].size, flags, 0,
> +					kmalloc_info[idx].size);
>  }
>  
>  /*
> @@ -1139,7 +1141,7 @@ void __init create_kmalloc_caches(slab_flags_t flags)
>  
>  			BUG_ON(!n);
>  			kmalloc_dma_caches[i] = create_kmalloc_cache(n,
> -				size, SLAB_CACHE_DMA | flags);
> +				size, SLAB_CACHE_DMA | flags, 0, 0);

Hi,

was there any (undocumented) reason NOT to mark DMA caches as usercopy?

We are seeing this on s390x:

> usercopy: Kernel memory overwrite attempt detected to SLUB object
'dma-kmalloc-1k' (offset 0, size 11)!
> ------------[ cut here ]------------
> kernel BUG at mm/usercopy.c:99!

See:
https://bugzilla.suse.com/show_bug.cgi?id=1156053

This indeed fixes it:
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1290,7 +1290,8 @@ void __init create_kmalloc_caches(slab_flags_t flags)
                        kmalloc_caches[KMALLOC_DMA][i] =
create_kmalloc_cache(
                                kmalloc_info[i].name[KMALLOC_DMA],
                                kmalloc_info[i].size,
-                               SLAB_CACHE_DMA | flags, 0, 0);
+                               SLAB_CACHE_DMA | flags, 0,
+                               kmalloc_info[i].size);
                }
        }
 #endif


thanks,
Kees Cook Nov. 12, 2019, 9:21 p.m. UTC | #2
On Tue, Nov 12, 2019 at 08:17:57AM +0100, Jiri Slaby wrote:
> On 11. 01. 18, 3:02, Kees Cook wrote:
> > From: David Windsor <dave@nullcore.net>
> > 
> > Mark the kmalloc slab caches as entirely whitelisted. These caches
> > are frequently used to fulfill kernel allocations that contain data
> > to be copied to/from userspace. Internal-only uses are also common,
> > but are scattered in the kernel. For now, mark all the kmalloc caches
> > as whitelisted.
> > 
> > This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
> > whitelisting code in the last public patch of grsecurity/PaX based on my
> > understanding of the code. Changes or omissions from the original code are
> > mine and don't reflect the original grsecurity/PaX code.
> > 
> > Signed-off-by: David Windsor <dave@nullcore.net>
> > [kees: merged in moved kmalloc hunks, adjust commit log]
> > Cc: Pekka Enberg <penberg@kernel.org>
> > Cc: David Rientjes <rientjes@google.com>
> > Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: linux-mm@kvack.org
> > Cc: linux-xfs@vger.kernel.org
> > Signed-off-by: Kees Cook <keescook@chromium.org>
> > Acked-by: Christoph Lameter <cl@linux.com>
> > ---
> >  mm/slab.c        |  3 ++-
> >  mm/slab.h        |  3 ++-
> >  mm/slab_common.c | 10 ++++++----
> >  3 files changed, 10 insertions(+), 6 deletions(-)
> > 
> > diff --git a/mm/slab.c b/mm/slab.c
> > index b9b0df620bb9..dd367fe17a4e 100644
> > --- a/mm/slab.c
> > +++ b/mm/slab.c
> ...
> > @@ -1098,7 +1099,8 @@ void __init setup_kmalloc_cache_index_table(void)
> >  static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
> >  {
> >  	kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
> > -					kmalloc_info[idx].size, flags);
> > +					kmalloc_info[idx].size, flags, 0,
> > +					kmalloc_info[idx].size);
> >  }
> >  
> >  /*
> > @@ -1139,7 +1141,7 @@ void __init create_kmalloc_caches(slab_flags_t flags)
> >  
> >  			BUG_ON(!n);
> >  			kmalloc_dma_caches[i] = create_kmalloc_cache(n,
> > -				size, SLAB_CACHE_DMA | flags);
> > +				size, SLAB_CACHE_DMA | flags, 0, 0);
> 
> Hi,
> 
> was there any (undocumented) reason NOT to mark DMA caches as usercopy?
> 
> We are seeing this on s390x:
> 
> > usercopy: Kernel memory overwrite attempt detected to SLUB object
> 'dma-kmalloc-1k' (offset 0, size 11)!
> > ------------[ cut here ]------------
> > kernel BUG at mm/usercopy.c:99!

Interesting! I believe the rationale was that if the region is used for
DMA, allowing direct access to it from userspace could be prone to
races.

> See:
> https://bugzilla.suse.com/show_bug.cgi?id=1156053

For context from the bug, the trace is:

(<0000000000386c5a> usercopy_abort+0xa2/0xa8) 
 <000000000036097a> __check_heap_object+0x11a/0x120  
 <0000000000386b3a> __check_object_size+0x18a/0x208  
 <000000000079b4ba> skb_copy_datagram_from_iter+0x62/0x240  
 <000003ff804edd5c> iucv_sock_sendmsg+0x1fc/0x858 Ýaf_iucv¨  
 <0000000000785894> sock_sendmsg+0x54/0x90  
 <0000000000785944> sock_write_iter+0x74/0xa0  
 <000000000038a3f0> new_sync_write+0x110/0x180  
 <000000000038d42e> vfs_write+0xa6/0x1d0  
 <000000000038d748> ksys_write+0x60/0xe8  
 <000000000096a660> system_call+0xdc/0x2e0  

I know Al worked on fixing up usercopy checking for iters. I wonder if
there is redundant checking happening here? i.e. haven't iters already
done object size verifications, so they're not needed during iter copy
helpers?

> This indeed fixes it:
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1290,7 +1290,8 @@ void __init create_kmalloc_caches(slab_flags_t flags)
>                         kmalloc_caches[KMALLOC_DMA][i] =
> create_kmalloc_cache(
>                                 kmalloc_info[i].name[KMALLOC_DMA],
>                                 kmalloc_info[i].size,
> -                               SLAB_CACHE_DMA | flags, 0, 0);
> +                               SLAB_CACHE_DMA | flags, 0,
> +                               kmalloc_info[i].size);
>                 }
>         }
>  #endif

How is iucv the only network protocol that has run into this? Do others
use a bounce buffer?
Kees Cook Nov. 14, 2019, 9:27 p.m. UTC | #3
On Tue, Nov 12, 2019 at 01:21:54PM -0800, Kees Cook wrote:
> How is iucv the only network protocol that has run into this? Do others
> use a bounce buffer?

Another solution would be to use a dedicated kmem cache (instead of the
shared kmalloc dma one)?
Jiri Slaby Jan. 23, 2020, 8:14 a.m. UTC | #4
On 14. 11. 19, 22:27, Kees Cook wrote:
> On Tue, Nov 12, 2019 at 01:21:54PM -0800, Kees Cook wrote:
>> How is iucv the only network protocol that has run into this? Do others
>> use a bounce buffer?
> 
> Another solution would be to use a dedicated kmem cache (instead of the
> shared kmalloc dma one)?

Has there been any conclusion to this thread yet? For the time being, we
disabled HARDENED_USERCOPY on s390...

https://lore.kernel.org/kernel-hardening/9519edb7-456a-a2fa-659e-3e5a1ff89466@suse.cz/

thanks,
Kees Cook Jan. 27, 2020, 11:19 p.m. UTC | #5
On Thu, Jan 23, 2020 at 09:14:20AM +0100, Jiri Slaby wrote:
> On 14. 11. 19, 22:27, Kees Cook wrote:
> > On Tue, Nov 12, 2019 at 01:21:54PM -0800, Kees Cook wrote:
> >> How is iucv the only network protocol that has run into this? Do others
> >> use a bounce buffer?
> > 
> > Another solution would be to use a dedicated kmem cache (instead of the
> > shared kmalloc dma one)?
> 
> Has there been any conclusion to this thread yet? For the time being, we
> disabled HARDENED_USERCOPY on s390...
> 
> https://lore.kernel.org/kernel-hardening/9519edb7-456a-a2fa-659e-3e5a1ff89466@suse.cz/

I haven't heard anything new. What did people think of a separate kmem
cache?
Christian Borntraeger Jan. 28, 2020, 7:58 a.m. UTC | #6
On 28.01.20 00:19, Kees Cook wrote:
> On Thu, Jan 23, 2020 at 09:14:20AM +0100, Jiri Slaby wrote:
>> On 14. 11. 19, 22:27, Kees Cook wrote:
>>> On Tue, Nov 12, 2019 at 01:21:54PM -0800, Kees Cook wrote:
>>>> How is iucv the only network protocol that has run into this? Do others
>>>> use a bounce buffer?
>>>
>>> Another solution would be to use a dedicated kmem cache (instead of the
>>> shared kmalloc dma one)?
>>
>> Has there been any conclusion to this thread yet? For the time being, we
>> disabled HARDENED_USERCOPY on s390...
>>
>> https://lore.kernel.org/kernel-hardening/9519edb7-456a-a2fa-659e-3e5a1ff89466@suse.cz/
> 
> I haven't heard anything new. What did people think of a separate kmem
> cache?
> 

Adding Julian and Ursula. A separate kmem cache for iucv might be indeed
a solution for the user hardening issue.
On the other hand not marking the DMA caches still seems questionable.

For reference
https://bugzilla.suse.com/show_bug.cgi?id=1156053
the kernel hardening now triggers a warning.
Kees Cook Jan. 28, 2020, 11:01 p.m. UTC | #7
On Tue, Jan 28, 2020 at 08:58:31AM +0100, Christian Borntraeger wrote:
> 
> 
> On 28.01.20 00:19, Kees Cook wrote:
> > On Thu, Jan 23, 2020 at 09:14:20AM +0100, Jiri Slaby wrote:
> >> On 14. 11. 19, 22:27, Kees Cook wrote:
> >>> On Tue, Nov 12, 2019 at 01:21:54PM -0800, Kees Cook wrote:
> >>>> How is iucv the only network protocol that has run into this? Do others
> >>>> use a bounce buffer?
> >>>
> >>> Another solution would be to use a dedicated kmem cache (instead of the
> >>> shared kmalloc dma one)?
> >>
> >> Has there been any conclusion to this thread yet? For the time being, we
> >> disabled HARDENED_USERCOPY on s390...
> >>
> >> https://lore.kernel.org/kernel-hardening/9519edb7-456a-a2fa-659e-3e5a1ff89466@suse.cz/
> > 
> > I haven't heard anything new. What did people think of a separate kmem
> > cache?
> > 
> 
> Adding Julian and Ursula. A separate kmem cache for iucv might be indeed
> a solution for the user hardening issue.

It should be very clean -- any existing kmallocs already have to be
"special" in the sense that they're marked with the DMA flag. So
converting these to a separate cache should be mostly mechanical.

> On the other hand not marking the DMA caches still seems questionable.

My understanding is that exposing DMA memory to userspace copies can
lead to unexpected results, especially for misbehaving hardware, so I'm
not convinced this is a generically bad hardening choice.

-Kees

> 
> For reference
> https://bugzilla.suse.com/show_bug.cgi?id=1156053
> the kernel hardening now triggers a warning.
>
Ursula Braun Jan. 29, 2020, 9:26 a.m. UTC | #8
On 1/29/20 12:01 AM, Kees Cook wrote:
> On Tue, Jan 28, 2020 at 08:58:31AM +0100, Christian Borntraeger wrote:
>>
>>
>> On 28.01.20 00:19, Kees Cook wrote:
>>> On Thu, Jan 23, 2020 at 09:14:20AM +0100, Jiri Slaby wrote:
>>>> On 14. 11. 19, 22:27, Kees Cook wrote:
>>>>> On Tue, Nov 12, 2019 at 01:21:54PM -0800, Kees Cook wrote:
>>>>>> How is iucv the only network protocol that has run into this? Do others
>>>>>> use a bounce buffer?
>>>>>
>>>>> Another solution would be to use a dedicated kmem cache (instead of the
>>>>> shared kmalloc dma one)?
>>>>
>>>> Has there been any conclusion to this thread yet? For the time being, we
>>>> disabled HARDENED_USERCOPY on s390...
>>>>
>>>> https://lore.kernel.org/kernel-hardening/9519edb7-456a-a2fa-659e-3e5a1ff89466@suse.cz/
>>>
>>> I haven't heard anything new. What did people think of a separate kmem
>>> cache?
>>>
>>
>> Adding Julian and Ursula. A separate kmem cache for iucv might be indeed
>> a solution for the user hardening issue.
> 
> It should be very clean -- any existing kmallocs already have to be
> "special" in the sense that they're marked with the DMA flag. So
> converting these to a separate cache should be mostly mechanical.
> 

Linux on System z can run within a guest hosted by the IBM mainframe operating system
z/VM. z/VM offers a transport called Inter-User Communications Vehicle (short IUCV).
It is limited to 4-byte-addresses when sending and receiving data.
One base transport for AF_IUCV sockets in the Linux kernel is this Inter-User
Communications Vehicle of z/VM. AF_IUCV sockets exist for s390 only. 

AF_IUCV sockets make use of the base socket layer, and work with sk_buffs for sending
and receiving data of variable length.
Storage for sk_buffs is allocated with __alloc_skb(), which invokes
   data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
For IUCV transport the "data"-address should fit into 4 bytes. That's the reason why
we work with GFP_DMA here.

kmem_caches manage memory of fixed size. This does not fit well for sk_buff memory
of variable length. Do you propose to add a kmem_cache solution for sk_buff memory here?

>> On the other hand not marking the DMA caches still seems questionable.
> 
> My understanding is that exposing DMA memory to userspace copies can
> lead to unexpected results, especially for misbehaving hardware, so I'm
> not convinced this is a generically bad hardening choice.
> 

We have not yet been reported a memory problem here. Do you have more details, if
this is really a problem for the s390 architecture?

Kind regards, Ursula 

> -Kees
> 
>>
>> For reference
>> https://bugzilla.suse.com/show_bug.cgi?id=1156053
>> the kernel hardening now triggers a warning.
>>
>
Christoph Lameter (Ampere) Jan. 29, 2020, 4:43 p.m. UTC | #9
On Tue, 28 Jan 2020, Kees Cook wrote:

> > On the other hand not marking the DMA caches still seems questionable.
>
> My understanding is that exposing DMA memory to userspace copies can
> lead to unexpected results, especially for misbehaving hardware, so I'm
> not convinced this is a generically bad hardening choice.

"DMA" memory (and thus DMA caches) have nothing to do with DMA. Its a
legacy term. "DMA Memory" is memory limited to a certain
physical address boundary (old restrictions on certain devices only
supporting a limited number of address bits).

DMA can be done to NORMAL memory as well.
Christian Borntraeger Jan. 29, 2020, 5:07 p.m. UTC | #10
On 29.01.20 17:43, Christopher Lameter wrote:
> On Tue, 28 Jan 2020, Kees Cook wrote:
> 
>>> On the other hand not marking the DMA caches still seems questionable.
>>
>> My understanding is that exposing DMA memory to userspace copies can
>> lead to unexpected results, especially for misbehaving hardware, so I'm
>> not convinced this is a generically bad hardening choice.
> 
> "DMA" memory (and thus DMA caches) have nothing to do with DMA. Its a
> legacy term. "DMA Memory" is memory limited to a certain
> physical address boundary (old restrictions on certain devices only
> supporting a limited number of address bits).
> 
> DMA can be done to NORMAL memory as well.

Exactly. 
I think iucv uses GFP_DMA because z/VM needs those buffers to reside below 2GB (which is ZONA_DMA for s390).
Christoph Hellwig Jan. 29, 2020, 5:09 p.m. UTC | #11
On Wed, Jan 29, 2020 at 06:07:14PM +0100, Christian Borntraeger wrote:
> > DMA can be done to NORMAL memory as well.
> 
> Exactly. 
> I think iucv uses GFP_DMA because z/VM needs those buffers to reside below 2GB (which is ZONA_DMA for s390).

The normal way to allocate memory with addressing limits would be to
use dma_alloc_coherent and friends.  Any chance to switch iucv over to
that?  Or is there no device associated with it?
Christian Borntraeger Jan. 29, 2020, 5:19 p.m. UTC | #12
On 29.01.20 18:09, Christoph Hellwig wrote:
> On Wed, Jan 29, 2020 at 06:07:14PM +0100, Christian Borntraeger wrote:
>>> DMA can be done to NORMAL memory as well.
>>
>> Exactly. 
>> I think iucv uses GFP_DMA because z/VM needs those buffers to reside below 2GB (which is ZONA_DMA for s390).
> 
> The normal way to allocate memory with addressing limits would be to
> use dma_alloc_coherent and friends.  Any chance to switch iucv over to
> that?  Or is there no device associated with it?

There is not necessarily a device for that. It is a hypervisor interface (an
instruction that is interpreted by z/VM). We do have the netiucv driver that
creates a virtual nic, but there is also AF_IUCV which works without a device.

But back to the original question: If we mark kmalloc caches as usercopy caches,
we should do the same for DMA kmalloc caches. As outlined by Christoph, this has
nothing to do with device DMA.
Kees Cook Jan. 30, 2020, 7:23 p.m. UTC | #13
On Wed, Jan 29, 2020 at 06:19:56PM +0100, Christian Borntraeger wrote:
> On 29.01.20 18:09, Christoph Hellwig wrote:
> > On Wed, Jan 29, 2020 at 06:07:14PM +0100, Christian Borntraeger wrote:
> >>> DMA can be done to NORMAL memory as well.
> >>
> >> Exactly. 
> >> I think iucv uses GFP_DMA because z/VM needs those buffers to reside below 2GB (which is ZONA_DMA for s390).
> > 
> > The normal way to allocate memory with addressing limits would be to
> > use dma_alloc_coherent and friends.  Any chance to switch iucv over to
> > that?  Or is there no device associated with it?
> 
> There is not necessarily a device for that. It is a hypervisor interface (an
> instruction that is interpreted by z/VM). We do have the netiucv driver that
> creates a virtual nic, but there is also AF_IUCV which works without a device.
> 
> But back to the original question: If we mark kmalloc caches as usercopy caches,
> we should do the same for DMA kmalloc caches. As outlined by Christoph, this has
> nothing to do with device DMA.

Hm, looks like it's allocated from the low 16MB. Seems like poor naming!
:) There seems to be a LOT of stuff using GFP_DMA, and it seems unlikely
those are all expecting low addresses?

Since this has only been a problem on s390, should just s390 gain the
weakening of the usercopy restriction?  Something like:


diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1907cb2903c7..c5bbc141f20b 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1303,7 +1303,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
 			kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
 				kmalloc_info[i].name[KMALLOC_DMA],
 				kmalloc_info[i].size,
-				SLAB_CACHE_DMA | flags, 0, 0);
+				SLAB_CACHE_DMA | flags, 0,
+				IS_ENABLED(CONFIG_S390) ?
+					kmalloc_info[i].size : 0);
 		}
 	}
 #endif
Jann Horn Jan. 31, 2020, 12:03 p.m. UTC | #14
On Thu, Jan 30, 2020 at 8:23 PM Kees Cook <keescook@chromium.org> wrote:
> On Wed, Jan 29, 2020 at 06:19:56PM +0100, Christian Borntraeger wrote:
> > On 29.01.20 18:09, Christoph Hellwig wrote:
> > > On Wed, Jan 29, 2020 at 06:07:14PM +0100, Christian Borntraeger wrote:
> > >>> DMA can be done to NORMAL memory as well.
> > >>
> > >> Exactly.
> > >> I think iucv uses GFP_DMA because z/VM needs those buffers to reside below 2GB (which is ZONA_DMA for s390).
> > >
> > > The normal way to allocate memory with addressing limits would be to
> > > use dma_alloc_coherent and friends.  Any chance to switch iucv over to
> > > that?  Or is there no device associated with it?
> >
> > There is not necessarily a device for that. It is a hypervisor interface (an
> > instruction that is interpreted by z/VM). We do have the netiucv driver that
> > creates a virtual nic, but there is also AF_IUCV which works without a device.
> >
> > But back to the original question: If we mark kmalloc caches as usercopy caches,
> > we should do the same for DMA kmalloc caches. As outlined by Christoph, this has
> > nothing to do with device DMA.
>
> Hm, looks like it's allocated from the low 16MB. Seems like poor naming!

The physical address limit of the DMA zone depends on the architecture
(and the kernel version); e.g. on Linux 4.4 on arm64 (which is used on
the Pixel 2), the DMA zone goes up to 4GiB. Later, arm64 started using
the DMA32 zone for that instead (as was already the case on e.g.
X86-64); but recently (commit 1a8e1cef7603), arm64 started using the
DMA zone again, but now for up to 1GiB. (AFAICS the DMA32 zone can't
be used with kmalloc at all, that only works with the DMA zone.)

> :) There seems to be a LOT of stuff using GFP_DMA, and it seems unlikely
> those are all expecting low addresses?

I think there's a bunch of (especially really old) hardware where the
hardware can only talk to low physical addresses, e.g. stuff that uses
the ISA bus.

However, there aren't *that* many users of GFP_DMA that actually cause
kmalloc allocations with GFP_DMA - many of the users of GFP_DMA
actually just pass that flag to dma_alloc_coherent()/dma_pool_alloc(),
where it is filtered away and the allocation ultimately doesn't go
through the slab allocator AFAICS, or they pass it directly to the
page allocator, where it has no effect on the usercopy stuff. Looking
on my workstation, there are zero objects allocated in dma-kmalloc-*
slabs:

/sys/kernel/slab# for name in dma-kmalloc-*; do echo "$name: $(cat
$name/objects)"; done
dma-kmalloc-128: 0
dma-kmalloc-16: 0
dma-kmalloc-192: 0
dma-kmalloc-1k: 0
dma-kmalloc-256: 0
dma-kmalloc-2k: 0
dma-kmalloc-32: 0
dma-kmalloc-4k: 0
dma-kmalloc-512: 0
dma-kmalloc-64: 0
dma-kmalloc-8: 0
dma-kmalloc-8k: 0
dma-kmalloc-96: 0

On a Pixel 2, there are a whole five objects allocated across the DMA
zone kmalloc caches:

walleye:/sys/kernel/slab # for name in dma-kmalloc-*; do echo "$name:
$(cat $name/objects)"; done
dma-kmalloc-1024: 0
dma-kmalloc-128: 0
dma-kmalloc-2048: 2
dma-kmalloc-256: 0
dma-kmalloc-4096: 3
dma-kmalloc-512: 0
dma-kmalloc-8192: 0

> Since this has only been a problem on s390, should just s390 gain the
> weakening of the usercopy restriction?  Something like:
>
>
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 1907cb2903c7..c5bbc141f20b 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1303,7 +1303,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
>                         kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
>                                 kmalloc_info[i].name[KMALLOC_DMA],
>                                 kmalloc_info[i].size,
> -                               SLAB_CACHE_DMA | flags, 0, 0);
> +                               SLAB_CACHE_DMA | flags, 0,
> +                               IS_ENABLED(CONFIG_S390) ?
> +                                       kmalloc_info[i].size : 0);
>                 }
>         }
>  #endif

I think dma-kmalloc slabs should be handled the same way as normal
kmalloc slabs. When a dma-kmalloc allocation is freshly created, it is
just normal kernel memory - even if it might later be used for DMA -,
and it should be perfectly fine to copy_from_user() into such
allocations at that point, and to copy_to_user() out of them at the
end. If you look at the places where such allocations are created, you
can see things like kmemdup(), memcpy() and so on - all normal
operations that shouldn't conceptually be different from usercopy in
any relevant way.
Kees Cook Feb. 1, 2020, 5:56 p.m. UTC | #15
On Fri, Jan 31, 2020 at 01:03:40PM +0100, Jann Horn wrote:
> I think dma-kmalloc slabs should be handled the same way as normal
> kmalloc slabs. When a dma-kmalloc allocation is freshly created, it is
> just normal kernel memory - even if it might later be used for DMA -,
> and it should be perfectly fine to copy_from_user() into such
> allocations at that point, and to copy_to_user() out of them at the
> end. If you look at the places where such allocations are created, you
> can see things like kmemdup(), memcpy() and so on - all normal
> operations that shouldn't conceptually be different from usercopy in
> any relevant way.

I can't find where the address limit for dma-kmalloc is implemented.

As to whitelisting all of dma-kmalloc -- I guess I can be talked into
it. It still seems like the memory used for direct hardware
communication shouldn't be exposed to userspace, but it we're dealing
with packet data, etc, then it makes sense not to have to have bounce
buffers, etc.
Jann Horn Feb. 1, 2020, 7:27 p.m. UTC | #16
[pruned bogus addresses from recipient list]

On Sat, Feb 1, 2020 at 6:56 PM Kees Cook <keescook@chromium.org> wrote:
> On Fri, Jan 31, 2020 at 01:03:40PM +0100, Jann Horn wrote:
> > I think dma-kmalloc slabs should be handled the same way as normal
> > kmalloc slabs. When a dma-kmalloc allocation is freshly created, it is
> > just normal kernel memory - even if it might later be used for DMA -,
> > and it should be perfectly fine to copy_from_user() into such
> > allocations at that point, and to copy_to_user() out of them at the
> > end. If you look at the places where such allocations are created, you
> > can see things like kmemdup(), memcpy() and so on - all normal
> > operations that shouldn't conceptually be different from usercopy in
> > any relevant way.
>
> I can't find where the address limit for dma-kmalloc is implemented.

dma-kmalloc is a slab that uses GFP_DMA pages.

Things have changed a bit through the kernel versions, but in current
mainline, the zone limit for GFP_DMA is reported from arch code to
generic code via zone_dma_bits, from where it is used to decide which
zones should be used for allocations based on the address limit of a
given device:

kernel/dma/direct.c:
/*
 * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it
 * it for entirely different regions. In that case the arch code needs to
 * override the variable below for dma-direct to work properly.
 */
unsigned int zone_dma_bits __ro_after_init = 24;
[...]
static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
                u64 *phys_limit)
{
[...]
        /*
         * Optimistically try the zone that the physical address mask falls
         * into first.  If that returns memory that isn't actually addressable
         * we will fallback to the next lower zone and try again.
         *
         * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding
         * zones.
         */
        if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
                return GFP_DMA;
        if (*phys_limit <= DMA_BIT_MASK(32))
                return GFP_DMA32;
        return 0;
}


There are only a few architectures that override the limit:

powerpc:
        /*
         * Allow 30-bit DMA for very limited Broadcom wifi chips on many
         * powerbooks.
         */
        if (IS_ENABLED(CONFIG_PPC32))
                zone_dma_bits = 30;
        else
                zone_dma_bits = 31;

s390:
        zone_dma_bits = 31;

and arm64:
#define ARM64_ZONE_DMA_BITS     30
[...]
        if (IS_ENABLED(CONFIG_ZONE_DMA)) {
                zone_dma_bits = ARM64_ZONE_DMA_BITS;
                arm64_dma_phys_limit = max_zone_phys(ARM64_ZONE_DMA_BITS);
        }


The actual categorization of page ranges into zones happens via
free_area_init_nodes() or free_area_init_node(); these are provided
with arrays of maximum physical addresses or zone sizes (depending on
which of them is called) by arch-specific code.
For arm64, the caller is zone_sizes_init(). X86 does it in zone_sizes_init().

> As to whitelisting all of dma-kmalloc -- I guess I can be talked into
> it. It still seems like the memory used for direct hardware
> communication shouldn't be exposed to userspace, but it we're dealing
> with packet data, etc, then it makes sense not to have to have bounce
> buffers, etc.

FWIW, as far as I understand, usercopy doesn't actually have any
effect on drivers that use the modern, proper APIs, since those don't
use the slab allocator at all - as I pointed out in my last mail, the
dma-kmalloc* slabs are used very rarely. (Which is good, because
putting objects from less-than-page-size slabs into iommu entries is a
terrible idea from a security and reliability perspective because it
gives the hardware access to completely unrelated memory.) Instead,
they get pages from the page allocator, and these pages may e.g. be
allocated from the DMA, DMA32 or NORMAL zones depending on the
restrictions imposed by hardware. So I think the usercopy restriction
only affects a few oddball drivers (like this s390 stuff), which is
why you're not seeing more bug reports caused by this.
Matthew Wilcox (Oracle) Feb. 3, 2020, 7:46 a.m. UTC | #17
On Sat, Feb 01, 2020 at 08:27:49PM +0100, Jann Horn wrote:
> FWIW, as far as I understand, usercopy doesn't actually have any
> effect on drivers that use the modern, proper APIs, since those don't
> use the slab allocator at all - as I pointed out in my last mail, the
> dma-kmalloc* slabs are used very rarely. (Which is good, because
> putting objects from less-than-page-size slabs into iommu entries is a
> terrible idea from a security and reliability perspective because it
> gives the hardware access to completely unrelated memory.) Instead,
> they get pages from the page allocator, and these pages may e.g. be
> allocated from the DMA, DMA32 or NORMAL zones depending on the
> restrictions imposed by hardware. So I think the usercopy restriction
> only affects a few oddball drivers (like this s390 stuff), which is
> why you're not seeing more bug reports caused by this.

Getting pages from the page allocator is true for dma_alloc_coherent()
and friends.  But it's not true for streaming DMA mappings (dma_map_*)
for which the memory usually comes from kmalloc().  If this is something
we want to fix (and I have an awful feeling we're going to regret it
if we say "no, we trust the hardware"), we're going to have to come up
with a new memory allocation API for these cases.  Or bounce bugger the
memory for devices we don't trust.

The problem with the dma_map_* API is that memory might end up being
allocated once and then used multiple times by different drivers.  eg if
I allocate an NFS packet, it might get sent first to eth0, then (when the
route fails) sent to eth1.  Similarly in storage, a RAID-5 driver might
map the same memory several times to send to different disk controllers.
Christoph Lameter (Ampere) Feb. 3, 2020, 5:20 p.m. UTC | #18
On Sat, 1 Feb 2020, Kees Cook wrote:
>
> I can't find where the address limit for dma-kmalloc is implemented.

include/linux/mmzones.h

enum zone_type {
        /*
         * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
         * to DMA to all of the addressable memory (ZONE_NORMAL).
         * On architectures where this area covers the whole 32 bit address
         * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
         * DMA addressing constraints. This distinction is important as a 32bit
         * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
         * platforms may need both zones as they support peripherals with
         * different DMA addressing limitations.
         *
         * Some examples:
         *
         *  - i386 and x86_64 have a fixed 16M ZONE_DMA and ZONE_DMA32 for the
         *    rest of the lower 4G.
         *
         *  - arm only uses ZONE_DMA, the size, up to 4G, may vary depending on
         *    the specific device.
         *
         *  - arm64 has a fixed 1G ZONE_DMA and ZONE_DMA32 for the rest of the
         *    lower 4G.
         *
         *  - powerpc only uses ZONE_DMA, the size, up to 2G, may vary
         *    depending on the specific device.
         *
         *  - s390 uses ZONE_DMA fixed to the lower 2G.
         *
         *  - ia64 and riscv only use ZONE_DMA32.
         *
         *  - parisc uses neither.
         */
#ifdef CONFIG_ZONE_DMA
        ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
        ZONE_DMA32,
#endif
        /*
         * Normal addressable memory is in ZONE_NORMAL. DMA operations can
be
         * performed on pages in ZONE_NORMAL if the DMA devices support
         * transfers to all addressable memory.
         */
        ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
        /*
         * A memory area that is only addressable by the kernel through
         * mapping portions into its own address space. This is for example
         * used by i386 to allow the kernel to address the memory beyond
         * 900MB. The kernel will set up special mappings (page
         * table entries on i386) for each page that the kernel needs to
         * access.
         */
        ZONE_HIGHMEM,
#endif
        ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
        ZONE_DEVICE,
#endif
        __MAX_NR_ZONES

};
Christoph Hellwig Feb. 3, 2020, 5:36 p.m. UTC | #19
On Wed, Jan 29, 2020 at 06:19:56PM +0100, Christian Borntraeger wrote:
> There is not necessarily a device for that. It is a hypervisor interface (an
> instruction that is interpreted by z/VM). We do have the netiucv driver that
> creates a virtual nic, but there is also AF_IUCV which works without a device.
> 
> But back to the original question: If we mark kmalloc caches as usercopy caches,
> we should do the same for DMA kmalloc caches. As outlined by Christoph, this has
> nothing to do with device DMA.

Oh well, s/390 with its weird mix of cpu and I/O again.  Everywhere else
where we have addressing limits we do treat that as a DMA address.

We've also had a bit of a lose plan to force ZONE_DMA as a public
interface out, as it is generally the wrong thing to do for drivers.
A ZONE_32 and/or ZONE_31 makes some sense as the backing for the
dma allocator, but it mostly shouldn't be exposed, especially not to
the slab allocator.
Christoph Hellwig Feb. 3, 2020, 5:38 p.m. UTC | #20
On Thu, Jan 30, 2020 at 11:23:38AM -0800, Kees Cook wrote:
> Hm, looks like it's allocated from the low 16MB. Seems like poor naming!
> :) There seems to be a LOT of stuff using GFP_DMA, and it seems unlikely
> those are all expecting low addresses?

Most of that is either completely or partially bogus.  Besides the
weird S/390 stuff pretty much everything should be using the dma
allocators.  A couple really messed up drivers even pass GFP_DMA*
to the dma allocator, but my patches to fix that up seem to be stuck.
Christoph Hellwig Feb. 3, 2020, 5:41 p.m. UTC | #21
On Sun, Feb 02, 2020 at 11:46:44PM -0800, Matthew Wilcox wrote:
> > gives the hardware access to completely unrelated memory.) Instead,
> > they get pages from the page allocator, and these pages may e.g. be
> > allocated from the DMA, DMA32 or NORMAL zones depending on the
> > restrictions imposed by hardware. So I think the usercopy restriction
> > only affects a few oddball drivers (like this s390 stuff), which is
> > why you're not seeing more bug reports caused by this.
> 
> Getting pages from the page allocator is true for dma_alloc_coherent()
> and friends.

dma_alloc_coherent also has a few other memory sources than the page
allocator..

> But it's not true for streaming DMA mappings (dma_map_*)
> for which the memory usually comes from kmalloc().  If this is something
> we want to fix (and I have an awful feeling we're going to regret it
> if we say "no, we trust the hardware"), we're going to have to come up
> with a new memory allocation API for these cases.  Or bounce bugger the
> memory for devices we don't trust.

There aren't too many places that use slab allocations for streaming
mappings, and then do usercopies into them.  But I vaguely remember
some usb code getting the annotations for that a while ago.

But if you don't trust your hardware you will need to use IOMMUs and
page aligned memory, or IOMMUs plus bounce buffering for the kmalloc
allocations (we've recently grown code to do that for the intel-iommu
driver, which needs to be lifted into the generic IOMMU code).
Vlastimil Babka April 7, 2020, 8 a.m. UTC | #22
On 1/31/20 1:03 PM, Jann Horn wrote:

> I think dma-kmalloc slabs should be handled the same way as normal
> kmalloc slabs. When a dma-kmalloc allocation is freshly created, it is
> just normal kernel memory - even if it might later be used for DMA -,
> and it should be perfectly fine to copy_from_user() into such
> allocations at that point, and to copy_to_user() out of them at the
> end. If you look at the places where such allocations are created, you
> can see things like kmemdup(), memcpy() and so on - all normal
> operations that shouldn't conceptually be different from usercopy in
> any relevant way.
 
So, let's do that?

----8<----
From d5190e4e871689a530da3c3fd327be45a88f006a Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 7 Apr 2020 09:58:00 +0200
Subject: [PATCH] usercopy: Mark dma-kmalloc caches as usercopy caches

We have seen a "usercopy: Kernel memory overwrite attempt detected to SLUB
object 'dma-kmalloc-1 k' (offset 0, size 11)!" error on s390x, as IUCV uses
kmalloc() with __GFP_DMA because of memory address restrictions.
The issue has been discussed [2] and it has been noted that if all the kmalloc
caches are marked as usercopy, there's little reason not to mark dma-kmalloc
caches too. The 'dma' part merely means that __GFP_DMA is used to restrict
memory address range.

As Jann Horn put it [3]:

"I think dma-kmalloc slabs should be handled the same way as normal
kmalloc slabs. When a dma-kmalloc allocation is freshly created, it is
just normal kernel memory - even if it might later be used for DMA -,
and it should be perfectly fine to copy_from_user() into such
allocations at that point, and to copy_to_user() out of them at the
end. If you look at the places where such allocations are created, you
can see things like kmemdup(), memcpy() and so on - all normal
operations that shouldn't conceptually be different from usercopy in
any relevant way."

Thus this patch marks the dma-kmalloc-* caches as usercopy.

[1] https://bugzilla.suse.com/show_bug.cgi?id=1156053
[2] https://lore.kernel.org/kernel-hardening/bfca96db-bbd0-d958-7732-76e36c667c68@suse.cz/
[3] https://lore.kernel.org/kernel-hardening/CAG48ez1a4waGk9kB0WLaSbs4muSoK0AYAVk8=XYaKj4_+6e6Hg@mail.gmail.com/

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab_common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5282f881d2f5..ae9486160594 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags)
 			kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
 				kmalloc_info[i].name[KMALLOC_DMA],
 				kmalloc_info[i].size,
-				SLAB_CACHE_DMA | flags, 0, 0);
+				SLAB_CACHE_DMA | flags, 0,
+				kmalloc_info[i].size);
 		}
 	}
 #endif
Christian Borntraeger April 7, 2020, 11:05 a.m. UTC | #23
On 07.04.20 10:00, Vlastimil Babka wrote:
> On 1/31/20 1:03 PM, Jann Horn wrote:
> 
>> I think dma-kmalloc slabs should be handled the same way as normal
>> kmalloc slabs. When a dma-kmalloc allocation is freshly created, it is
>> just normal kernel memory - even if it might later be used for DMA -,
>> and it should be perfectly fine to copy_from_user() into such
>> allocations at that point, and to copy_to_user() out of them at the
>> end. If you look at the places where such allocations are created, you
>> can see things like kmemdup(), memcpy() and so on - all normal
>> operations that shouldn't conceptually be different from usercopy in
>> any relevant way.
>  
> So, let's do that?
> 
> ----8<----
> From d5190e4e871689a530da3c3fd327be45a88f006a Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Tue, 7 Apr 2020 09:58:00 +0200
> Subject: [PATCH] usercopy: Mark dma-kmalloc caches as usercopy caches
> 
> We have seen a "usercopy: Kernel memory overwrite attempt detected to SLUB
> object 'dma-kmalloc-1 k' (offset 0, size 11)!" error on s390x, as IUCV uses
> kmalloc() with __GFP_DMA because of memory address restrictions.
> The issue has been discussed [2] and it has been noted that if all the kmalloc
> caches are marked as usercopy, there's little reason not to mark dma-kmalloc
> caches too. The 'dma' part merely means that __GFP_DMA is used to restrict
> memory address range.
> 
> As Jann Horn put it [3]:
> 
> "I think dma-kmalloc slabs should be handled the same way as normal
> kmalloc slabs. When a dma-kmalloc allocation is freshly created, it is
> just normal kernel memory - even if it might later be used for DMA -,
> and it should be perfectly fine to copy_from_user() into such
> allocations at that point, and to copy_to_user() out of them at the
> end. If you look at the places where such allocations are created, you
> can see things like kmemdup(), memcpy() and so on - all normal
> operations that shouldn't conceptually be different from usercopy in
> any relevant way."
> 
> Thus this patch marks the dma-kmalloc-* caches as usercopy.
> 
> [1] https://bugzilla.suse.com/show_bug.cgi?id=1156053
> [2] https://lore.kernel.org/kernel-hardening/bfca96db-bbd0-d958-7732-76e36c667c68@suse.cz/
> [3] https://lore.kernel.org/kernel-hardening/CAG48ez1a4waGk9kB0WLaSbs4muSoK0AYAVk8=XYaKj4_+6e6Hg@mail.gmail.com/
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>

> ---
>  mm/slab_common.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 5282f881d2f5..ae9486160594 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags)
>  			kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
>  				kmalloc_info[i].name[KMALLOC_DMA],
>  				kmalloc_info[i].size,
> -				SLAB_CACHE_DMA | flags, 0, 0);
> +				SLAB_CACHE_DMA | flags, 0,
> +				kmalloc_info[i].size);
>  		}
>  	}
>  #endif
>
Jiri Slaby April 20, 2020, 7:53 a.m. UTC | #24
On 07. 04. 20, 10:00, Vlastimil Babka wrote:
> From d5190e4e871689a530da3c3fd327be45a88f006a Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Tue, 7 Apr 2020 09:58:00 +0200
> Subject: [PATCH] usercopy: Mark dma-kmalloc caches as usercopy caches
> 
> We have seen a "usercopy: Kernel memory overwrite attempt detected to SLUB
> object 'dma-kmalloc-1 k' (offset 0, size 11)!" error on s390x, as IUCV uses
> kmalloc() with __GFP_DMA because of memory address restrictions.
> The issue has been discussed [2] and it has been noted that if all the kmalloc
> caches are marked as usercopy, there's little reason not to mark dma-kmalloc
> caches too. The 'dma' part merely means that __GFP_DMA is used to restrict
> memory address range.
> 
> As Jann Horn put it [3]:
> 
> "I think dma-kmalloc slabs should be handled the same way as normal
> kmalloc slabs. When a dma-kmalloc allocation is freshly created, it is
> just normal kernel memory - even if it might later be used for DMA -,
> and it should be perfectly fine to copy_from_user() into such
> allocations at that point, and to copy_to_user() out of them at the
> end. If you look at the places where such allocations are created, you
> can see things like kmemdup(), memcpy() and so on - all normal
> operations that shouldn't conceptually be different from usercopy in
> any relevant way."
> 
> Thus this patch marks the dma-kmalloc-* caches as usercopy.
> 
> [1] https://bugzilla.suse.com/show_bug.cgi?id=1156053
> [2] https://lore.kernel.org/kernel-hardening/bfca96db-bbd0-d958-7732-76e36c667c68@suse.cz/
> [3] https://lore.kernel.org/kernel-hardening/CAG48ez1a4waGk9kB0WLaSbs4muSoK0AYAVk8=XYaKj4_+6e6Hg@mail.gmail.com/
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Friendly ping.

Acked-by: Jiri Slaby <jslaby@suse.cz>

> ---
>  mm/slab_common.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 5282f881d2f5..ae9486160594 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags)
>  			kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
>  				kmalloc_info[i].name[KMALLOC_DMA],
>  				kmalloc_info[i].size,
> -				SLAB_CACHE_DMA | flags, 0, 0);
> +				SLAB_CACHE_DMA | flags, 0,
> +				kmalloc_info[i].size);
>  		}
>  	}
>  #endif
> 

thanks,
Kees Cook April 20, 2020, 5:43 p.m. UTC | #25
On Mon, Apr 20, 2020 at 09:53:20AM +0200, Jiri Slaby wrote:
> On 07. 04. 20, 10:00, Vlastimil Babka wrote:
> > From d5190e4e871689a530da3c3fd327be45a88f006a Mon Sep 17 00:00:00 2001
> > From: Vlastimil Babka <vbabka@suse.cz>
> > Date: Tue, 7 Apr 2020 09:58:00 +0200
> > Subject: [PATCH] usercopy: Mark dma-kmalloc caches as usercopy caches
> > 
> > We have seen a "usercopy: Kernel memory overwrite attempt detected to SLUB
> > object 'dma-kmalloc-1 k' (offset 0, size 11)!" error on s390x, as IUCV uses
> > kmalloc() with __GFP_DMA because of memory address restrictions.
> > The issue has been discussed [2] and it has been noted that if all the kmalloc
> > caches are marked as usercopy, there's little reason not to mark dma-kmalloc
> > caches too. The 'dma' part merely means that __GFP_DMA is used to restrict
> > memory address range.
> > 
> > As Jann Horn put it [3]:
> > 
> > "I think dma-kmalloc slabs should be handled the same way as normal
> > kmalloc slabs. When a dma-kmalloc allocation is freshly created, it is
> > just normal kernel memory - even if it might later be used for DMA -,
> > and it should be perfectly fine to copy_from_user() into such
> > allocations at that point, and to copy_to_user() out of them at the
> > end. If you look at the places where such allocations are created, you
> > can see things like kmemdup(), memcpy() and so on - all normal
> > operations that shouldn't conceptually be different from usercopy in
> > any relevant way."
> > 
> > Thus this patch marks the dma-kmalloc-* caches as usercopy.
> > 
> > [1] https://bugzilla.suse.com/show_bug.cgi?id=1156053
> > [2] https://lore.kernel.org/kernel-hardening/bfca96db-bbd0-d958-7732-76e36c667c68@suse.cz/
> > [3] https://lore.kernel.org/kernel-hardening/CAG48ez1a4waGk9kB0WLaSbs4muSoK0AYAVk8=XYaKj4_+6e6Hg@mail.gmail.com/
> > 
> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Friendly ping.
> 
> Acked-by: Jiri Slaby <jslaby@suse.cz>

Should this go via -mm?

-Kees

> 
> > ---
> >  mm/slab_common.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/mm/slab_common.c b/mm/slab_common.c
> > index 5282f881d2f5..ae9486160594 100644
> > --- a/mm/slab_common.c
> > +++ b/mm/slab_common.c
> > @@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags)
> >  			kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
> >  				kmalloc_info[i].name[KMALLOC_DMA],
> >  				kmalloc_info[i].size,
> > -				SLAB_CACHE_DMA | flags, 0, 0);
> > +				SLAB_CACHE_DMA | flags, 0,
> > +				kmalloc_info[i].size);
> >  		}
> >  	}
> >  #endif
> > 
> 
> thanks,
> -- 
> js
> suse labs
diff mbox

Patch

diff --git a/mm/slab.c b/mm/slab.c
index b9b0df620bb9..dd367fe17a4e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1291,7 +1291,8 @@  void __init kmem_cache_init(void)
 	 */
 	kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
 				kmalloc_info[INDEX_NODE].name,
-				kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
+				kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
+				0, kmalloc_size(INDEX_NODE));
 	slab_state = PARTIAL_NODE;
 	setup_kmalloc_cache_index_table();
 
diff --git a/mm/slab.h b/mm/slab.h
index b476c435680f..98c5bcc45d8b 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -97,7 +97,8 @@  struct kmem_cache *kmalloc_slab(size_t, gfp_t);
 int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
 
 extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
-			slab_flags_t flags);
+			slab_flags_t flags, size_t useroffset,
+			size_t usersize);
 extern void create_boot_cache(struct kmem_cache *, const char *name,
 			size_t size, slab_flags_t flags, size_t useroffset,
 			size_t usersize);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a51f65408637..8ac2a6320a6c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -937,14 +937,15 @@  void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
 }
 
 struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
-				slab_flags_t flags)
+				slab_flags_t flags, size_t useroffset,
+				size_t usersize)
 {
 	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
 
 	if (!s)
 		panic("Out of memory when creating slab %s\n", name);
 
-	create_boot_cache(s, name, size, flags, 0, size);
+	create_boot_cache(s, name, size, flags, useroffset, usersize);
 	list_add(&s->list, &slab_caches);
 	memcg_link_cache(s);
 	s->refcount = 1;
@@ -1098,7 +1099,8 @@  void __init setup_kmalloc_cache_index_table(void)
 static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
 {
 	kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
-					kmalloc_info[idx].size, flags);
+					kmalloc_info[idx].size, flags, 0,
+					kmalloc_info[idx].size);
 }
 
 /*
@@ -1139,7 +1141,7 @@  void __init create_kmalloc_caches(slab_flags_t flags)
 
 			BUG_ON(!n);
 			kmalloc_dma_caches[i] = create_kmalloc_cache(n,
-				size, SLAB_CACHE_DMA | flags);
+				size, SLAB_CACHE_DMA | flags, 0, 0);
 		}
 	}
 #endif