diff mbox series

[v3,11/17] rdma_rxe: Address an issue with hardened user copy

Message ID 20200820224638.3212-12-rpearson@hpe.com (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Headers show
Series Memory window support for rdma_rxe | expand

Commit Message

Bob Pearson Aug. 20, 2020, 10:46 p.m. UTC
Added a new feature to pools to let driver white list a region of
a pool object. This removes a kernel oops caused when create qp
returns the qp number so the next patch will work without errors.

Signed-off-by: Bob Pearson <rpearson@hpe.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.c | 20 +++++++++++++++++---
 drivers/infiniband/sw/rxe/rxe_pool.h |  4 ++++
 2 files changed, 21 insertions(+), 3 deletions(-)

Comments

Zhu Yanjun Aug. 22, 2020, 3:32 a.m. UTC | #1
On 8/21/2020 6:46 AM, Bob Pearson wrote:
> Added a new feature to pools to let driver white list a region of
> a pool object. This removes a kernel oops caused when create qp
> returns the qp number so the next patch will work without errors.
>
> Signed-off-by: Bob Pearson <rpearson@hpe.com>
> ---
>   drivers/infiniband/sw/rxe/rxe_pool.c | 20 +++++++++++++++++---
>   drivers/infiniband/sw/rxe/rxe_pool.h |  4 ++++
>   2 files changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> index 5679714827ec..374e56689d30 100644
> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> @@ -40,9 +40,12 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
>   		.name		= "rxe-qp",
>   		.size		= sizeof(struct rxe_qp),
>   		.cleanup	= rxe_qp_cleanup,
> -		.flags		= RXE_POOL_INDEX,
> +		.flags		= RXE_POOL_INDEX
> +				| RXE_POOL_WHITELIST,
>   		.min_index	= RXE_MIN_QP_INDEX,
>   		.max_index	= RXE_MAX_QP_INDEX,
> +		.user_offset	= offsetof(struct rxe_qp, ibqp.qp_num),
> +		.user_size	= sizeof(u32),
>   	},
>   	[RXE_TYPE_CQ] = {
>   		.name		= "rxe-cq",
> @@ -116,10 +119,21 @@ int rxe_cache_init(void)
>   		type = &rxe_type_info[i];
>   		size = ALIGN(type->size, RXE_POOL_ALIGN);
>   		if (!(type->flags & RXE_POOL_NO_ALLOC)) {
> -			type->cache =
> -				kmem_cache_create(type->name, size,
> +			if (type->flags & RXE_POOL_WHITELIST) {
> +				type->cache =
> +					kmem_cache_create_usercopy(
> +						type->name, size,
> +						RXE_POOL_ALIGN,
> +						RXE_POOL_CACHE_FLAGS,
> +						type->user_offset,
> +						type->user_size, NULL);
> +			} else {
> +				type->cache =
> +					kmem_cache_create(type->name, size,
>   						  RXE_POOL_ALIGN,
>   						  RXE_POOL_CACHE_FLAGS, NULL);
> +			}
> +
>   			if (!type->cache) {
>   				pr_err("Unable to init kmem cache for %s\n",
>   				       type->name);
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
> index 664153bf9392..fc5b584a8137 100644
> --- a/drivers/infiniband/sw/rxe/rxe_pool.h
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.h
> @@ -17,6 +17,7 @@ enum rxe_pool_flags {
>   	RXE_POOL_INDEX		= BIT(1),
>   	RXE_POOL_KEY		= BIT(2),
>   	RXE_POOL_NO_ALLOC	= BIT(4),
> +	RXE_POOL_WHITELIST	= BIT(5),
>   };
>   
>   enum rxe_elem_type {
> @@ -44,6 +45,9 @@ struct rxe_type_info {
>   	u32			min_index;
>   	size_t			key_offset;
>   	size_t			key_size;
> +	/* for white listing where necessary */

s/where/when


> +	unsigned int		user_offset;
> +	unsigned int		user_size;
>   	struct kmem_cache	*cache;
>   };
>
Bob Pearson Aug. 22, 2020, 4:16 a.m. UTC | #2
On 8/21/20 10:32 PM, Zhu Yanjun wrote:
> On 8/21/2020 6:46 AM, Bob Pearson wrote:
>> Added a new feature to pools to let driver white list a region of
>> a pool object. This removes a kernel oops caused when create qp
>> returns the qp number so the next patch will work without errors.
>>
>> Signed-off-by: Bob Pearson <rpearson@hpe.com>
>> ---
>>   drivers/infiniband/sw/rxe/rxe_pool.c | 20 +++++++++++++++++---
>>   drivers/infiniband/sw/rxe/rxe_pool.h |  4 ++++
>>   2 files changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
>> index 5679714827ec..374e56689d30 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
>> @@ -40,9 +40,12 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
>>           .name        = "rxe-qp",
>>           .size        = sizeof(struct rxe_qp),
>>           .cleanup    = rxe_qp_cleanup,
>> -        .flags        = RXE_POOL_INDEX,
>> +        .flags        = RXE_POOL_INDEX
>> +                | RXE_POOL_WHITELIST,
>>           .min_index    = RXE_MIN_QP_INDEX,
>>           .max_index    = RXE_MAX_QP_INDEX,
>> +        .user_offset    = offsetof(struct rxe_qp, ibqp.qp_num),
>> +        .user_size    = sizeof(u32),
>>       },
>>       [RXE_TYPE_CQ] = {
>>           .name        = "rxe-cq",
>> @@ -116,10 +119,21 @@ int rxe_cache_init(void)
>>           type = &rxe_type_info[i];
>>           size = ALIGN(type->size, RXE_POOL_ALIGN);
>>           if (!(type->flags & RXE_POOL_NO_ALLOC)) {
>> -            type->cache =
>> -                kmem_cache_create(type->name, size,
>> +            if (type->flags & RXE_POOL_WHITELIST) {
>> +                type->cache =
>> +                    kmem_cache_create_usercopy(
>> +                        type->name, size,
>> +                        RXE_POOL_ALIGN,
>> +                        RXE_POOL_CACHE_FLAGS,
>> +                        type->user_offset,
>> +                        type->user_size, NULL);
>> +            } else {
>> +                type->cache =
>> +                    kmem_cache_create(type->name, size,
>>                             RXE_POOL_ALIGN,
>>                             RXE_POOL_CACHE_FLAGS, NULL);
>> +            }
>> +
>>               if (!type->cache) {
>>                   pr_err("Unable to init kmem cache for %s\n",
>>                          type->name);
>> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
>> index 664153bf9392..fc5b584a8137 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_pool.h
>> +++ b/drivers/infiniband/sw/rxe/rxe_pool.h
>> @@ -17,6 +17,7 @@ enum rxe_pool_flags {
>>       RXE_POOL_INDEX        = BIT(1),
>>       RXE_POOL_KEY        = BIT(2),
>>       RXE_POOL_NO_ALLOC    = BIT(4),
>> +    RXE_POOL_WHITELIST    = BIT(5),
>>   };
>>     enum rxe_elem_type {
>> @@ -44,6 +45,9 @@ struct rxe_type_info {
>>       u32            min_index;
>>       size_t            key_offset;
>>       size_t            key_size;
>> +    /* for white listing where necessary */
> 
> s/where/when
> 
> 
>> +    unsigned int        user_offset;
>> +    unsigned int        user_size;
>>       struct kmem_cache    *cache;
>>   };
>>   
> 
> 
The reason for this change is that every time I do anything with rdma_rxe on current head of tree I get a kernel oops with a warning that there is a bad or missing white list. I traced this back to the user_copy routine which (recently) decided that when you copy just a part of a kernel memory object stored in a kmem cache that this represented a risk of leaking information from the kernel to user space. For the QP object the qp_num is copied back to user space in the user API. They also provided a new kmem_ccache_create_usercopy call that allows you to specify a 'whitelisted' portion of each object with an offset and length. So I just made it a feature of pools since it may come up again instead of treating QPs differently that all the other objects. This is part of a general program to harden the Linux kernel.
You can see the change to rxe_cache_init in the same file. Perhaps just dropping the comment would address the concern. See an earlier post I made with a pointer to an article in lwn describing the changes to the kernel.
Leon Romanovsky Aug. 24, 2020, 8:47 a.m. UTC | #3
On Fri, Aug 21, 2020 at 11:16:59PM -0500, Bob Pearson wrote:
> On 8/21/20 10:32 PM, Zhu Yanjun wrote:
> > On 8/21/2020 6:46 AM, Bob Pearson wrote:
> >> Added a new feature to pools to let driver white list a region of
> >> a pool object. This removes a kernel oops caused when create qp
> >> returns the qp number so the next patch will work without errors.
> >>
> >> Signed-off-by: Bob Pearson <rpearson@hpe.com>
> >> ---
> >>   drivers/infiniband/sw/rxe/rxe_pool.c | 20 +++++++++++++++++---
> >>   drivers/infiniband/sw/rxe/rxe_pool.h |  4 ++++
> >>   2 files changed, 21 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> >> index 5679714827ec..374e56689d30 100644
> >> --- a/drivers/infiniband/sw/rxe/rxe_pool.c
> >> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> >> @@ -40,9 +40,12 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
> >>           .name        = "rxe-qp",
> >>           .size        = sizeof(struct rxe_qp),
> >>           .cleanup    = rxe_qp_cleanup,
> >> -        .flags        = RXE_POOL_INDEX,
> >> +        .flags        = RXE_POOL_INDEX
> >> +                | RXE_POOL_WHITELIST,
> >>           .min_index    = RXE_MIN_QP_INDEX,
> >>           .max_index    = RXE_MAX_QP_INDEX,
> >> +        .user_offset    = offsetof(struct rxe_qp, ibqp.qp_num),
> >> +        .user_size    = sizeof(u32),
> >>       },
> >>       [RXE_TYPE_CQ] = {
> >>           .name        = "rxe-cq",
> >> @@ -116,10 +119,21 @@ int rxe_cache_init(void)
> >>           type = &rxe_type_info[i];
> >>           size = ALIGN(type->size, RXE_POOL_ALIGN);
> >>           if (!(type->flags & RXE_POOL_NO_ALLOC)) {
> >> -            type->cache =
> >> -                kmem_cache_create(type->name, size,
> >> +            if (type->flags & RXE_POOL_WHITELIST) {
> >> +                type->cache =
> >> +                    kmem_cache_create_usercopy(
> >> +                        type->name, size,
> >> +                        RXE_POOL_ALIGN,
> >> +                        RXE_POOL_CACHE_FLAGS,
> >> +                        type->user_offset,
> >> +                        type->user_size, NULL);
> >> +            } else {
> >> +                type->cache =
> >> +                    kmem_cache_create(type->name, size,
> >>                             RXE_POOL_ALIGN,
> >>                             RXE_POOL_CACHE_FLAGS, NULL);
> >> +            }
> >> +
> >>               if (!type->cache) {
> >>                   pr_err("Unable to init kmem cache for %s\n",
> >>                          type->name);
> >> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
> >> index 664153bf9392..fc5b584a8137 100644
> >> --- a/drivers/infiniband/sw/rxe/rxe_pool.h
> >> +++ b/drivers/infiniband/sw/rxe/rxe_pool.h
> >> @@ -17,6 +17,7 @@ enum rxe_pool_flags {
> >>       RXE_POOL_INDEX        = BIT(1),
> >>       RXE_POOL_KEY        = BIT(2),
> >>       RXE_POOL_NO_ALLOC    = BIT(4),
> >> +    RXE_POOL_WHITELIST    = BIT(5),
> >>   };
> >>     enum rxe_elem_type {
> >> @@ -44,6 +45,9 @@ struct rxe_type_info {
> >>       u32            min_index;
> >>       size_t            key_offset;
> >>       size_t            key_size;
> >> +    /* for white listing where necessary */
> >
> > s/where/when
> >
> >
> >> +    unsigned int        user_offset;
> >> +    unsigned int        user_size;
> >>       struct kmem_cache    *cache;
> >>   };
> >>  
> >
> >
> The reason for this change is that every time I do anything with rdma_rxe on current head of tree I get a kernel oops with a warning that there is a bad or missing white list. I traced this back to the user_copy routine which (recently) decided that when you copy just a part of a kernel memory object stored in a kmem cache that this represented a risk of leaking information from the kernel to user space. For the QP object the qp_num is copied back to user space in the user API. They also provided a new kmem_ccache_create_usercopy call that allows you to specify a 'whitelisted' portion of each object with an offset and length. So I just made it a feature of pools since it may come up again instead of treating QPs differently that all the other objects. This is part of a general program to harden the Linux kernel.
> You can see the change to rxe_cache_init in the same file. Perhaps just dropping the comment would address the concern. See an earlier post I made with a pointer to an article in lwn describing the changes to the kernel.

RXE must be rewritten to be secure.

Please drop this patch, we don't need compilation for no reason.

Thanks
Leon Romanovsky Aug. 24, 2020, 8:52 a.m. UTC | #4
On Fri, Aug 21, 2020 at 11:16:59PM -0500, Bob Pearson wrote:
> On 8/21/20 10:32 PM, Zhu Yanjun wrote:
> > On 8/21/2020 6:46 AM, Bob Pearson wrote:
> >> Added a new feature to pools to let driver white list a region of
> >> a pool object. This removes a kernel oops caused when create qp
> >> returns the qp number so the next patch will work without errors.
> >>
> >> Signed-off-by: Bob Pearson <rpearson@hpe.com>

And we asked you to provide warning itself.

Thanks
Bob Pearson Aug. 24, 2020, 11:52 p.m. UTC | #5
On 8/24/20 3:52 AM, Leon Romanovsky wrote:
> On Fri, Aug 21, 2020 at 11:16:59PM -0500, Bob Pearson wrote:
>> On 8/21/20 10:32 PM, Zhu Yanjun wrote:
>>> On 8/21/2020 6:46 AM, Bob Pearson wrote:
>>>> Added a new feature to pools to let driver white list a region of
>>>> a pool object. This removes a kernel oops caused when create qp
>>>> returns the qp number so the next patch will work without errors.
>>>>
>>>> Signed-off-by: Bob Pearson <rpearson@hpe.com>
> 
> And we asked you to provide warning itself.
> 
> Thanks
> 

Thanks for your responses to this patch (11/17). I am not yet convinced that there is anything that needs fixing. If you read the code in __check_heap_object in mm/slab.c (see below) you can see that any memory reference to kernel space from the slab/slub allocator, even if it is otherwise perfectly fine, that is not contained in the usercopy region (useroffset to useroffset + usersize from the start of each object) will trigger a warning. This is intentional not a bug. If you create the cache with kmem_cache_create this region is NULL, if you use kmem_cache_create_usercopy you may set the limits on the usercopy region.

There at least two ways to mitigate this warning, set the usercopy region (whitelist it) or copy the data through some other memory (e.g. copy onto the stack and call user copy from there). I have tried both of these and they work but still you are looking for something else. Either of these changes makes rxe secure as you put it.

This user_copy warning is from drivers/infiniband/core and is referring to the qp objects. It has nothing to do with any of the other changes in the patches. It is caused by the addition of the checks below which are new to the mainline kernel.

#ifdef CONFIG_HARDENED_USERCOPY
/*
 * Rejects incorrectly sized objects and objects that are to be copied
 * to/from userspace but do not fall entirely within the containing slab
 * cache's usercopy region.
 *
 * Returns NULL if check passes, otherwise const char * to name of cache
 * to indicate an error.
 */
void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
                         bool to_user)
{
        struct kmem_cache *cachep;
        unsigned int objnr;       objnr = obj_to_index(cachep, page, (void *)ptr);
        BUG_ON(objnr >= cachep->num);

        /* Find offset within object. */
        offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);

        /* Allow address range falling entirely within usercopy region. */
        if (offset >= cachep->useroffset &&
            offset - cachep->useroffset <= cachep->usersize &&
            n <= cachep->useroffset - offset + cachep->usersize)
                return;
        if (usercopy_fallback &&
            offset <= cachep->object_size &&
            n <= cachep->object_size - offset) {
                usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
                return;
        }

        usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
Leon Romanovsky Aug. 25, 2020, 5:04 a.m. UTC | #6
On Mon, Aug 24, 2020 at 06:52:50PM -0500, Bob Pearson wrote:
> On 8/24/20 3:52 AM, Leon Romanovsky wrote:
> > On Fri, Aug 21, 2020 at 11:16:59PM -0500, Bob Pearson wrote:
> >> On 8/21/20 10:32 PM, Zhu Yanjun wrote:
> >>> On 8/21/2020 6:46 AM, Bob Pearson wrote:
> >>>> Added a new feature to pools to let driver white list a region of
> >>>> a pool object. This removes a kernel oops caused when create qp
> >>>> returns the qp number so the next patch will work without errors.
> >>>>
> >>>> Signed-off-by: Bob Pearson <rpearson@hpe.com>
> >
> > And we asked you to provide warning itself.
> >
> > Thanks
> >
>
> Thanks for your responses to this patch (11/17). I am not yet convinced that there is anything that needs fixing. If you read the code in __check_heap_object in mm/slab.c (see below) you can see that any memory reference to kernel space from the slab/slub allocator, even if it is otherwise perfectly fine, that is not contained in the usercopy region (useroffset to useroffset + usersize from the start of each object) will trigger a warning. This is intentional not a bug. If you create the cache with kmem_cache_create this region is NULL, if you use kmem_cache_create_usercopy you may set the limits on the usercopy region.

I suggest to drop this patch, in this cycle, I will send patch that
converts QP to general allocation scheme. It will remove RXE QP pool.

Thanks
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 5679714827ec..374e56689d30 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -40,9 +40,12 @@  struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
 		.name		= "rxe-qp",
 		.size		= sizeof(struct rxe_qp),
 		.cleanup	= rxe_qp_cleanup,
-		.flags		= RXE_POOL_INDEX,
+		.flags		= RXE_POOL_INDEX
+				| RXE_POOL_WHITELIST,
 		.min_index	= RXE_MIN_QP_INDEX,
 		.max_index	= RXE_MAX_QP_INDEX,
+		.user_offset	= offsetof(struct rxe_qp, ibqp.qp_num),
+		.user_size	= sizeof(u32),
 	},
 	[RXE_TYPE_CQ] = {
 		.name		= "rxe-cq",
@@ -116,10 +119,21 @@  int rxe_cache_init(void)
 		type = &rxe_type_info[i];
 		size = ALIGN(type->size, RXE_POOL_ALIGN);
 		if (!(type->flags & RXE_POOL_NO_ALLOC)) {
-			type->cache =
-				kmem_cache_create(type->name, size,
+			if (type->flags & RXE_POOL_WHITELIST) {
+				type->cache =
+					kmem_cache_create_usercopy(
+						type->name, size,
+						RXE_POOL_ALIGN,
+						RXE_POOL_CACHE_FLAGS,
+						type->user_offset,
+						type->user_size, NULL);
+			} else {
+				type->cache =
+					kmem_cache_create(type->name, size,
 						  RXE_POOL_ALIGN,
 						  RXE_POOL_CACHE_FLAGS, NULL);
+			}
+
 			if (!type->cache) {
 				pr_err("Unable to init kmem cache for %s\n",
 				       type->name);
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index 664153bf9392..fc5b584a8137 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -17,6 +17,7 @@  enum rxe_pool_flags {
 	RXE_POOL_INDEX		= BIT(1),
 	RXE_POOL_KEY		= BIT(2),
 	RXE_POOL_NO_ALLOC	= BIT(4),
+	RXE_POOL_WHITELIST	= BIT(5),
 };
 
 enum rxe_elem_type {
@@ -44,6 +45,9 @@  struct rxe_type_info {
 	u32			min_index;
 	size_t			key_offset;
 	size_t			key_size;
+	/* for white listing where necessary */
+	unsigned int		user_offset;
+	unsigned int		user_size;
 	struct kmem_cache	*cache;
 };