diff mbox

[V2] mlx4_core: allocate ICM memory in page size chunks

Message ID 20180511192318.22342-1-qing.huang@oracle.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Qing Huang May 11, 2018, 7:23 p.m. UTC
When a system is under memory pressure (high usage with fragments),
the original 256KB ICM chunk allocations will likely trigger kernel
memory management to enter slow path doing memory compact/migration
ops in order to complete high order memory allocations.

When that happens, user processes calling uverb APIs may get stuck
for more than 120s easily even though there are a lot of free pages
in smaller chunks available in the system.

Syslog:
...
Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
oracle_205573_e:205573 blocked for more than 120 seconds.
...

With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.

However, in order to support smaller ICM chunk sizes, we need to fix
another issue in large size kcalloc allocations.

E.g.
Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
entry). So we need a 16MB allocation for a table->icm pointer array to
hold 2M pointers which can easily cause kcalloc to fail.

The solution is to use vzalloc to replace kcalloc. There is no need
for contiguous memory pages for a driver metadata structure (no need
for DMA ops).

Signed-off-by: Qing Huang <qing.huang@oracle.com>
Acked-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
---
v2 -> v1: adjusted chunk size to reflect different architectures.

 drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

Comments

Tariq Toukan May 13, 2018, 9 a.m. UTC | #1
On 11/05/2018 10:23 PM, Qing Huang wrote:
> When a system is under memory presure (high usage with fragments),
> the original 256KB ICM chunk allocations will likely trigger kernel
> memory management to enter slow path doing memory compact/migration
> ops in order to complete high order memory allocations.
> 
> When that happens, user processes calling uverb APIs may get stuck
> for more than 120s easily even though there are a lot of free pages
> in smaller chunks available in the system.
> 
> Syslog:
> ...
> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
> oracle_205573_e:205573 blocked for more than 120 seconds.
> ...
> 
> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
> 
> However in order to support smaller ICM chunk size, we need to fix
> another issue in large size kcalloc allocations.
> 
> E.g.
> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
> entry). So we need a 16MB allocation for a table->icm pointer array to
> hold 2M pointers which can easily cause kcalloc to fail.
> 
> The solution is to use vzalloc to replace kcalloc. There is no need
> for contiguous memory pages for a driver meta data structure (no need
> of DMA ops).
> 
> Signed-off-by: Qing Huang <qing.huang@oracle.com>
> Acked-by: Daniel Jurgens <danielj@mellanox.com>
> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
> ---
> v2 -> v1: adjusted chunk size to reflect different architectures.
> 
>   drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>   1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
> index a822f7a..ccb62b8 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
> @@ -43,12 +43,12 @@
>   #include "fw.h"
>   
>   /*
> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
> - * per chunk.
> + * We allocate in page size (default 4KB on many archs) chunks to avoid high
> + * order memory allocations in fragmented/high usage memory situation.
>    */
>   enum {
> -	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
> -	MLX4_TABLE_CHUNK_SIZE	= 1 << 18
> +	MLX4_ICM_ALLOC_SIZE	= 1 << PAGE_SHIFT,
> +	MLX4_TABLE_CHUNK_SIZE	= 1 << PAGE_SHIFT

Which is actually PAGE_SIZE.
Also, please add a comma at the end of the last entry.

>   };
>   
>   static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
>   	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>   	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>   
> -	table->icm      = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
> +	table->icm      = vzalloc(num_icm * sizeof(*table->icm));

Why not kvzalloc ?

>   	if (!table->icm)
>   		return -ENOMEM;
>   	table->virt     = virt;
> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
>   			mlx4_free_icm(dev, table->icm[i], use_coherent);
>   		}
>   
> -	kfree(table->icm);
> +	vfree(table->icm);
>   
>   	return -ENOMEM;
>   }
> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
>   			mlx4_free_icm(dev, table->icm[i], table->coherent);
>   		}
>   
> -	kfree(table->icm);
> +	vfree(table->icm);
>   }
> 

Thanks for your patch.

I need to verify there is no dramatic performance degradation here.
You can prepare and send a v3 in the meanwhile.

Thanks,
Tariq
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qing Huang May 14, 2018, 4:41 p.m. UTC | #2
On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>
>
> On 11/05/2018 10:23 PM, Qing Huang wrote:
>> When a system is under memory presure (high usage with fragments),
>> the original 256KB ICM chunk allocations will likely trigger kernel
>> memory management to enter slow path doing memory compact/migration
>> ops in order to complete high order memory allocations.
>>
>> When that happens, user processes calling uverb APIs may get stuck
>> for more than 120s easily even though there are a lot of free pages
>> in smaller chunks available in the system.
>>
>> Syslog:
>> ...
>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>> oracle_205573_e:205573 blocked for more than 120 seconds.
>> ...
>>
>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>
>> However in order to support smaller ICM chunk size, we need to fix
>> another issue in large size kcalloc allocations.
>>
>> E.g.
>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
>> entry). So we need a 16MB allocation for a table->icm pointer array to
>> hold 2M pointers which can easily cause kcalloc to fail.
>>
>> The solution is to use vzalloc to replace kcalloc. There is no need
>> for contiguous memory pages for a driver meta data structure (no need
>> of DMA ops).
>>
>> Signed-off-by: Qing Huang <qing.huang@oracle.com>
>> Acked-by: Daniel Jurgens <danielj@mellanox.com>
>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>> ---
>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>
>>   drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>   1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>> index a822f7a..ccb62b8 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>> @@ -43,12 +43,12 @@
>>   #include "fw.h"
>>     /*
>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>> - * per chunk.
>> + * We allocate in page size (default 4KB on many archs) chunks to 
>> avoid high
>> + * order memory allocations in fragmented/high usage memory situation.
>>    */
>>   enum {
>> -    MLX4_ICM_ALLOC_SIZE    = 1 << 18,
>> -    MLX4_TABLE_CHUNK_SIZE    = 1 << 18
>> +    MLX4_ICM_ALLOC_SIZE    = 1 << PAGE_SHIFT,
>> +    MLX4_TABLE_CHUNK_SIZE    = 1 << PAGE_SHIFT
>
> Which is actually PAGE_SIZE.

Yes, we wanted to avoid high order memory allocations.

> Also, please add a comma at the end of the last entry.

Hmm..., followed the existing code style and checkpatch.pl didn't 
complain about the comma.

>
>>   };
>>     static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct 
>> mlx4_icm_chunk *chunk)
>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>> struct mlx4_icm_table *table,
>>       obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>       num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>   -    table->icm      = kcalloc(num_icm, sizeof(*table->icm), 
>> GFP_KERNEL);
>> +    table->icm      = vzalloc(num_icm * sizeof(*table->icm));
>
> Why not kvzalloc ?

I think table->icm pointer array doesn't really need physically 
contiguous memory. Sometimes high order
memory allocation by kmalloc variants may trigger slow path and cause 
tasks to be blocked.

Thanks,
Qing

>
>>       if (!table->icm)
>>           return -ENOMEM;
>>       table->virt     = virt;
>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>> struct mlx4_icm_table *table,
>>               mlx4_free_icm(dev, table->icm[i], use_coherent);
>>           }
>>   -    kfree(table->icm);
>> +    vfree(table->icm);
>>         return -ENOMEM;
>>   }
>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, 
>> struct mlx4_icm_table *table)
>>               mlx4_free_icm(dev, table->icm[i], table->coherent);
>>           }
>>   -    kfree(table->icm);
>> +    vfree(table->icm);
>>   }
>>
>
> Thanks for your patch.
>
> I need to verify there is no dramatic performance degradation here.
> You can prepare and send a v3 in the meanwhile.
>
> Thanks,
> Tariq
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tariq Toukan May 15, 2018, 9:19 a.m. UTC | #3
On 14/05/2018 7:41 PM, Qing Huang wrote:
> 
> 
> On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>>
>>
>> On 11/05/2018 10:23 PM, Qing Huang wrote:
>>> When a system is under memory presure (high usage with fragments),
>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>> memory management to enter slow path doing memory compact/migration
>>> ops in order to complete high order memory allocations.
>>>
>>> When that happens, user processes calling uverb APIs may get stuck
>>> for more than 120s easily even though there are a lot of free pages
>>> in smaller chunks available in the system.
>>>
>>> Syslog:
>>> ...
>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>> ...
>>>
>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>
>>> However in order to support smaller ICM chunk size, we need to fix
>>> another issue in large size kcalloc allocations.
>>>
>>> E.g.
>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>
>>> The solution is to use vzalloc to replace kcalloc. There is no need
>>> for contiguous memory pages for a driver meta data structure (no need
>>> of DMA ops).
>>>
>>> Signed-off-by: Qing Huang <qing.huang@oracle.com>
>>> Acked-by: Daniel Jurgens <danielj@mellanox.com>
>>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>>> ---
>>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>>
>>>   drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>>   1 file changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
>>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>> index a822f7a..ccb62b8 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>> @@ -43,12 +43,12 @@
>>>   #include "fw.h"
>>>     /*
>>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>>> - * per chunk.
>>> + * We allocate in page size (default 4KB on many archs) chunks to 
>>> avoid high
>>> + * order memory allocations in fragmented/high usage memory situation.
>>>    */
>>>   enum {
>>> -    MLX4_ICM_ALLOC_SIZE    = 1 << 18,
>>> -    MLX4_TABLE_CHUNK_SIZE    = 1 << 18
>>> +    MLX4_ICM_ALLOC_SIZE    = 1 << PAGE_SHIFT,
>>> +    MLX4_TABLE_CHUNK_SIZE    = 1 << PAGE_SHIFT
>>
>> Which is actually PAGE_SIZE.
> 
> Yes, we wanted to avoid high order memory allocations.
> 

Then please use PAGE_SIZE instead.

>> Also, please add a comma at the end of the last entry.
> 
> Hmm..., followed the existing code style and checkpatch.pl didn't 
> complain about the comma.
> 

I am in favor of having a comma also after the last element, so that 
when another enum element is added we do not modify this line again, 
which would falsely affect git blame.

I know it didn't exist before your patch, but once we're here, let's do it.

>>
>>>   };
>>>     static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct 
>>> mlx4_icm_chunk *chunk)
>>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>>> struct mlx4_icm_table *table,
>>>       obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>>       num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>>   -    table->icm      = kcalloc(num_icm, sizeof(*table->icm), 
>>> GFP_KERNEL);
>>> +    table->icm      = vzalloc(num_icm * sizeof(*table->icm));
>>
>> Why not kvzalloc ?
> 
> I think table->icm pointer array doesn't really need physically 
> contiguous memory. Sometimes high order
> memory allocation by kmalloc variants may trigger slow path and cause 
> tasks to be blocked.
> 

This is control path so it is less latency-sensitive.
Let's not produce unnecessary degradation here, please call kvzalloc so 
we maintain a similar behavior when contiguous memory is available, and 
a fallback for resiliency.

> Thanks,
> Qing
> 
>>
>>>       if (!table->icm)
>>>           return -ENOMEM;
>>>       table->virt     = virt;
>>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>>> struct mlx4_icm_table *table,
>>>               mlx4_free_icm(dev, table->icm[i], use_coherent);
>>>           }
>>>   -    kfree(table->icm);
>>> +    vfree(table->icm);
>>>         return -ENOMEM;
>>>   }
>>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, 
>>> struct mlx4_icm_table *table)
>>>               mlx4_free_icm(dev, table->icm[i], table->coherent);
>>>           }
>>>   -    kfree(table->icm);
>>> +    vfree(table->icm);
>>>   }
>>>
>>
>> Thanks for your patch.
>>
>> I need to verify there is no dramatic performance degradation here.
>> You can prepare and send a v3 in the meanwhile.
>>
>> Thanks,
>> Tariq
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qing Huang May 15, 2018, 6:53 p.m. UTC | #4
On 5/15/2018 2:19 AM, Tariq Toukan wrote:
>
>
> On 14/05/2018 7:41 PM, Qing Huang wrote:
>>
>>
>> On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>>>
>>>
>>> On 11/05/2018 10:23 PM, Qing Huang wrote:
>>>> When a system is under memory presure (high usage with fragments),
>>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>>> memory management to enter slow path doing memory compact/migration
>>>> ops in order to complete high order memory allocations.
>>>>
>>>> When that happens, user processes calling uverb APIs may get stuck
>>>> for more than 120s easily even though there are a lot of free pages
>>>> in smaller chunks available in the system.
>>>>
>>>> Syslog:
>>>> ...
>>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>>> ...
>>>>
>>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>>
>>>> However in order to support smaller ICM chunk size, we need to fix
>>>> another issue in large size kcalloc allocations.
>>>>
>>>> E.g.
>>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for 
>>>> each mtt
>>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>>
>>>> The solution is to use vzalloc to replace kcalloc. There is no need
>>>> for contiguous memory pages for a driver meta data structure (no need
>>>> of DMA ops).
>>>>
>>>> Signed-off-by: Qing Huang <qing.huang@oracle.com>
>>>> Acked-by: Daniel Jurgens <danielj@mellanox.com>
>>>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>>>> ---
>>>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>>>
>>>>   drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>>>   1 file changed, 7 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
>>>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>> index a822f7a..ccb62b8 100644
>>>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>> @@ -43,12 +43,12 @@
>>>>   #include "fw.h"
>>>>     /*
>>>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>>>> - * per chunk.
>>>> + * We allocate in page size (default 4KB on many archs) chunks to 
>>>> avoid high
>>>> + * order memory allocations in fragmented/high usage memory 
>>>> situation.
>>>>    */
>>>>   enum {
>>>> -    MLX4_ICM_ALLOC_SIZE    = 1 << 18,
>>>> -    MLX4_TABLE_CHUNK_SIZE    = 1 << 18
>>>> +    MLX4_ICM_ALLOC_SIZE    = 1 << PAGE_SHIFT,
>>>> +    MLX4_TABLE_CHUNK_SIZE    = 1 << PAGE_SHIFT
>>>
>>> Which is actually PAGE_SIZE.
>>
>> Yes, we wanted to avoid high order memory allocations.
>>
>
> Then please use PAGE_SIZE instead.

PAGE_SIZE is usually defined as 1 << PAGE_SHIFT. So I think PAGE_SHIFT 
is actually more appropriate here.


>
>>> Also, please add a comma at the end of the last entry.
>>
>> Hmm..., followed the existing code style and checkpatch.pl didn't 
>> complain about the comma.
>>
>
> I am in favor of having a comma also after the last element, so that 
> when another enum element is added we do not modify this line again, 
> which would falsely affect git blame.
>
> I know it didn't exist before your patch, but once we're here, let's 
> do it.

I'm okay either way. If adding an extra comma is preferred by many 
people, someone should update checkpatch.pl to enforce it. :)

>
>>>
>>>>   };
>>>>     static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct 
>>>> mlx4_icm_chunk *chunk)
>>>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>>>> struct mlx4_icm_table *table,
>>>>       obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>>>       num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>>>   -    table->icm      = kcalloc(num_icm, sizeof(*table->icm), 
>>>> GFP_KERNEL);
>>>> +    table->icm      = vzalloc(num_icm * sizeof(*table->icm));
>>>
>>> Why not kvzalloc ?
>>
>> I think table->icm pointer array doesn't really need physically 
>> contiguous memory. Sometimes high order
>> memory allocation by kmalloc variants may trigger slow path and cause 
>> tasks to be blocked.
>>
>
> This is control path so it is less latency-sensitive.
> Let's not produce unnecessary degradation here, please call kvzalloc 
> so we maintain a similar behavior when contiguous memory is available, 
> and a fallback for resiliency.

Not sure what degradation exactly is caused by vzalloc here. I think it's 
better to leave physically contiguous pages
for other requests which really need them. Besides, slow path/mem 
compacting can be really expensive.

>
>> Thanks,
>> Qing
>>
>>>
>>>>       if (!table->icm)
>>>>           return -ENOMEM;
>>>>       table->virt     = virt;
>>>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>>>> struct mlx4_icm_table *table,
>>>>               mlx4_free_icm(dev, table->icm[i], use_coherent);
>>>>           }
>>>>   -    kfree(table->icm);
>>>> +    vfree(table->icm);
>>>>         return -ENOMEM;
>>>>   }
>>>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev 
>>>> *dev, struct mlx4_icm_table *table)
>>>>               mlx4_free_icm(dev, table->icm[i], table->coherent);
>>>>           }
>>>>   -    kfree(table->icm);
>>>> +    vfree(table->icm);
>>>>   }
>>>>
>>>
>>> Thanks for your patch.
>>>
>>> I need to verify there is no dramatic performance degradation here.
>>> You can prepare and send a v3 in the meanwhile.
>>>
>>> Thanks,
>>> Tariq
>>> -- 
>>> To unsubscribe from this list: send the line "unsubscribe 
>>> linux-rdma" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet May 15, 2018, 7:08 p.m. UTC | #5
On 05/15/2018 11:53 AM, Qing Huang wrote:
> 
>> This is control path so it is less latency-sensitive.
>> Let's not produce unnecessary degradation here, please call kvzalloc so we maintain a similar behavior when contiguous memory is available, and a fallback for resiliency.
> 
> No sure what exactly degradation is caused by vzalloc here. I think it's better to keep physically contiguous pages
> to other requests which really need them. Besides slow path/mem compacting can be really expensive.
>

Just use kvzalloc(), and you get the benefit of having contiguous memory if available,
without expensive compact phase.

This thing _automatically_ falls back to vmalloc(), thus your problem will be solved.

If you are not sure, trust others.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qing Huang May 15, 2018, 7:45 p.m. UTC | #6
On 5/15/2018 12:08 PM, Eric Dumazet wrote:
>
> On 05/15/2018 11:53 AM, Qing Huang wrote:
>>> This is control path so it is less latency-sensitive.
>>> Let's not produce unnecessary degradation here, please call kvzalloc so we maintain a similar behavior when contiguous memory is available, and a fallback for resiliency.
>> No sure what exactly degradation is caused by vzalloc here. I think it's better to keep physically contiguous pages
>> to other requests which really need them. Besides slow path/mem compacting can be really expensive.
>>
> Just use kvzalloc(), and you get the benefit of having contiguous memory if available,
> without expensive compact phase.
>
> This thing _automatically_ falls back to vmalloc(), thus your problem will be solved.
>
> If you are not sure, trust others.

Thanks for the review. There are many places in the kernel and applications 
where physically contiguous pages are needed.
We saw quite a few issues when there was not enough contiguous physical 
memory available. My main concern here is: why
use physically contiguous pages when they are not really needed?




--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tariq Toukan May 16, 2018, 7:04 a.m. UTC | #7
On 15/05/2018 9:53 PM, Qing Huang wrote:
> 
> 
> On 5/15/2018 2:19 AM, Tariq Toukan wrote:
>>
>>
>> On 14/05/2018 7:41 PM, Qing Huang wrote:
>>>
>>>
>>> On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>>>>
>>>>
>>>> On 11/05/2018 10:23 PM, Qing Huang wrote:
>>>>> When a system is under memory presure (high usage with fragments),
>>>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>>>> memory management to enter slow path doing memory compact/migration
>>>>> ops in order to complete high order memory allocations.
>>>>>
>>>>> When that happens, user processes calling uverb APIs may get stuck
>>>>> for more than 120s easily even though there are a lot of free pages
>>>>> in smaller chunks available in the system.
>>>>>
>>>>> Syslog:
>>>>> ...
>>>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>>>> ...
>>>>>
>>>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>>>
>>>>> However in order to support smaller ICM chunk size, we need to fix
>>>>> another issue in large size kcalloc allocations.
>>>>>
>>>>> E.g.
>>>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for 
>>>>> each mtt
>>>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>>>
>>>>> The solution is to use vzalloc to replace kcalloc. There is no need
>>>>> for contiguous memory pages for a driver meta data structure (no need
>>>>> of DMA ops).
>>>>>
>>>>> Signed-off-by: Qing Huang <qing.huang@oracle.com>
>>>>> Acked-by: Daniel Jurgens <danielj@mellanox.com>
>>>>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>>>>> ---
>>>>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>>>>
>>>>>   drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>>>>   1 file changed, 7 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
>>>>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> index a822f7a..ccb62b8 100644
>>>>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> @@ -43,12 +43,12 @@
>>>>>   #include "fw.h"
>>>>>     /*
>>>>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>>>>> - * per chunk.
>>>>> + * We allocate in page size (default 4KB on many archs) chunks to 
>>>>> avoid high
>>>>> + * order memory allocations in fragmented/high usage memory 
>>>>> situation.
>>>>>    */
>>>>>   enum {
>>>>> -    MLX4_ICM_ALLOC_SIZE    = 1 << 18,
>>>>> -    MLX4_TABLE_CHUNK_SIZE    = 1 << 18
>>>>> +    MLX4_ICM_ALLOC_SIZE    = 1 << PAGE_SHIFT,
>>>>> +    MLX4_TABLE_CHUNK_SIZE    = 1 << PAGE_SHIFT
>>>>
>>>> Which is actually PAGE_SIZE.
>>>
>>> Yes, we wanted to avoid high order memory allocations.
>>>
>>
>> Then please use PAGE_SIZE instead.
> 
> PAGE_SIZE is usually defined as 1 << PAGE_SHIFT. So I think PAGE_SHIFT 
> is actually more appropriate here.
> 

Definition of PAGE_SIZE varies among different archs.
It is not always as simple as 1 << PAGE_SHIFT.
It might be:
PAGE_SIZE (1UL << PAGE_SHIFT)
PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
etc...

Please replace 1 << PAGE_SHIFT with PAGE_SIZE.

> 
>>
>>>> Also, please add a comma at the end of the last entry.
>>>
>>> Hmm..., followed the existing code style and checkpatch.pl didn't 
>>> complain about the comma.
>>>
>>
>> I am in favor of having a comma also after the last element, so that 
>> when another enum element is added we do not modify this line again, 
>> which would falsely affect git blame.
>>
>> I know it didn't exist before your patch, but once we're here, let's 
>> do it.
> 
> I'm okay either way. If adding an extra comma is preferred by many 
> people, someone should update checkpatch.pl to enforce it. :)
> 
I agree.
Until then, please use an extra comma in this patch.

>>
>>>>
>>>>>   };
>>>>>     static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct 
>>>>> mlx4_icm_chunk *chunk)
>>>>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>>>>> struct mlx4_icm_table *table,
>>>>>       obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>>>>       num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>>>>   -    table->icm      = kcalloc(num_icm, sizeof(*table->icm), 
>>>>> GFP_KERNEL);
>>>>> +    table->icm      = vzalloc(num_icm * sizeof(*table->icm));
>>>>
>>>> Why not kvzalloc ?
>>>
>>> I think table->icm pointer array doesn't really need physically 
>>> contiguous memory. Sometimes high order
>>> memory allocation by kmalloc variants may trigger slow path and cause 
>>> tasks to be blocked.
>>>
>>
>> This is control path so it is less latency-sensitive.
>> Let's not produce unnecessary degradation here, please call kvzalloc 
>> so we maintain a similar behavior when contiguous memory is available, 
>> and a fallback for resiliency.
> 
> No sure what exactly degradation is caused by vzalloc here. I think it's 
> better to keep physically contiguous pages
> to other requests which really need them. Besides slow path/mem 
> compacting can be really expensive.
> 

Degradation is expected when you replace a contig memory with non-contig 
memory, without any perf test.
We agree that when contig memory is not available, we should use 
non-contig instead of simply failing, and for this you can call kvzalloc.

>>
>>> Thanks,
>>> Qing
>>>
>>>>
>>>>>       if (!table->icm)
>>>>>           return -ENOMEM;
>>>>>       table->virt     = virt;
>>>>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>>>>> struct mlx4_icm_table *table,
>>>>>               mlx4_free_icm(dev, table->icm[i], use_coherent);
>>>>>           }
>>>>>   -    kfree(table->icm);
>>>>> +    vfree(table->icm);
>>>>>         return -ENOMEM;
>>>>>   }
>>>>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev 
>>>>> *dev, struct mlx4_icm_table *table)
>>>>>               mlx4_free_icm(dev, table->icm[i], table->coherent);
>>>>>           }
>>>>>   -    kfree(table->icm);
>>>>> +    vfree(table->icm);
>>>>>   }
>>>>>
>>>>
>>>> Thanks for your patch.
>>>>
>>>> I need to verify there is no dramatic performance degradation here.
>>>> You can prepare and send a v3 in the meanwhile.
>>>>
>>>> Thanks,
>>>> Tariq
>>>> -- 
>>>> To unsubscribe from this list: send the line "unsubscribe 
>>>> linux-rdma" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gi-Oh Kim May 16, 2018, 10:10 a.m. UTC | #8
On Wed, May 16, 2018 at 9:04 AM, Tariq Toukan <tariqt@mellanox.com> wrote:
>
>
> On 15/05/2018 9:53 PM, Qing Huang wrote:
>>
>>
>>
>> On 5/15/2018 2:19 AM, Tariq Toukan wrote:
>>>
>>>
>>>
>>> On 14/05/2018 7:41 PM, Qing Huang wrote:
>>>>
>>>>
>>>>
>>>> On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 11/05/2018 10:23 PM, Qing Huang wrote:
>>>>>>
>>>>>> When a system is under memory presure (high usage with fragments),
>>>>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>>>>> memory management to enter slow path doing memory compact/migration
>>>>>> ops in order to complete high order memory allocations.
>>>>>>
>>>>>> When that happens, user processes calling uverb APIs may get stuck
>>>>>> for more than 120s easily even though there are a lot of free pages
>>>>>> in smaller chunks available in the system.
>>>>>>
>>>>>> Syslog:
>>>>>> ...
>>>>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>>>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>>>>> ...
>>>>>>
>>>>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>>>>
>>>>>> However in order to support smaller ICM chunk size, we need to fix
>>>>>> another issue in large size kcalloc allocations.
>>>>>>
>>>>>> E.g.
>>>>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>>>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each
>>>>>> mtt
>>>>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>>>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>>>>
>>>>>> The solution is to use vzalloc to replace kcalloc. There is no need
>>>>>> for contiguous memory pages for a driver meta data structure (no need
>>>>>> of DMA ops).
>>>>>>
>>>>>> Signed-off-by: Qing Huang <qing.huang@oracle.com>
>>>>>> Acked-by: Daniel Jurgens <danielj@mellanox.com>
>>>>>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>>>>>> ---
>>>>>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>>>>>
>>>>>>   drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>>>>>   1 file changed, 7 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>>> index a822f7a..ccb62b8 100644
>>>>>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>>> @@ -43,12 +43,12 @@
>>>>>>   #include "fw.h"
>>>>>>     /*
>>>>>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>>>>>> - * per chunk.
>>>>>> + * We allocate in page size (default 4KB on many archs) chunks to
>>>>>> avoid high
>>>>>> + * order memory allocations in fragmented/high usage memory
>>>>>> situation.
>>>>>>    */
>>>>>>   enum {
>>>>>> -    MLX4_ICM_ALLOC_SIZE    = 1 << 18,
>>>>>> -    MLX4_TABLE_CHUNK_SIZE    = 1 << 18
>>>>>> +    MLX4_ICM_ALLOC_SIZE    = 1 << PAGE_SHIFT,
>>>>>> +    MLX4_TABLE_CHUNK_SIZE    = 1 << PAGE_SHIFT
>>>>>
>>>>>
>>>>> Which is actually PAGE_SIZE.
>>>>
>>>>
>>>> Yes, we wanted to avoid high order memory allocations.
>>>>
>>>
>>> Then please use PAGE_SIZE instead.
>>
>>
>> PAGE_SIZE is usually defined as 1 << PAGE_SHIFT. So I think PAGE_SHIFT is
>> actually more appropriate here.
>>
>
> Definition of PAGE_SIZE varies among different archs.
> It is not always as simple as 1 << PAGE_SHIFT.
> It might be:
> PAGE_SIZE (1UL << PAGE_SHIFT)
> PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
> etc...
>
> Please replace 1 << PAGE_SHIFT with PAGE_SIZE.
>
>>
>>>
>>>>> Also, please add a comma at the end of the last entry.
>>>>
>>>>
>>>> Hmm..., followed the existing code style and checkpatch.pl didn't
>>>> complain about the comma.
>>>>
>>>
>>> I am in favor of having a comma also after the last element, so that when
>>> another enum element is added we do not modify this line again, which would
>>> falsely affect git blame.
>>>
>>> I know it didn't exist before your patch, but once we're here, let's do
>>> it.
>>
>>
>> I'm okay either way. If adding an extra comma is preferred by many people,
>> someone should update checkpatch.pl to enforce it. :)
>>
> I agree.
> Until then, please use an extra comma in this patch.
>
>>>
>>>>>
>>>>>>   };
>>>>>>     static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct
>>>>>> mlx4_icm_chunk *chunk)
>>>>>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>>>>>> struct mlx4_icm_table *table,
>>>>>>       obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>>>>>       num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>>>>>   -    table->icm      = kcalloc(num_icm, sizeof(*table->icm),
>>>>>> GFP_KERNEL);
>>>>>> +    table->icm      = vzalloc(num_icm * sizeof(*table->icm));
>>>>>
>>>>>
>>>>> Why not kvzalloc ?
>>>>
>>>>
>>>> I think table->icm pointer array doesn't really need physically
>>>> contiguous memory. Sometimes high order
>>>> memory allocation by kmalloc variants may trigger slow path and cause
>>>> tasks to be blocked.
>>>>
>>>
>>> This is control path so it is less latency-sensitive.
>>> Let's not produce unnecessary degradation here, please call kvzalloc so
>>> we maintain a similar behavior when contiguous memory is available, and a
>>> fallback for resiliency.
>>
>>
>> Not sure what exactly degradation is caused by vzalloc here. I think it's
>> better to keep physically contiguous pages
>> to other requests which really need them. Besides, slow path/mem compaction
>> can be really expensive.
>>
>
> Degradation is expected when you replace a contig memory with non-contig
> memory, without any perf test.
> We agree that when contig memory is not available, we should use non-contig
> instead of simply failing, and for this you can call kvzalloc.

The expected degradation would be small if the data is not very
performance-sensitive.
I think vmalloc would be better in general case.

Even if the server has hundreds of gigabyte memory, even 1MB
contiguous memory is often rare.
For example, I attached /proc/pagetypeinfo of my server running for 153 days.
The largest contiguous memory is 2^7=512KB.

Node    0, zone   Normal, type    Unmovable   4499   9418   4817
732    747    567     18      3      0      0      0
Node    0, zone   Normal, type      Movable  38179  40839  10546
1888    491     51      1      0      0      0      0
Node    0, zone   Normal, type  Reclaimable    117     98   1279
35     21     10      8      0      0      0      0
Node    0, zone   Normal, type   HighAtomic      0      0      0
0      0      0      0      0      0      0      0
Node    0, zone   Normal, type      Isolate      0      0      0
0      0      0      0      0      0      0      0

So I think vmalloc would be good if it is not very performance critical data.
kernel test robot May 17, 2018, 6:39 p.m. UTC | #9
Hi Qing,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]
[also build test ERROR on v4.17-rc5 next-20180517]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Qing-Huang/mlx4_core-allocate-ICM-memory-in-page-size-chunks/20180512-090438
config: sparc64-allyesconfig (attached as .config)
compiler: sparc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=sparc64 

All error/warnings (new ones prefixed by >>):

   drivers/net//ethernet/mellanox/mlx4/icm.c: In function 'mlx4_init_icm_table':
>> drivers/net//ethernet/mellanox/mlx4/icm.c:403:20: error: implicit declaration of function 'vzalloc'; did you mean 'kzalloc'? [-Werror=implicit-function-declaration]
     table->icm      = vzalloc(num_icm * sizeof(*table->icm));
                       ^~~~~~~
                       kzalloc
>> drivers/net//ethernet/mellanox/mlx4/icm.c:403:18: warning: assignment makes pointer from integer without a cast [-Wint-conversion]
     table->icm      = vzalloc(num_icm * sizeof(*table->icm));
                     ^
>> drivers/net//ethernet/mellanox/mlx4/icm.c:449:2: error: implicit declaration of function 'vfree'; did you mean 'kfree'? [-Werror=implicit-function-declaration]
     vfree(table->icm);
     ^~~~~
     kfree
   cc1: some warnings being treated as errors

vim +403 drivers/net//ethernet/mellanox/mlx4/icm.c

   389	
   390	int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
   391				u64 virt, int obj_size,	u32 nobj, int reserved,
   392				int use_lowmem, int use_coherent)
   393	{
   394		int obj_per_chunk;
   395		int num_icm;
   396		unsigned chunk_size;
   397		int i;
   398		u64 size;
   399	
   400		obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
   401		num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
   402	
 > 403		table->icm      = vzalloc(num_icm * sizeof(*table->icm));
   404		if (!table->icm)
   405			return -ENOMEM;
   406		table->virt     = virt;
   407		table->num_icm  = num_icm;
   408		table->num_obj  = nobj;
   409		table->obj_size = obj_size;
   410		table->lowmem   = use_lowmem;
   411		table->coherent = use_coherent;
   412		mutex_init(&table->mutex);
   413	
   414		size = (u64) nobj * obj_size;
   415		for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
   416			chunk_size = MLX4_TABLE_CHUNK_SIZE;
   417			if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > size)
   418				chunk_size = PAGE_ALIGN(size -
   419						i * MLX4_TABLE_CHUNK_SIZE);
   420	
   421			table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
   422						       (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
   423						       __GFP_NOWARN, use_coherent);
   424			if (!table->icm[i])
   425				goto err;
   426			if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) {
   427				mlx4_free_icm(dev, table->icm[i], use_coherent);
   428				table->icm[i] = NULL;
   429				goto err;
   430			}
   431	
   432			/*
   433			 * Add a reference to this ICM chunk so that it never
   434			 * gets freed (since it contains reserved firmware objects).
   435			 */
   436			++table->icm[i]->refcount;
   437		}
   438	
   439		return 0;
   440	
   441	err:
   442		for (i = 0; i < num_icm; ++i)
   443			if (table->icm[i]) {
   444				mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE,
   445					       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
   446				mlx4_free_icm(dev, table->icm[i], use_coherent);
   447			}
   448	
 > 449		vfree(table->icm);
   450	
   451		return -ENOMEM;
   452	}
   453	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index a822f7a..ccb62b8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,12 @@ 
 #include "fw.h"
 
 /*
- * We allocate in as big chunks as we can, up to a maximum of 256 KB
- * per chunk.
+ * We allocate in page size (default 4KB on many archs) chunks to avoid high
+ * order memory allocations in fragmented/high usage memory situation.
  */
 enum {
-	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
-	MLX4_TABLE_CHUNK_SIZE	= 1 << 18
+	MLX4_ICM_ALLOC_SIZE	= 1 << PAGE_SHIFT,
+	MLX4_TABLE_CHUNK_SIZE	= 1 << PAGE_SHIFT
 };
 
 static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
@@ -400,7 +400,7 @@  int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
 	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
 	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
 
-	table->icm      = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
+	table->icm      = vzalloc(num_icm * sizeof(*table->icm));
 	if (!table->icm)
 		return -ENOMEM;
 	table->virt     = virt;
@@ -446,7 +446,7 @@  int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
 			mlx4_free_icm(dev, table->icm[i], use_coherent);
 		}
 
-	kfree(table->icm);
+	vfree(table->icm);
 
 	return -ENOMEM;
 }
@@ -462,5 +462,5 @@  void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
 			mlx4_free_icm(dev, table->icm[i], table->coherent);
 		}
 
-	kfree(table->icm);
+	vfree(table->icm);
 }