diff mbox series

[v2,for-next,1/2] RDMA/hns: Add support for CQ stash

Message ID 1605527919-48769-2-git-send-email-liweihang@huawei.com (mailing list archive)
State Superseded
Headers show
Series RDMA/hns: Add supports for stash | expand

Commit Message

Weihang Li Nov. 16, 2020, 11:58 a.m. UTC
From: Lang Cheng <chenglang@huawei.com>

Stash is a mechanism that uses the core information carried by the ARM AXI
bus to access the L3 cache. It can be used to improve the performance by
increasing the hit ratio of L3 cache. CQs need to enable stash by default.

Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_common.h | 12 +++++++++
 drivers/infiniband/hw/hns/hns_roce_device.h |  1 +
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 39 +++++++++++++++++------------
 4 files changed, 39 insertions(+), 16 deletions(-)

Comments

Leon Romanovsky Nov. 16, 2020, 1:46 p.m. UTC | #1
On Mon, Nov 16, 2020 at 07:58:38PM +0800, Weihang Li wrote:
> From: Lang Cheng <chenglang@huawei.com>
>
> Stash is a mechanism that uses the core information carried by the ARM AXI
> bus to access the L3 cache. It can be used to improve the performance by
> increasing the hit ratio of L3 cache. CQs need to enable stash by default.
>
> Signed-off-by: Lang Cheng <chenglang@huawei.com>
> Signed-off-by: Weihang Li <liweihang@huawei.com>
> ---
>  drivers/infiniband/hw/hns/hns_roce_common.h | 12 +++++++++
>  drivers/infiniband/hw/hns/hns_roce_device.h |  1 +
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +++
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 39 +++++++++++++++++------------
>  4 files changed, 39 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
> index f5669ff..8d96c4e 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_common.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
> @@ -53,6 +53,18 @@
>  #define roce_set_bit(origin, shift, val) \
>  	roce_set_field((origin), (1ul << (shift)), (shift), (val))
>
> +#define FIELD_LOC(field_h, field_l) field_h, field_l
> +
> +#define _hr_reg_set(arr, field_h, field_l)                                     \
> +	do {                                                                   \
> +		BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
> +		BUILD_BUG_ON((field_h) / 32 >= ARRAY_SIZE(arr));               \
> +		(arr)[(field_h) / 32] |=                                       \
> +			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32));  \
> +	} while (0)
> +
> +#define hr_reg_set(arr, field) _hr_reg_set(arr, field)

I afraid that it is too much.
1. FIELD_LOC() macro to hide two fields.
2. hr_reg_set and  _hr_reg_set are the same.
3. In both patches field_h and field_l are the same.
4. "do {} while (0)" without need.

Thanks
Weihang Li Nov. 17, 2020, 6:37 a.m. UTC | #2
On 2020/11/16 21:47, Leon Romanovsky wrote:
> On Mon, Nov 16, 2020 at 07:58:38PM +0800, Weihang Li wrote:
>> From: Lang Cheng <chenglang@huawei.com>
>>
>> Stash is a mechanism that uses the core information carried by the ARM AXI
>> bus to access the L3 cache. It can be used to improve the performance by
>> increasing the hit ratio of L3 cache. CQs need to enable stash by default.
>>
>> Signed-off-by: Lang Cheng <chenglang@huawei.com>
>> Signed-off-by: Weihang Li <liweihang@huawei.com>
>> ---
>>  drivers/infiniband/hw/hns/hns_roce_common.h | 12 +++++++++
>>  drivers/infiniband/hw/hns/hns_roce_device.h |  1 +
>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +++
>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 39 +++++++++++++++++------------
>>  4 files changed, 39 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
>> index f5669ff..8d96c4e 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_common.h
>> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
>> @@ -53,6 +53,18 @@
>>  #define roce_set_bit(origin, shift, val) \
>>  	roce_set_field((origin), (1ul << (shift)), (shift), (val))
>>
>> +#define FIELD_LOC(field_h, field_l) field_h, field_l
>> +
>> +#define _hr_reg_set(arr, field_h, field_l)                                     \
>> +	do {                                                                   \
>> +		BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
>> +		BUILD_BUG_ON((field_h) / 32 >= ARRAY_SIZE(arr));               \
>> +		(arr)[(field_h) / 32] |=                                       \
>> +			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32));  \
>> +	} while (0)
>> +
>> +#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
> 
> I afraid that it is too much.

Hi Leon,

Thanks for the comments.

> 1. FIELD_LOC() macro to hide two fields.

Jason has suggested us to simplify the function of setting/getting bit/field in
hns driver like IBA_SET and IBA_GET.

https://patchwork.kernel.org/project/linux-rdma/patch/1589982799-28728-3-git-send-email-liweihang@huawei.com/

So we try to make it easier and clearer to define a bitfield for developers.

For example:

	#define QPCEX_SRC_ID FIELD_LOC(94, 84)

	hr_reg_set(context->ext, QPCEX_SRC_ID);

This will set 84 ~ 91 bit of QPC to 1. Without FIELD_LOC(), it may look
like:

	#define QPCEX_SRC_ID_START 84
	#define QPCEX_SRC_ID_END 94

	hr_reg_set(context->ext, QPCEX_SRC_ID_END, QPCEX_SRC_ID_START);

> 2. hr_reg_set and  _hr_reg_set are the same.

'field' will be spilted into two parts: 'field_h' and 'field_l':

	#define _hr_reg_set(arr, field_h, field_l)
	...

	#define hr_reg_set(arr, field) _hr_reg_set(arr, field)

> 3. In both patches field_h and field_l are the same.

This is because we want hr_reg_set() to be used both for setting bits and for
setting fields. In this series, we just use it to set bit.

> 4. "do {} while (0)" without need.

OK, I will remove the do-while and replace BUILD_BUG_ON with BUILD_BUG_ON_ZERO:

	#define _hr_reg_set(arr, field_h, field_l)                                  \
		(arr)[(field_h) / 32] |=                                            \
			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32)) +      \
			BUILD_BUG_ON_ZERO(((field_h) / 32) != ((field_l) / 32)) +   \
			BUILD_BUG_ON_ZERO((field_h) / 32 >= ARRAY_SIZE(arr));

Weihang

> 
> Thanks
>
Leon Romanovsky Nov. 17, 2020, 7:20 a.m. UTC | #3
On Tue, Nov 17, 2020 at 06:37:58AM +0000, liweihang wrote:
> On 2020/11/16 21:47, Leon Romanovsky wrote:
> > On Mon, Nov 16, 2020 at 07:58:38PM +0800, Weihang Li wrote:
> >> From: Lang Cheng <chenglang@huawei.com>
> >>
> >> Stash is a mechanism that uses the core information carried by the ARM AXI
> >> bus to access the L3 cache. It can be used to improve the performance by
> >> increasing the hit ratio of L3 cache. CQs need to enable stash by default.
> >>
> >> Signed-off-by: Lang Cheng <chenglang@huawei.com>
> >> Signed-off-by: Weihang Li <liweihang@huawei.com>
> >> ---
> >>  drivers/infiniband/hw/hns/hns_roce_common.h | 12 +++++++++
> >>  drivers/infiniband/hw/hns/hns_roce_device.h |  1 +
> >>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +++
> >>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 39 +++++++++++++++++------------
> >>  4 files changed, 39 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
> >> index f5669ff..8d96c4e 100644
> >> --- a/drivers/infiniband/hw/hns/hns_roce_common.h
> >> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
> >> @@ -53,6 +53,18 @@
> >>  #define roce_set_bit(origin, shift, val) \
> >>  	roce_set_field((origin), (1ul << (shift)), (shift), (val))
> >>
> >> +#define FIELD_LOC(field_h, field_l) field_h, field_l
> >> +
> >> +#define _hr_reg_set(arr, field_h, field_l)                                     \
> >> +	do {                                                                   \
> >> +		BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
> >> +		BUILD_BUG_ON((field_h) / 32 >= ARRAY_SIZE(arr));               \
> >> +		(arr)[(field_h) / 32] |=                                       \
> >> +			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32));  \
> >> +	} while (0)
> >> +
> >> +#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
> >
> > I afraid that it is too much.
>
> Hi Leon,
>
> Thanks for the comments.
>
> > 1. FIELD_LOC() macro to hide two fields.
>
> Jason has suggested us to simplify the function of setting/getting bit/field in
> hns driver like IBA_SET and IBA_GET.
>
> https://patchwork.kernel.org/project/linux-rdma/patch/1589982799-28728-3-git-send-email-liweihang@huawei.com/
>
> So we try to make it easier and clearer to define a bitfield for developers.

Jason asked to use genmask and FIELD_PREP, but you invented something else.

Thanks
Weihang Li Nov. 17, 2020, 8:35 a.m. UTC | #4
On 2020/11/17 15:21, Leon Romanovsky wrote:
> On Tue, Nov 17, 2020 at 06:37:58AM +0000, liweihang wrote:
>> On 2020/11/16 21:47, Leon Romanovsky wrote:
>>> On Mon, Nov 16, 2020 at 07:58:38PM +0800, Weihang Li wrote:
>>>> From: Lang Cheng <chenglang@huawei.com>
>>>>
>>>> Stash is a mechanism that uses the core information carried by the ARM AXI
>>>> bus to access the L3 cache. It can be used to improve the performance by
>>>> increasing the hit ratio of L3 cache. CQs need to enable stash by default.
>>>>
>>>> Signed-off-by: Lang Cheng <chenglang@huawei.com>
>>>> Signed-off-by: Weihang Li <liweihang@huawei.com>
>>>> ---
>>>>  drivers/infiniband/hw/hns/hns_roce_common.h | 12 +++++++++
>>>>  drivers/infiniband/hw/hns/hns_roce_device.h |  1 +
>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +++
>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 39 +++++++++++++++++------------
>>>>  4 files changed, 39 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
>>>> index f5669ff..8d96c4e 100644
>>>> --- a/drivers/infiniband/hw/hns/hns_roce_common.h
>>>> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
>>>> @@ -53,6 +53,18 @@
>>>>  #define roce_set_bit(origin, shift, val) \
>>>>  	roce_set_field((origin), (1ul << (shift)), (shift), (val))
>>>>
>>>> +#define FIELD_LOC(field_h, field_l) field_h, field_l
>>>> +
>>>> +#define _hr_reg_set(arr, field_h, field_l)                                     \
>>>> +	do {                                                                   \
>>>> +		BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
>>>> +		BUILD_BUG_ON((field_h) / 32 >= ARRAY_SIZE(arr));               \
>>>> +		(arr)[(field_h) / 32] |=                                       \
>>>> +			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32));  \
>>>> +	} while (0)
>>>> +
>>>> +#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
>>>
>>> I afraid that it is too much.
>>
>> Hi Leon,
>>
>> Thanks for the comments.
>>
>>> 1. FIELD_LOC() macro to hide two fields.
>>
>> Jason has suggested us to simplify the function of setting/getting bit/field in
>> hns driver like IBA_SET and IBA_GET.
>>
>> https://patchwork.kernel.org/project/linux-rdma/patch/1589982799-28728-3-git-send-email-liweihang@huawei.com/
>>
>> So we try to make it easier and clearer to define a bitfield for developers.
> 
> Jason asked to use genmask and FIELD_PREP, but you invented something else.
> 
> Thanks
> 

We use them in another interface 'hr_reg_write(arr, field, val)' which hasn't been
used in this series.

Does it make any unacceptable mistake? We would appreciate any suggestions :)

Thanks
Weihang
Leon Romanovsky Nov. 17, 2020, 8:50 a.m. UTC | #5
On Tue, Nov 17, 2020 at 08:35:55AM +0000, liweihang wrote:
> On 2020/11/17 15:21, Leon Romanovsky wrote:
> > On Tue, Nov 17, 2020 at 06:37:58AM +0000, liweihang wrote:
> >> On 2020/11/16 21:47, Leon Romanovsky wrote:
> >>> On Mon, Nov 16, 2020 at 07:58:38PM +0800, Weihang Li wrote:
> >>>> From: Lang Cheng <chenglang@huawei.com>
> >>>>
> >>>> Stash is a mechanism that uses the core information carried by the ARM AXI
> >>>> bus to access the L3 cache. It can be used to improve the performance by
> >>>> increasing the hit ratio of L3 cache. CQs need to enable stash by default.
> >>>>
> >>>> Signed-off-by: Lang Cheng <chenglang@huawei.com>
> >>>> Signed-off-by: Weihang Li <liweihang@huawei.com>
> >>>> ---
> >>>>  drivers/infiniband/hw/hns/hns_roce_common.h | 12 +++++++++
> >>>>  drivers/infiniband/hw/hns/hns_roce_device.h |  1 +
> >>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +++
> >>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 39 +++++++++++++++++------------
> >>>>  4 files changed, 39 insertions(+), 16 deletions(-)
> >>>>
> >>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
> >>>> index f5669ff..8d96c4e 100644
> >>>> --- a/drivers/infiniband/hw/hns/hns_roce_common.h
> >>>> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
> >>>> @@ -53,6 +53,18 @@
> >>>>  #define roce_set_bit(origin, shift, val) \
> >>>>  	roce_set_field((origin), (1ul << (shift)), (shift), (val))
> >>>>
> >>>> +#define FIELD_LOC(field_h, field_l) field_h, field_l
> >>>> +
> >>>> +#define _hr_reg_set(arr, field_h, field_l)                                     \
> >>>> +	do {                                                                   \
> >>>> +		BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
> >>>> +		BUILD_BUG_ON((field_h) / 32 >= ARRAY_SIZE(arr));               \
> >>>> +		(arr)[(field_h) / 32] |=                                       \
> >>>> +			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32));  \
> >>>> +	} while (0)
> >>>> +
> >>>> +#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
> >>>
> >>> I afraid that it is too much.
> >>
> >> Hi Leon,
> >>
> >> Thanks for the comments.
> >>
> >>> 1. FIELD_LOC() macro to hide two fields.
> >>
> >> Jason has suggested us to simplify the function of setting/getting bit/field in
> >> hns driver like IBA_SET and IBA_GET.
> >>
> >> https://patchwork.kernel.org/project/linux-rdma/patch/1589982799-28728-3-git-send-email-liweihang@huawei.com/
> >>
> >> So we try to make it easier and clearer to define a bitfield for developers.
> >
> > Jason asked to use genmask and FIELD_PREP, but you invented something else.
> >
> > Thanks
> >
>
> We use them in another interface 'hr_reg_write(arr, field, val)' which hasn't been
> used in this series.
>
> Does it make any unacceptable mistake? We would appreciate any suggestions :)

The invention of FIELD_LOC() and hr_reg_set equal to __hr_reg_set are unacceptable.
Pass directly your field_h and field_l to hr_reg_set().

Thanks


>
> Thanks
> Weihang
>
>
Weihang Li Nov. 18, 2020, 10:49 a.m. UTC | #6
On 2020/11/17 16:50, Leon Romanovsky wrote:
> On Tue, Nov 17, 2020 at 08:35:55AM +0000, liweihang wrote:
>> On 2020/11/17 15:21, Leon Romanovsky wrote:
>>> On Tue, Nov 17, 2020 at 06:37:58AM +0000, liweihang wrote:
>>>> On 2020/11/16 21:47, Leon Romanovsky wrote:
>>>>> On Mon, Nov 16, 2020 at 07:58:38PM +0800, Weihang Li wrote:
>>>>>> From: Lang Cheng <chenglang@huawei.com>
>>>>>>
>>>>>> Stash is a mechanism that uses the core information carried by the ARM AXI
>>>>>> bus to access the L3 cache. It can be used to improve the performance by
>>>>>> increasing the hit ratio of L3 cache. CQs need to enable stash by default.
>>>>>>
>>>>>> Signed-off-by: Lang Cheng <chenglang@huawei.com>
>>>>>> Signed-off-by: Weihang Li <liweihang@huawei.com>
>>>>>> ---
>>>>>>  drivers/infiniband/hw/hns/hns_roce_common.h | 12 +++++++++
>>>>>>  drivers/infiniband/hw/hns/hns_roce_device.h |  1 +
>>>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +++
>>>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 39 +++++++++++++++++------------
>>>>>>  4 files changed, 39 insertions(+), 16 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
>>>>>> index f5669ff..8d96c4e 100644
>>>>>> --- a/drivers/infiniband/hw/hns/hns_roce_common.h
>>>>>> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
>>>>>> @@ -53,6 +53,18 @@
>>>>>>  #define roce_set_bit(origin, shift, val) \
>>>>>>  	roce_set_field((origin), (1ul << (shift)), (shift), (val))
>>>>>>
>>>>>> +#define FIELD_LOC(field_h, field_l) field_h, field_l
>>>>>> +
>>>>>> +#define _hr_reg_set(arr, field_h, field_l)                                     \
>>>>>> +	do {                                                                   \
>>>>>> +		BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
>>>>>> +		BUILD_BUG_ON((field_h) / 32 >= ARRAY_SIZE(arr));               \
>>>>>> +		(arr)[(field_h) / 32] |=                                       \
>>>>>> +			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32));  \
>>>>>> +	} while (0)
>>>>>> +
>>>>>> +#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
>>>>>
>>>>> I afraid that it is too much.
>>>>
>>>> Hi Leon,
>>>>
>>>> Thanks for the comments.
>>>>
>>>>> 1. FIELD_LOC() macro to hide two fields.
>>>>
>>>> Jason has suggested us to simplify the function of setting/getting bit/field in
>>>> hns driver like IBA_SET and IBA_GET.
>>>>
>>>> https://patchwork.kernel.org/project/linux-rdma/patch/1589982799-28728-3-git-send-email-liweihang@huawei.com/
>>>>
>>>> So we try to make it easier and clearer to define a bitfield for developers.
>>>
>>> Jason asked to use genmask and FIELD_PREP, but you invented something else.
>>>
>>> Thanks
>>>
>>
>> We use them in another interface 'hr_reg_write(arr, field, val)' which hasn't been
>> used in this series.
>>
>> Does it make any unacceptable mistake? We would appreciate any suggestions :)
> 
> The invention of FIELD_LOC() and hr_reg_set equal to __hr_reg_set are unacceptable.
> Pass directly your field_h and field_l to hr_reg_set().
> 
> Thanks
> 

Hi Leon,

We let hr_reg_set equal() to __hr_reg_set() because if not, there will be a compile error:

../hns_roce_hw_v2.c:4566:41: error: macro "_hr_reg_set" requires 3 arguments, but only 2 given
_hr_reg_set(cq_context->raw, CQC_STASH);

There are similar codes in iba.h, I think they are of the same reason:

	#define _IBA_SET(field_struct, field_offset, field_mask, num_bits, ptr, value) \
		({                                                                     \
			field_struct *_ptr = ptr;                                      \
			_iba_set##num_bits((void *)_ptr + (field_offset), field_mask,  \
					   FIELD_PREP(field_mask, value));             \
		})
	#define IBA_SET(field, ptr, value) _IBA_SET(field, ptr, value)

	#define IBA_FIELD64_LOC(field_struct, byte_offset)                             \
		field_struct, byte_offset, GENMASK_ULL(63, 0), 64

	#define CM_FIELD64_LOC(field_struct, byte_offset)                              \
		IBA_FIELD64_LOC(field_struct, (byte_offset + sizeof(struct ib_mad_hdr)))

	#define CM_REQ_LOCAL_CA_GUID CM_FIELD64_LOC(struct cm_req_msg, 16)

	IBA_SET(CM_REQ_LOCAL_CA_GUID, req_msg,
		be64_to_cpu(cm_id_priv->id.device->node_guid));

As I said, we want to use a single symbol to represent a field to make it
easier and clearer to define and set a bitfield for developers.

Let's compare the following implementations:

	#define _hr_reg_set(arr, field_h, field_l) \
		(arr)[(field_h) / 32] |= \
			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32)) + \
			BUILD_BUG_ON_ZERO(((field_h) / 32) != ((field_l) / 32)) + \
			BUILD_BUG_ON_ZERO((field_h) / 32 >= ARRAY_SIZE(arr))

1)
	#define hr_reg_set(arr, field) _hr_reg_set(arr, field)

	#define QPCEX_PASID_EN FIELD_LOC(111, 95)
	hr_reg_set(context->ext, QPCEX_PASID_EN);

In this way, we just need to define a single symbol and use it easily.

2)
	#define QPCEX_PASID_EN_H 111
	#define QPCEX_PASID_EN_L 95
	_hr_reg_set(context->ext, QPCEX_PASID_EN_H, QPCEX_PASID_EN_L);

We have to define and pass 2 symbols.

3)
	#define QPCEX_PASID_EN_H 111
	#define QPCEX_PASID_EN 95
	#define hr_reg_set(arr, field) _hr_reg_set(arr, field, field##_H)
	hr_reg_set(context->ext, QPCEX_PASID_EN);

We have to define 2 symbols and use 1 symbols when setting field.

4)
	#define _hr_reg_set(arr, field_h, field_l) \
		...

	#define QPCEX_PASID_EN_M GENMASK(111, 95) //mask
	#define QPCEX_PASID_EN 95 // shift
	#define hr_reg_set(arr, field) _hr_reg_set(arr, field, field##_M)
	hr_reg_set(context->ext, QPCEX_PASID_EN);

We use mask and shift, similar with the way #3.

I think the first way is the best, and it doesn't seem to have any side effects.

Thanks
Weihang
Jason Gunthorpe Nov. 18, 2020, 8:07 p.m. UTC | #7
On Wed, Nov 18, 2020 at 10:49:14AM +0000, liweihang wrote:
> On 2020/11/17 16:50, Leon Romanovsky wrote:
> > On Tue, Nov 17, 2020 at 08:35:55AM +0000, liweihang wrote:
> >> On 2020/11/17 15:21, Leon Romanovsky wrote:
> >>> On Tue, Nov 17, 2020 at 06:37:58AM +0000, liweihang wrote:
> >>>> On 2020/11/16 21:47, Leon Romanovsky wrote:
> >>>>> On Mon, Nov 16, 2020 at 07:58:38PM +0800, Weihang Li wrote:
> >>>>>> From: Lang Cheng <chenglang@huawei.com>
> >>>>>>
> >>>>>> Stash is a mechanism that uses the core information carried by the ARM AXI
> >>>>>> bus to access the L3 cache. It can be used to improve the performance by
> >>>>>> increasing the hit ratio of L3 cache. CQs need to enable stash by default.
> >>>>>>
> >>>>>> Signed-off-by: Lang Cheng <chenglang@huawei.com>
> >>>>>> Signed-off-by: Weihang Li <liweihang@huawei.com>
> >>>>>>  drivers/infiniband/hw/hns/hns_roce_common.h | 12 +++++++++
> >>>>>>  drivers/infiniband/hw/hns/hns_roce_device.h |  1 +
> >>>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +++
> >>>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 39 +++++++++++++++++------------
> >>>>>>  4 files changed, 39 insertions(+), 16 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
> >>>>>> index f5669ff..8d96c4e 100644
> >>>>>> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
> >>>>>> @@ -53,6 +53,18 @@
> >>>>>>  #define roce_set_bit(origin, shift, val) \
> >>>>>>  	roce_set_field((origin), (1ul << (shift)), (shift), (val))
> >>>>>>
> >>>>>> +#define FIELD_LOC(field_h, field_l) field_h, field_l
> >>>>>> +
> >>>>>> +#define _hr_reg_set(arr, field_h, field_l)                                     \
> >>>>>> +	do {                                                                   \
> >>>>>> +		BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
> >>>>>> +		BUILD_BUG_ON((field_h) / 32 >= ARRAY_SIZE(arr));               \
> >>>>>> +		(arr)[(field_h) / 32] |=                                       \
> >>>>>> +			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32));  \
> >>>>>> +	} while (0)
> >>>>>> +
> >>>>>> +#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
> >>>>>
> >>>>> I afraid that it is too much.
> >>>>
> >>>> Hi Leon,
> >>>>
> >>>> Thanks for the comments.
> >>>>
> >>>>> 1. FIELD_LOC() macro to hide two fields.
> >>>>
> >>>> Jason has suggested us to simplify the function of setting/getting bit/field in
> >>>> hns driver like IBA_SET and IBA_GET.
> >>>>
> >>>> https://patchwork.kernel.org/project/linux-rdma/patch/1589982799-28728-3-git-send-email-liweihang@huawei.com/
> >>>>
> >>>> So we try to make it easier and clearer to define a bitfield for developers.
> >>>
> >>> Jason asked to use genmask and FIELD_PREP, but you invented something else.
> >>>
> >>> Thanks
> >>>
> >>
> >> We use them in another interface 'hr_reg_write(arr, field, val)' which hasn't been
> >> used in this series.
> >>
> >> Does it make any unacceptable mistake? We would appreciate any suggestions :)
> > 
> > The invention of FIELD_LOC() and hr_reg_set equal to __hr_reg_set are unacceptable.
> > Pass directly your field_h and field_l to hr_reg_set().
> > 
> > Thanks
> > 
> 
> Hi Leon,
> 
> We let hr_reg_set equal() to __hr_reg_set() because if not, there will be a compile error:
> 
> .../hns_roce_hw_v2.c:4566:41: error: macro "_hr_reg_set" requires 3 arguments, but only 2 given
> _hr_reg_set(cq_context->raw, CQC_STASH);

Yes, it is very un-intuitive but the rules for CPP require the extra
macro pass to generate the correct expansion. Otherwise cpp will try
to pass the value with commas in as a single argument, what we need
here is to expand the commads first and have them break up into macro
arguments as it goes down the macro chain.

> Let's compare the following implementations:
> 
> 	#define _hr_reg_set(arr, field_h, field_l) \
> 		(arr)[(field_h) / 32] |= \
> 			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32)) + \
> 			BUILD_BUG_ON_ZERO(((field_h) / 32) != ((field_l) / 32)) + \
> 			BUILD_BUG_ON_ZERO((field_h) / 32 >= ARRAY_SIZE(arr))
> 
> 1)
> 	#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
> 
> 	#define QPCEX_PASID_EN FIELD_LOC(111, 95)
> 	hr_reg_set(context->ext, QPCEX_PASID_EN);

It is also weird that something called set can only write all ones to
the field.

It feels saner to have a set that accepts a value and if the all ones
case is something very common then use a macro to compute it from the
field name.

Jason
Weihang Li Nov. 20, 2020, 9:01 a.m. UTC | #8
On 2020/11/19 4:08, Jason Gunthorpe wrote:
> On Wed, Nov 18, 2020 at 10:49:14AM +0000, liweihang wrote:
>> On 2020/11/17 16:50, Leon Romanovsky wrote:
>>> On Tue, Nov 17, 2020 at 08:35:55AM +0000, liweihang wrote:
>>>> On 2020/11/17 15:21, Leon Romanovsky wrote:
>>>>> On Tue, Nov 17, 2020 at 06:37:58AM +0000, liweihang wrote:
>>>>>> On 2020/11/16 21:47, Leon Romanovsky wrote:
>>>>>>> On Mon, Nov 16, 2020 at 07:58:38PM +0800, Weihang Li wrote:
>>>>>>>> From: Lang Cheng <chenglang@huawei.com>
>>>>>>>>
>>>>>>>> Stash is a mechanism that uses the core information carried by the ARM AXI
>>>>>>>> bus to access the L3 cache. It can be used to improve the performance by
>>>>>>>> increasing the hit ratio of L3 cache. CQs need to enable stash by default.
>>>>>>>>
>>>>>>>> Signed-off-by: Lang Cheng <chenglang@huawei.com>
>>>>>>>> Signed-off-by: Weihang Li <liweihang@huawei.com>
>>>>>>>>  drivers/infiniband/hw/hns/hns_roce_common.h | 12 +++++++++
>>>>>>>>  drivers/infiniband/hw/hns/hns_roce_device.h |  1 +
>>>>>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +++
>>>>>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 39 +++++++++++++++++------------
>>>>>>>>  4 files changed, 39 insertions(+), 16 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
>>>>>>>> index f5669ff..8d96c4e 100644
>>>>>>>> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
>>>>>>>> @@ -53,6 +53,18 @@
>>>>>>>>  #define roce_set_bit(origin, shift, val) \
>>>>>>>>  	roce_set_field((origin), (1ul << (shift)), (shift), (val))
>>>>>>>>
>>>>>>>> +#define FIELD_LOC(field_h, field_l) field_h, field_l
>>>>>>>> +
>>>>>>>> +#define _hr_reg_set(arr, field_h, field_l)                                     \
>>>>>>>> +	do {                                                                   \
>>>>>>>> +		BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
>>>>>>>> +		BUILD_BUG_ON((field_h) / 32 >= ARRAY_SIZE(arr));               \
>>>>>>>> +		(arr)[(field_h) / 32] |=                                       \
>>>>>>>> +			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32));  \
>>>>>>>> +	} while (0)
>>>>>>>> +
>>>>>>>> +#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
>>>>>>>
>>>>>>> I afraid that it is too much.
>>>>>>
>>>>>> Hi Leon,
>>>>>>
>>>>>> Thanks for the comments.
>>>>>>
>>>>>>> 1. FIELD_LOC() macro to hide two fields.
>>>>>>
>>>>>> Jason has suggested us to simplify the function of setting/getting bit/field in
>>>>>> hns driver like IBA_SET and IBA_GET.
>>>>>>
>>>>>> https://patchwork.kernel.org/project/linux-rdma/patch/1589982799-28728-3-git-send-email-liweihang@huawei.com/
>>>>>>
>>>>>> So we try to make it easier and clearer to define a bitfield for developers.
>>>>>
>>>>> Jason asked to use genmask and FIELD_PREP, but you invented something else.
>>>>>
>>>>> Thanks
>>>>>
>>>>
>>>> We use them in another interface 'hr_reg_write(arr, field, val)' which hasn't been
>>>> used in this series.
>>>>
>>>> Does it make any unacceptable mistake? We would appreciate any suggestions :)
>>>
>>> The invention of FIELD_LOC() and hr_reg_set equal to __hr_reg_set are unacceptable.
>>> Pass directly your field_h and field_l to hr_reg_set().
>>>
>>> Thanks
>>>
>>
>> Hi Leon,
>>
>> We let hr_reg_set equal() to __hr_reg_set() because if not, there will be a compile error:
>>
>> .../hns_roce_hw_v2.c:4566:41: error: macro "_hr_reg_set" requires 3 arguments, but only 2 given
>> _hr_reg_set(cq_context->raw, CQC_STASH);
> 
> Yes, it is very un-intuitive but the rules for CPP require the extra
> macro pass to generate the correct expansion. Otherwise cpp will try
> to pass the value with commas in as a single argument, what we need
> here is to expand the commads first and have them break up into macro
> arguments as it goes down the macro chain.
> 

Thanks for your explanation :)

>> Let's compare the following implementations:
>>
>> 	#define _hr_reg_set(arr, field_h, field_l) \
>> 		(arr)[(field_h) / 32] |= \
>> 			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32)) + \
>> 			BUILD_BUG_ON_ZERO(((field_h) / 32) != ((field_l) / 32)) + \
>> 			BUILD_BUG_ON_ZERO((field_h) / 32 >= ARRAY_SIZE(arr))
>>
>> 1)
>> 	#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
>>
>> 	#define QPCEX_PASID_EN FIELD_LOC(111, 95)
>> 	hr_reg_set(context->ext, QPCEX_PASID_EN);
> 
> It is also weird that something called set can only write all ones to
> the field.
> 
> It feels saner to have a set that accepts a value and if the all ones
> case is something very common then use a macro to compute it from the
> field name.
> 
> Jason
> 

OK, We will achieve hr_reg_enable(arr, field) for setting a single bit
and hr_reg_write(arr, field, val) for filling a value into a field.

Thanks
Weihang
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
index f5669ff..8d96c4e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -53,6 +53,18 @@ 
 #define roce_set_bit(origin, shift, val) \
 	roce_set_field((origin), (1ul << (shift)), (shift), (val))
 
+#define FIELD_LOC(field_h, field_l) field_h, field_l
+
+#define _hr_reg_set(arr, field_h, field_l)                                     \
+	do {                                                                   \
+		BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
+		BUILD_BUG_ON((field_h) / 32 >= ARRAY_SIZE(arr));               \
+		(arr)[(field_h) / 32] |=                                       \
+			cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32));  \
+	} while (0)
+
+#define hr_reg_set(arr, field) _hr_reg_set(arr, field)
+
 #define ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S 3
 #define ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S 4
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 1d99022..ab7df8e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -223,6 +223,7 @@  enum {
 	HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL		= BIT(9),
 	HNS_ROCE_CAP_FLAG_ATOMIC		= BIT(10),
 	HNS_ROCE_CAP_FLAG_SDI_MODE		= BIT(14),
+	HNS_ROCE_CAP_FLAG_STASH			= BIT(17),
 };
 
 #define HNS_ROCE_DB_TYPE_COUNT			2
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 4d697e4..5fd0458 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3177,6 +3177,9 @@  static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev,
 		       V2_CQC_BYTE_8_CQE_SIZE_S, hr_cq->cqe_size ==
 		       HNS_ROCE_V3_CQE_SIZE ? 1 : 0);
 
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_STASH)
+		hr_reg_set(cq_context->raw, CQC_STASH);
+
 	cq_context->cqe_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[0]));
 
 	roce_set_field(cq_context->byte_16_hop_addr,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 1409d05..50a5187 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -267,22 +267,27 @@  enum hns_roce_sgid_type {
 };
 
 struct hns_roce_v2_cq_context {
-	__le32	byte_4_pg_ceqn;
-	__le32	byte_8_cqn;
-	__le32	cqe_cur_blk_addr;
-	__le32	byte_16_hop_addr;
-	__le32	cqe_nxt_blk_addr;
-	__le32	byte_24_pgsz_addr;
-	__le32	byte_28_cq_pi;
-	__le32	byte_32_cq_ci;
-	__le32	cqe_ba;
-	__le32	byte_40_cqe_ba;
-	__le32	byte_44_db_record;
-	__le32	db_record_addr;
-	__le32	byte_52_cqe_cnt;
-	__le32	byte_56_cqe_period_maxcnt;
-	__le32	cqe_report_timer;
-	__le32	byte_64_se_cqe_idx;
+	union {
+		struct {
+			__le32 byte_4_pg_ceqn;
+			__le32 byte_8_cqn;
+			__le32 cqe_cur_blk_addr;
+			__le32 byte_16_hop_addr;
+			__le32 cqe_nxt_blk_addr;
+			__le32 byte_24_pgsz_addr;
+			__le32 byte_28_cq_pi;
+			__le32 byte_32_cq_ci;
+			__le32 cqe_ba;
+			__le32 byte_40_cqe_ba;
+			__le32 byte_44_db_record;
+			__le32 db_record_addr;
+			__le32 byte_52_cqe_cnt;
+			__le32 byte_56_cqe_period_maxcnt;
+			__le32 cqe_report_timer;
+			__le32 byte_64_se_cqe_idx;
+		};
+		__le32 raw[16];
+	};
 };
 #define HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM 0x0
 #define HNS_ROCE_V2_CQ_DEFAULT_INTERVAL	0x0
@@ -360,6 +365,8 @@  struct hns_roce_v2_cq_context {
 #define	V2_CQC_BYTE_64_SE_CQE_IDX_S 0
 #define	V2_CQC_BYTE_64_SE_CQE_IDX_M GENMASK(23, 0)
 
+#define CQC_STASH FIELD_LOC(63, 63)
+
 struct hns_roce_srq_context {
 	__le32	byte_4_srqn_srqst;
 	__le32	byte_8_limit_wl;