
[v2,for-next,2/2] RDMA/hns: Support userspace configuring congestion control algorithm with QP granularity

Message ID: 20240208035038.94668-3-huangjunxian6@hisilicon.com (mailing list archive)
State: Changes Requested
Series: RDMA/hns: Support configuring congestion control algorithm with QP granularity

Commit Message

Junxian Huang Feb. 8, 2024, 3:50 a.m. UTC
Support userspace configuring congestion control algorithm with
QP granularity. If the algorithm is not specified in userspace,
use the default one.

Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 14 +---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  3 +-
 drivers/infiniband/hw/hns/hns_roce_main.c   |  3 +
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 71 +++++++++++++++++++++
 include/uapi/rdma/hns-abi.h                 | 17 +++++
 6 files changed, 112 insertions(+), 19 deletions(-)
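
For context, the userspace side of this ABI amounts to filling the new fields of struct hns_roce_ib_create_qp. A minimal sketch (field and flag names are taken from this patch; the surrounding provider plumbing, normally the hns provider in rdma-core, is omitted):

    /* Request the LDCP algorithm for a single QP via the new create-QP fields. */
    struct hns_roce_ib_create_qp ucmd = {};

    ucmd.comp_mask |= HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE;
    ucmd.cong_type_flags = HNS_ROCE_CREATE_QP_FLAGS_LDCP;
    /* buf_addr, db_addr, log_sq_bb_count, ... are filled as before and the
     * command reaches the kernel through the usual create-QP path.
     */

If comp_mask does not carry HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE, the kernel falls back to the device default (caps.default_cong_type), or to DCQCN for UD/GSI QPs.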

Comments

Jason Gunthorpe Feb. 21, 2024, 3:52 p.m. UTC | #1
On Thu, Feb 08, 2024 at 11:50:38AM +0800, Junxian Huang wrote:
> Support userspace configuring congestion control algorithm with
> QP granularity. If the algorithm is not specified in userspace,
> use the default one.
> 
> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
> ---
>  drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 14 +---
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  3 +-
>  drivers/infiniband/hw/hns/hns_roce_main.c   |  3 +
>  drivers/infiniband/hw/hns/hns_roce_qp.c     | 71 +++++++++++++++++++++
>  include/uapi/rdma/hns-abi.h                 | 17 +++++
>  6 files changed, 112 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
> index c88ba7e053bf..55f2f54e15fb 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
> @@ -594,11 +594,19 @@ struct hns_roce_work {
>  	u32 queue_num;
>  };
>  
> -enum cong_type {
> -	CONG_TYPE_DCQCN,
> -	CONG_TYPE_LDCP,
> -	CONG_TYPE_HC3,
> -	CONG_TYPE_DIP,
> +enum hns_roce_scc_algo {
> +	HNS_ROCE_SCC_ALGO_DCQCN = 0,
> +	HNS_ROCE_SCC_ALGO_LDCP,
> +	HNS_ROCE_SCC_ALGO_HC3,
> +	HNS_ROCE_SCC_ALGO_DIP,
> +	HNS_ROCE_SCC_ALGO_TOTAL,
> +};
> +
> +enum hns_roce_cong_type {
> +	CONG_TYPE_DCQCN = 1 << HNS_ROCE_SCC_ALGO_DCQCN,
> +	CONG_TYPE_LDCP = 1 << HNS_ROCE_SCC_ALGO_LDCP,
> +	CONG_TYPE_HC3 = 1 << HNS_ROCE_SCC_ALGO_HC3,
> +	CONG_TYPE_DIP = 1 << HNS_ROCE_SCC_ALGO_DIP,
>  };
>  
>  struct hns_roce_qp {
> @@ -644,7 +652,7 @@ struct hns_roce_qp {
>  	struct list_head	sq_node; /* all send qps are on a list */
>  	struct hns_user_mmap_entry *dwqe_mmap_entry;
>  	u32			config;
> -	enum cong_type		cong_type;
> +	enum hns_roce_cong_type	cong_type;
>  };
>  
>  struct hns_roce_ib_iboe {
> @@ -845,7 +853,8 @@ struct hns_roce_caps {
>  	u16		default_aeq_period;
>  	u16		default_aeq_arm_st;
>  	u16		default_ceq_arm_st;
> -	enum cong_type	cong_type;
> +	u8		cong_cap;
> +	enum hns_roce_cong_type	default_cong_type;
>  };
>  
>  enum hns_roce_device_state {
> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> index 42e28586cefa..21532f213b0f 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> @@ -2209,11 +2209,12 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
>  	caps->max_wqes = 1 << le16_to_cpu(resp_c->sq_depth);
>  
>  	caps->num_srqs = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_SRQS);
> -	caps->cong_type = hr_reg_read(resp_d, PF_CAPS_D_CONG_TYPE);
> +	caps->cong_cap = hr_reg_read(resp_d, PF_CAPS_D_CONG_CAP);
>  	caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth);
>  	caps->ceqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_CEQ_DEPTH);
>  	caps->num_comp_vectors = hr_reg_read(resp_d, PF_CAPS_D_NUM_CEQS);
>  	caps->aeqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_AEQ_DEPTH);
> +	caps->default_cong_type = hr_reg_read(resp_d, PF_CAPS_D_DEFAULT_ALG);
>  	caps->reserved_pds = hr_reg_read(resp_d, PF_CAPS_D_RSV_PDS);
>  	caps->num_uars = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_UARS);
>  	caps->reserved_qps = hr_reg_read(resp_d, PF_CAPS_D_RSV_QPS);
> @@ -4737,14 +4738,8 @@ enum {
>  static int check_cong_type(struct ib_qp *ibqp,
>  			   struct hns_roce_congestion_algorithm *cong_alg)
>  {
> -	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
>  	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
>  
> -	if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_GSI)
> -		hr_qp->cong_type = CONG_TYPE_DCQCN;
> -	else
> -		hr_qp->cong_type = hr_dev->caps.cong_type;
> -
>  	/* different congestion types match different configurations */
>  	switch (hr_qp->cong_type) {
>  	case CONG_TYPE_DCQCN:
> @@ -4772,9 +4767,6 @@ static int check_cong_type(struct ib_qp *ibqp,
>  		cong_alg->wnd_mode_sel = WND_LIMIT;
>  		break;
>  	default:
> -		ibdev_warn(&hr_dev->ib_dev,
> -			   "invalid type(%u) for congestion selection.\n",
> -			   hr_qp->cong_type);
>  		hr_qp->cong_type = CONG_TYPE_DCQCN;
>  		cong_alg->alg_sel = CONG_DCQCN;
>  		cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL;
> @@ -4807,7 +4799,7 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
>  		return ret;
>  
>  	hr_reg_write(context, QPC_CONG_ALGO_TMPL_ID, hr_dev->cong_algo_tmpl_id +
> -		     hr_qp->cong_type * HNS_ROCE_CONG_SIZE);
> +		     ilog2(hr_qp->cong_type) * HNS_ROCE_CONG_SIZE);
>  	hr_reg_clear(qpc_mask, QPC_CONG_ALGO_TMPL_ID);
>  	hr_reg_write(&context->ext, QPCEX_CONG_ALG_SEL, cong_field.alg_sel);
>  	hr_reg_clear(&qpc_mask->ext, QPCEX_CONG_ALG_SEL);
> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
> index cd97cbee682a..359a74672ba1 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
> @@ -1214,12 +1214,13 @@ struct hns_roce_query_pf_caps_d {
>  #define PF_CAPS_D_RQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(21, 20)
>  #define PF_CAPS_D_EX_SGE_HOP_NUM PF_CAPS_D_FIELD_LOC(23, 22)
>  #define PF_CAPS_D_SQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(25, 24)
> -#define PF_CAPS_D_CONG_TYPE PF_CAPS_D_FIELD_LOC(29, 26)
> +#define PF_CAPS_D_CONG_CAP PF_CAPS_D_FIELD_LOC(29, 26)
>  #define PF_CAPS_D_CEQ_DEPTH PF_CAPS_D_FIELD_LOC(85, 64)
>  #define PF_CAPS_D_NUM_CEQS PF_CAPS_D_FIELD_LOC(95, 86)
>  #define PF_CAPS_D_AEQ_DEPTH PF_CAPS_D_FIELD_LOC(117, 96)
>  #define PF_CAPS_D_AEQ_ARM_ST PF_CAPS_D_FIELD_LOC(119, 118)
>  #define PF_CAPS_D_CEQ_ARM_ST PF_CAPS_D_FIELD_LOC(121, 120)
> +#define PF_CAPS_D_DEFAULT_ALG PF_CAPS_D_FIELD_LOC(127, 122)
>  #define PF_CAPS_D_RSV_PDS PF_CAPS_D_FIELD_LOC(147, 128)
>  #define PF_CAPS_D_NUM_UARS PF_CAPS_D_FIELD_LOC(155, 148)
>  #define PF_CAPS_D_RSV_QPS PF_CAPS_D_FIELD_LOC(179, 160)
> diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
> index b55fe6911f9f..e5b678814f58 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_main.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_main.c
> @@ -394,6 +394,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
>  			resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS;
>  	}
>  
> +	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
> +		resp.congest_type  = hr_dev->caps.cong_cap;
> +
>  	ret = hns_roce_uar_alloc(hr_dev, &context->uar);
>  	if (ret)
>  		goto error_out;
> diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
> index 31b147210688..e22911d6b6a9 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_qp.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
> @@ -1004,6 +1004,70 @@ static void free_kernel_wrid(struct hns_roce_qp *hr_qp)
>  	kfree(hr_qp->sq.wrid);
>  }
>  
> +static void default_congest_type(struct hns_roce_dev *hr_dev,
> +				 struct hns_roce_qp *hr_qp)
> +{
> +	struct hns_roce_caps *caps = &hr_dev->caps;
> +
> +	if (hr_qp->ibqp.qp_type == IB_QPT_UD ||
> +	    hr_qp->ibqp.qp_type == IB_QPT_GSI)
> +		hr_qp->cong_type = CONG_TYPE_DCQCN;
> +	else
> +		hr_qp->cong_type = 1 << caps->default_cong_type;
> +}
> +
> +static int set_congest_type(struct hns_roce_qp *hr_qp,
> +			    struct hns_roce_ib_create_qp *ucmd)
> +{
> +	struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device);
> +
> +	switch (ucmd->cong_type_flags) {
> +	case HNS_ROCE_CREATE_QP_FLAGS_DCQCN:
> +		hr_qp->cong_type = CONG_TYPE_DCQCN;
> +		break;
> +	case HNS_ROCE_CREATE_QP_FLAGS_LDCP:
> +		hr_qp->cong_type = CONG_TYPE_LDCP;
> +		break;
> +	case HNS_ROCE_CREATE_QP_FLAGS_HC3:
> +		hr_qp->cong_type = CONG_TYPE_HC3;
> +		break;
> +	case HNS_ROCE_CREATE_QP_FLAGS_DIP:
> +		hr_qp->cong_type = CONG_TYPE_DIP;
> +		break;
> +	default:
> +		hr_qp->cong_type = 0;
> +	}
> +
> +	if (!(hr_qp->cong_type & hr_dev->caps.cong_cap)) {
> +		ibdev_err_ratelimited(&hr_dev->ib_dev,
> +				      "Unsupported congest type 0x%x, cong_cap = 0x%x.\n",
> +				      hr_qp->cong_type, hr_dev->caps.cong_cap);
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (hr_qp->ibqp.qp_type == IB_QPT_UD &&
> +	    !(hr_qp->cong_type & CONG_TYPE_DCQCN)) {
> +		ibdev_err_ratelimited(&hr_dev->ib_dev,
> +				      "Only DCQCN supported for UD. Unsupported congest type 0x%x.\n",
> +				      hr_qp->cong_type);

Do not print kernel messages triggered by bad userspace input.

Jason

> +		return -EOPNOTSUPP;
> +	}
> +
> +	return 0;
> +}
> +
> +static int set_congest_param(struct hns_roce_dev *hr_dev,
> +			     struct hns_roce_qp *hr_qp,
> +			     struct hns_roce_ib_create_qp *ucmd)
> +{
> +	if (ucmd->comp_mask & HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE)
> +		return set_congest_type(hr_qp, ucmd);
> +
> +	default_congest_type(hr_dev, hr_qp);
> +
> +	return 0;
> +}
> +
>  static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
>  			struct ib_qp_init_attr *init_attr,
>  			struct ib_udata *udata,
> @@ -1026,6 +1090,9 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
>  		return ret;
>  	}
>  
> +	if (init_attr->qp_type == IB_QPT_XRC_TGT)
> +		default_congest_type(hr_dev, hr_qp);
> +
>  	if (udata) {
>  		ret = ib_copy_from_udata(ucmd, udata,
>  					 min(udata->inlen, sizeof(*ucmd)));
> @@ -1043,6 +1110,10 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
>  			ibdev_err(ibdev,
>  				  "failed to set user SQ size, ret = %d.\n",
>  				  ret);
> +
> +		ret = set_congest_param(hr_dev, hr_qp, ucmd);
> +		if (ret)
> +			return ret;
>  	} else {
>  		if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
>  			hr_qp->config = HNS_ROCE_EXSGE_FLAGS;
> diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
> index c996e151081e..757095a6c6fc 100644
> --- a/include/uapi/rdma/hns-abi.h
> +++ b/include/uapi/rdma/hns-abi.h
> @@ -81,6 +81,9 @@ struct hns_roce_ib_create_qp {
>  	__u8    sq_no_prefetch;
>  	__u8    reserved[5];
>  	__aligned_u64 sdb_addr;
> +	__aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
> +	__aligned_u64 create_flags;
> +	__aligned_u64 cong_type_flags;
>  };
>  
>  enum hns_roce_qp_cap_flags {
> @@ -107,6 +110,17 @@ enum {
>  	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
>  };
>  
> +enum hns_roce_congest_type_flags {
> +	HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
> +	HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
> +	HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
> +	HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
> +};

Why are these bit flags if they are exclusive?

> +
> +enum hns_roce_create_qp_comp_mask {
> +	HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,

Why 1<<1 not 1<<0?

These are in the wrong order in the file too, they should be before
their first "use", ie move above struct hns_roce_ib_create_qp

> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
>  	__u32	reserved;
>  	__u32	config;
>  	__u32	max_inline_data;
> +	__u8	reserved0;
> +	__u8	congest_type;

Why this layout?

Jason
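
Following up on the point above about not logging on bad userspace input (which the author agrees to below), a minimal sketch of how set_congest_type() could look with the messages dropped (illustrative only; the actual next revision may differ):

    static int set_congest_type(struct hns_roce_qp *hr_qp,
                                struct hns_roce_ib_create_qp *ucmd)
    {
        struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device);

        /* Map the exclusive userspace selector to the internal bitflag. */
        switch (ucmd->cong_type_flags) {
        case HNS_ROCE_CREATE_QP_FLAGS_DCQCN:
            hr_qp->cong_type = CONG_TYPE_DCQCN;
            break;
        case HNS_ROCE_CREATE_QP_FLAGS_LDCP:
            hr_qp->cong_type = CONG_TYPE_LDCP;
            break;
        case HNS_ROCE_CREATE_QP_FLAGS_HC3:
            hr_qp->cong_type = CONG_TYPE_HC3;
            break;
        case HNS_ROCE_CREATE_QP_FLAGS_DIP:
            hr_qp->cong_type = CONG_TYPE_DIP;
            break;
        default:
            hr_qp->cong_type = 0;
        }

        /* Reject unknown selectors, algorithms the device does not advertise,
         * and non-DCQCN choices for UD QPs silently: bad userspace input
         * should not generate kernel log messages.
         */
        if (!(hr_qp->cong_type & hr_dev->caps.cong_cap))
            return -EOPNOTSUPP;

        if (hr_qp->ibqp.qp_type == IB_QPT_UD &&
            !(hr_qp->cong_type & CONG_TYPE_DCQCN))
            return -EOPNOTSUPP;

        return 0;
    }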
Junxian Huang Feb. 22, 2024, 7:06 a.m. UTC | #2
On 2024/2/21 23:52, Jason Gunthorpe wrote:
> On Thu, Feb 08, 2024 at 11:50:38AM +0800, Junxian Huang wrote:
>> Support userspace configuring congestion control algorithm with
>> QP granularity. If the algorithm is not specified in userspace,
>> use the default one.
>>
>> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
>> ---
>>  drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 14 +---
>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  3 +-
>>  drivers/infiniband/hw/hns/hns_roce_main.c   |  3 +
>>  drivers/infiniband/hw/hns/hns_roce_qp.c     | 71 +++++++++++++++++++++
>>  include/uapi/rdma/hns-abi.h                 | 17 +++++
>>  6 files changed, 112 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
>> index c88ba7e053bf..55f2f54e15fb 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
>> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
>> @@ -594,11 +594,19 @@ struct hns_roce_work {
>>  	u32 queue_num;
>>  };
>>  
>> -enum cong_type {
>> -	CONG_TYPE_DCQCN,
>> -	CONG_TYPE_LDCP,
>> -	CONG_TYPE_HC3,
>> -	CONG_TYPE_DIP,
>> +enum hns_roce_scc_algo {
>> +	HNS_ROCE_SCC_ALGO_DCQCN = 0,
>> +	HNS_ROCE_SCC_ALGO_LDCP,
>> +	HNS_ROCE_SCC_ALGO_HC3,
>> +	HNS_ROCE_SCC_ALGO_DIP,
>> +	HNS_ROCE_SCC_ALGO_TOTAL,
>> +};
>> +
>> +enum hns_roce_cong_type {
>> +	CONG_TYPE_DCQCN = 1 << HNS_ROCE_SCC_ALGO_DCQCN,
>> +	CONG_TYPE_LDCP = 1 << HNS_ROCE_SCC_ALGO_LDCP,
>> +	CONG_TYPE_HC3 = 1 << HNS_ROCE_SCC_ALGO_HC3,
>> +	CONG_TYPE_DIP = 1 << HNS_ROCE_SCC_ALGO_DIP,
>>  };
>>  
>>  struct hns_roce_qp {
>> @@ -644,7 +652,7 @@ struct hns_roce_qp {
>>  	struct list_head	sq_node; /* all send qps are on a list */
>>  	struct hns_user_mmap_entry *dwqe_mmap_entry;
>>  	u32			config;
>> -	enum cong_type		cong_type;
>> +	enum hns_roce_cong_type	cong_type;
>>  };
>>  
>>  struct hns_roce_ib_iboe {
>> @@ -845,7 +853,8 @@ struct hns_roce_caps {
>>  	u16		default_aeq_period;
>>  	u16		default_aeq_arm_st;
>>  	u16		default_ceq_arm_st;
>> -	enum cong_type	cong_type;
>> +	u8		cong_cap;
>> +	enum hns_roce_cong_type	default_cong_type;
>>  };
>>  
>>  enum hns_roce_device_state {
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> index 42e28586cefa..21532f213b0f 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> @@ -2209,11 +2209,12 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
>>  	caps->max_wqes = 1 << le16_to_cpu(resp_c->sq_depth);
>>  
>>  	caps->num_srqs = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_SRQS);
>> -	caps->cong_type = hr_reg_read(resp_d, PF_CAPS_D_CONG_TYPE);
>> +	caps->cong_cap = hr_reg_read(resp_d, PF_CAPS_D_CONG_CAP);
>>  	caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth);
>>  	caps->ceqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_CEQ_DEPTH);
>>  	caps->num_comp_vectors = hr_reg_read(resp_d, PF_CAPS_D_NUM_CEQS);
>>  	caps->aeqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_AEQ_DEPTH);
>> +	caps->default_cong_type = hr_reg_read(resp_d, PF_CAPS_D_DEFAULT_ALG);
>>  	caps->reserved_pds = hr_reg_read(resp_d, PF_CAPS_D_RSV_PDS);
>>  	caps->num_uars = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_UARS);
>>  	caps->reserved_qps = hr_reg_read(resp_d, PF_CAPS_D_RSV_QPS);
>> @@ -4737,14 +4738,8 @@ enum {
>>  static int check_cong_type(struct ib_qp *ibqp,
>>  			   struct hns_roce_congestion_algorithm *cong_alg)
>>  {
>> -	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
>>  	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
>>  
>> -	if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_GSI)
>> -		hr_qp->cong_type = CONG_TYPE_DCQCN;
>> -	else
>> -		hr_qp->cong_type = hr_dev->caps.cong_type;
>> -
>>  	/* different congestion types match different configurations */
>>  	switch (hr_qp->cong_type) {
>>  	case CONG_TYPE_DCQCN:
>> @@ -4772,9 +4767,6 @@ static int check_cong_type(struct ib_qp *ibqp,
>>  		cong_alg->wnd_mode_sel = WND_LIMIT;
>>  		break;
>>  	default:
>> -		ibdev_warn(&hr_dev->ib_dev,
>> -			   "invalid type(%u) for congestion selection.\n",
>> -			   hr_qp->cong_type);
>>  		hr_qp->cong_type = CONG_TYPE_DCQCN;
>>  		cong_alg->alg_sel = CONG_DCQCN;
>>  		cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL;
>> @@ -4807,7 +4799,7 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
>>  		return ret;
>>  
>>  	hr_reg_write(context, QPC_CONG_ALGO_TMPL_ID, hr_dev->cong_algo_tmpl_id +
>> -		     hr_qp->cong_type * HNS_ROCE_CONG_SIZE);
>> +		     ilog2(hr_qp->cong_type) * HNS_ROCE_CONG_SIZE);
>>  	hr_reg_clear(qpc_mask, QPC_CONG_ALGO_TMPL_ID);
>>  	hr_reg_write(&context->ext, QPCEX_CONG_ALG_SEL, cong_field.alg_sel);
>>  	hr_reg_clear(&qpc_mask->ext, QPCEX_CONG_ALG_SEL);
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
>> index cd97cbee682a..359a74672ba1 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
>> @@ -1214,12 +1214,13 @@ struct hns_roce_query_pf_caps_d {
>>  #define PF_CAPS_D_RQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(21, 20)
>>  #define PF_CAPS_D_EX_SGE_HOP_NUM PF_CAPS_D_FIELD_LOC(23, 22)
>>  #define PF_CAPS_D_SQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(25, 24)
>> -#define PF_CAPS_D_CONG_TYPE PF_CAPS_D_FIELD_LOC(29, 26)
>> +#define PF_CAPS_D_CONG_CAP PF_CAPS_D_FIELD_LOC(29, 26)
>>  #define PF_CAPS_D_CEQ_DEPTH PF_CAPS_D_FIELD_LOC(85, 64)
>>  #define PF_CAPS_D_NUM_CEQS PF_CAPS_D_FIELD_LOC(95, 86)
>>  #define PF_CAPS_D_AEQ_DEPTH PF_CAPS_D_FIELD_LOC(117, 96)
>>  #define PF_CAPS_D_AEQ_ARM_ST PF_CAPS_D_FIELD_LOC(119, 118)
>>  #define PF_CAPS_D_CEQ_ARM_ST PF_CAPS_D_FIELD_LOC(121, 120)
>> +#define PF_CAPS_D_DEFAULT_ALG PF_CAPS_D_FIELD_LOC(127, 122)
>>  #define PF_CAPS_D_RSV_PDS PF_CAPS_D_FIELD_LOC(147, 128)
>>  #define PF_CAPS_D_NUM_UARS PF_CAPS_D_FIELD_LOC(155, 148)
>>  #define PF_CAPS_D_RSV_QPS PF_CAPS_D_FIELD_LOC(179, 160)
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
>> index b55fe6911f9f..e5b678814f58 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_main.c
>> +++ b/drivers/infiniband/hw/hns/hns_roce_main.c
>> @@ -394,6 +394,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
>>  			resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS;
>>  	}
>>  
>> +	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
>> +		resp.congest_type  = hr_dev->caps.cong_cap;
>> +
>>  	ret = hns_roce_uar_alloc(hr_dev, &context->uar);
>>  	if (ret)
>>  		goto error_out;
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
>> index 31b147210688..e22911d6b6a9 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_qp.c
>> +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
>> @@ -1004,6 +1004,70 @@ static void free_kernel_wrid(struct hns_roce_qp *hr_qp)
>>  	kfree(hr_qp->sq.wrid);
>>  }
>>  
>> +static void default_congest_type(struct hns_roce_dev *hr_dev,
>> +				 struct hns_roce_qp *hr_qp)
>> +{
>> +	struct hns_roce_caps *caps = &hr_dev->caps;
>> +
>> +	if (hr_qp->ibqp.qp_type == IB_QPT_UD ||
>> +	    hr_qp->ibqp.qp_type == IB_QPT_GSI)
>> +		hr_qp->cong_type = CONG_TYPE_DCQCN;
>> +	else
>> +		hr_qp->cong_type = 1 << caps->default_cong_type;
>> +}
>> +
>> +static int set_congest_type(struct hns_roce_qp *hr_qp,
>> +			    struct hns_roce_ib_create_qp *ucmd)
>> +{
>> +	struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device);
>> +
>> +	switch (ucmd->cong_type_flags) {
>> +	case HNS_ROCE_CREATE_QP_FLAGS_DCQCN:
>> +		hr_qp->cong_type = CONG_TYPE_DCQCN;
>> +		break;
>> +	case HNS_ROCE_CREATE_QP_FLAGS_LDCP:
>> +		hr_qp->cong_type = CONG_TYPE_LDCP;
>> +		break;
>> +	case HNS_ROCE_CREATE_QP_FLAGS_HC3:
>> +		hr_qp->cong_type = CONG_TYPE_HC3;
>> +		break;
>> +	case HNS_ROCE_CREATE_QP_FLAGS_DIP:
>> +		hr_qp->cong_type = CONG_TYPE_DIP;
>> +		break;
>> +	default:
>> +		hr_qp->cong_type = 0;
>> +	}
>> +
>> +	if (!(hr_qp->cong_type & hr_dev->caps.cong_cap)) {
>> +		ibdev_err_ratelimited(&hr_dev->ib_dev,
>> +				      "Unsupported congest type 0x%x, cong_cap = 0x%x.\n",
>> +				      hr_qp->cong_type, hr_dev->caps.cong_cap);
>> +		return -EOPNOTSUPP;
>> +	}
>> +
>> +	if (hr_qp->ibqp.qp_type == IB_QPT_UD &&
>> +	    !(hr_qp->cong_type & CONG_TYPE_DCQCN)) {
>> +		ibdev_err_ratelimited(&hr_dev->ib_dev,
>> +				      "Only DCQCN supported for UD. Unsupported congest type 0x%x.\n",
>> +				      hr_qp->cong_type);
> 
> Do not print kernel messages triggered by bad userspace input.
> 
> Jason
> 

OK. Will remove these prints in the next version.

>> +		return -EOPNOTSUPP;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int set_congest_param(struct hns_roce_dev *hr_dev,
>> +			     struct hns_roce_qp *hr_qp,
>> +			     struct hns_roce_ib_create_qp *ucmd)
>> +{
>> +	if (ucmd->comp_mask & HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE)
>> +		return set_congest_type(hr_qp, ucmd);
>> +
>> +	default_congest_type(hr_dev, hr_qp);
>> +
>> +	return 0;
>> +}
>> +
>>  static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
>>  			struct ib_qp_init_attr *init_attr,
>>  			struct ib_udata *udata,
>> @@ -1026,6 +1090,9 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
>>  		return ret;
>>  	}
>>  
>> +	if (init_attr->qp_type == IB_QPT_XRC_TGT)
>> +		default_congest_type(hr_dev, hr_qp);
>> +
>>  	if (udata) {
>>  		ret = ib_copy_from_udata(ucmd, udata,
>>  					 min(udata->inlen, sizeof(*ucmd)));
>> @@ -1043,6 +1110,10 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
>>  			ibdev_err(ibdev,
>>  				  "failed to set user SQ size, ret = %d.\n",
>>  				  ret);
>> +
>> +		ret = set_congest_param(hr_dev, hr_qp, ucmd);
>> +		if (ret)
>> +			return ret;
>>  	} else {
>>  		if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
>>  			hr_qp->config = HNS_ROCE_EXSGE_FLAGS;
>> diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
>> index c996e151081e..757095a6c6fc 100644
>> --- a/include/uapi/rdma/hns-abi.h
>> +++ b/include/uapi/rdma/hns-abi.h
>> @@ -81,6 +81,9 @@ struct hns_roce_ib_create_qp {
>>  	__u8    sq_no_prefetch;
>>  	__u8    reserved[5];
>>  	__aligned_u64 sdb_addr;
>> +	__aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
>> +	__aligned_u64 create_flags;
>> +	__aligned_u64 cong_type_flags;
>>  };
>>  
>>  enum hns_roce_qp_cap_flags {
>> @@ -107,6 +110,17 @@ enum {
>>  	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
>>  };
>>  
>> +enum hns_roce_congest_type_flags {
>> +	HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
>> +	HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
>> +	HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
>> +	HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
>> +};
> 
> Why are these bit flags if they are exclusive?
> 

Our FW uses bit flags. Although there is no direct relationship between
FW and ABI, from the perspective of readability, bit flags are also
used consistently here in ABI.

>> +
>> +enum hns_roce_create_qp_comp_mask {
>> +	HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
> 
> Why 1<<1 not 1<<0?

This is to keep consistent with our internal ABI, there are some
features not upstream yet.

> 
> These are in the wrong order in the file too, they should be before
> their first "use", ie move above struct hns_roce_ib_create_qp
> 

OK.

>> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
>>  	__u32	reserved;
>>  	__u32	config;
>>  	__u32	max_inline_data;
>> +	__u8	reserved0;
>> +	__u8	congest_type;
> 
> Why this layout?
> > Jason

Same as the 1<<1 issue, to keep consistent with our internal ABI.

Thanks,
Junxian
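
The bitflag/index duality mentioned here also shows up on the kernel side of the patch: the firmware reports the default algorithm as an index (PF_CAPS_D_DEFAULT_ALG) while the capability field is a bitmap (PF_CAPS_D_CONG_CAP), so the driver shifts in one direction and uses ilog2() in the other. Written as hypothetical helpers (the patch open-codes both conversions), the mapping is just:

    /* Hypothetical helpers; not part of the patch. */
    static inline enum hns_roce_cong_type scc_algo_to_cong_type(u8 algo)
    {
        return 1 << algo;         /* e.g. HNS_ROCE_SCC_ALGO_HC3 -> CONG_TYPE_HC3 */
    }

    static inline u8 cong_type_to_scc_algo(enum hns_roce_cong_type cong_type)
    {
        return ilog2(cong_type);  /* used for the QPC_CONG_ALGO_TMPL_ID offset */
    }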
Leon Romanovsky Feb. 26, 2024, 8:09 a.m. UTC | #3
On Thu, Feb 22, 2024 at 03:06:20PM +0800, Junxian Huang wrote:
> 
> 
> On 2024/2/21 23:52, Jason Gunthorpe wrote:
> > On Thu, Feb 08, 2024 at 11:50:38AM +0800, Junxian Huang wrote:
> >> Support userspace configuring congestion control algorithm with
> >> QP granularity. If the algorithm is not specified in userspace,
> >> use the default one.
> >>
> >> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
> >> ---
> >>  drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
> >>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 14 +---
> >>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  3 +-
> >>  drivers/infiniband/hw/hns/hns_roce_main.c   |  3 +
> >>  drivers/infiniband/hw/hns/hns_roce_qp.c     | 71 +++++++++++++++++++++
> >>  include/uapi/rdma/hns-abi.h                 | 17 +++++
> >>  6 files changed, 112 insertions(+), 19 deletions(-)

<...>

> >> +
> >> +enum hns_roce_create_qp_comp_mask {
> >> +	HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
> > 
> > Why 1<<1 not 1<<0?
> 
> This is to keep consistent with our internal ABI, there are some
> features not upstream yet.
> 

<...>

> >> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
> >>  	__u32	reserved;
> >>  	__u32	config;
> >>  	__u32	max_inline_data;
> >> +	__u8	reserved0;
> >> +	__u8	congest_type;
> > 
> > Why this layout?
> > > Jason
> 
> Same as the 1<<1 issue, to keep consistent with our internal ABI.

We are talking about upstream kernel UAPI, there is no internal ABI here.

Please fix it.

Thanks

> 
> Thanks,
> Junxian
Junxian Huang Feb. 26, 2024, 12:57 p.m. UTC | #4
On 2024/2/26 16:09, Leon Romanovsky wrote:
> On Thu, Feb 22, 2024 at 03:06:20PM +0800, Junxian Huang wrote:
>>
>>
>> On 2024/2/21 23:52, Jason Gunthorpe wrote:
>>> On Thu, Feb 08, 2024 at 11:50:38AM +0800, Junxian Huang wrote:
>>>> Support userspace configuring congestion control algorithm with
>>>> QP granularity. If the algorithm is not specified in userspace,
>>>> use the default one.
>>>>
>>>> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
>>>> ---
>>>>  drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 14 +---
>>>>  drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  3 +-
>>>>  drivers/infiniband/hw/hns/hns_roce_main.c   |  3 +
>>>>  drivers/infiniband/hw/hns/hns_roce_qp.c     | 71 +++++++++++++++++++++
>>>>  include/uapi/rdma/hns-abi.h                 | 17 +++++
>>>>  6 files changed, 112 insertions(+), 19 deletions(-)
> 
> <...>
> 
>>>> +
>>>> +enum hns_roce_create_qp_comp_mask {
>>>> +	HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
>>>
>>> Why 1<<1 not 1<<0?
>>
>> This is to keep consistent with our internal ABI, there are some
>> features not upstream yet.
>>
> 
> <...>
> 
>>>> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
>>>>  	__u32	reserved;
>>>>  	__u32	config;
>>>>  	__u32	max_inline_data;
>>>> +	__u8	reserved0;
>>>> +	__u8	congest_type;
>>>
>>> Why this layout?
>>>> Jason
>>
>> Same as the 1<<1 issue, to keep consistent with our internal ABI.
> 
> We are talking about upstream kernel UAPI, there is no internal ABI here.
> 
> Please fix it.
> 
> Thanks
> 

Sure. Will fix it in the next version.

Thanks,
Junxian

>>
>> Thanks,
>> Junxian
Jason Gunthorpe Feb. 26, 2024, 2:06 p.m. UTC | #5
On Thu, Feb 22, 2024 at 03:06:20PM +0800, Junxian Huang wrote:
> >> +enum hns_roce_congest_type_flags {
> >> +	HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
> >> +	HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
> >> +	HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
> >> +	HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
> >> +};
> > 
> > Why are these bit flags if they are exclusive?
> > 
> 
> Our FW uses bit flags. Although there is no direct relationship between
> FW and ABI, from the perspective of readability, bit flags are also
> used consistently here in ABI.

Don't do that in uapi.

> >> +enum hns_roce_create_qp_comp_mask {
> >> +	HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
> > 
> > Why 1<<1 not 1<<0?
> 
> This is to keep consistent with our internal ABI, there are some
> features not upstream yet.

Nope, pack them tightly. Don't keep an "internal ABI"

> >> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
> >>  	__u32	reserved;
> >>  	__u32	config;
> >>  	__u32	max_inline_data;
> >> +	__u8	reserved0;
> >> +	__u8	congest_type;
> > 
> > Why this layout?
> 
> Same as the 1<<1 issue, to keep consistent with our internal ABI.

Same answer

Jason
Junxian Huang Feb. 27, 2024, 7:28 a.m. UTC | #6
Problems below will be fixed in the next version.

Thanks,
Junxian

On 2024/2/26 22:06, Jason Gunthorpe wrote:
> On Thu, Feb 22, 2024 at 03:06:20PM +0800, Junxian Huang wrote:
>>>> +enum hns_roce_congest_type_flags {
>>>> +	HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
>>>> +	HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
>>>> +	HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
>>>> +	HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
>>>> +};
>>>
>>> Why are these bit flags if they are exclusive?
>>>
>>
>> Our FW uses bit flags. Although there is no direct relationship between
>> FW and ABI, from the perspective of readability, bit flags are also
>> used consistently here in ABI.
> 
> Don't do that in uapi.
> 
>>>> +enum hns_roce_create_qp_comp_mask {
>>>> +	HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
>>>
>>> Why 1<<1 not 1<<0?
>>
>> This is to keep consistent with our internal ABI, there are some
>> features not upstream yet.
> 
> Nope, pack them tightly. Don't keep an "internal ABI"
> 
>>>> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
>>>>  	__u32	reserved;
>>>>  	__u32	config;
>>>>  	__u32	max_inline_data;
>>>> +	__u8	reserved0;
>>>> +	__u8	congest_type;
>>>
>>> Why this layout?
>>
>> Same as the 1<<1 issue, to keep consistent with our internal ABI.
> 
> Same answer
> 
> Jason
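
Pulling the review feedback together, the UAPI additions would end up tightly packed and declared before their first use. One possible shape, as a sketch only (not necessarily what the next revision merged):

    /* Declared above struct hns_roce_ib_create_qp, before their first use. */
    enum hns_roce_create_qp_comp_mask {
        HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 0,  /* packed from bit 0 */
    };

    /* Exclusive selector values rather than bit flags. */
    enum hns_roce_congest_type_flags {
        HNS_ROCE_CREATE_QP_FLAGS_DCQCN,
        HNS_ROCE_CREATE_QP_FLAGS_LDCP,
        HNS_ROCE_CREATE_QP_FLAGS_HC3,
        HNS_ROCE_CREATE_QP_FLAGS_DIP,
    };

    /* Ucontext response tail without a hole in front of congest_type. */
    struct hns_roce_ib_alloc_ucontext_resp {
        /* ... existing fields up to max_inline_data ... */
        __u32   max_inline_data;
        __u8    congest_type;
        __u8    reserved0[7];
    };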

Patch

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index c88ba7e053bf..55f2f54e15fb 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -594,11 +594,19 @@  struct hns_roce_work {
 	u32 queue_num;
 };
 
-enum cong_type {
-	CONG_TYPE_DCQCN,
-	CONG_TYPE_LDCP,
-	CONG_TYPE_HC3,
-	CONG_TYPE_DIP,
+enum hns_roce_scc_algo {
+	HNS_ROCE_SCC_ALGO_DCQCN = 0,
+	HNS_ROCE_SCC_ALGO_LDCP,
+	HNS_ROCE_SCC_ALGO_HC3,
+	HNS_ROCE_SCC_ALGO_DIP,
+	HNS_ROCE_SCC_ALGO_TOTAL,
+};
+
+enum hns_roce_cong_type {
+	CONG_TYPE_DCQCN = 1 << HNS_ROCE_SCC_ALGO_DCQCN,
+	CONG_TYPE_LDCP = 1 << HNS_ROCE_SCC_ALGO_LDCP,
+	CONG_TYPE_HC3 = 1 << HNS_ROCE_SCC_ALGO_HC3,
+	CONG_TYPE_DIP = 1 << HNS_ROCE_SCC_ALGO_DIP,
 };
 
 struct hns_roce_qp {
@@ -644,7 +652,7 @@  struct hns_roce_qp {
 	struct list_head	sq_node; /* all send qps are on a list */
 	struct hns_user_mmap_entry *dwqe_mmap_entry;
 	u32			config;
-	enum cong_type		cong_type;
+	enum hns_roce_cong_type	cong_type;
 };
 
 struct hns_roce_ib_iboe {
@@ -845,7 +853,8 @@  struct hns_roce_caps {
 	u16		default_aeq_period;
 	u16		default_aeq_arm_st;
 	u16		default_ceq_arm_st;
-	enum cong_type	cong_type;
+	u8		cong_cap;
+	enum hns_roce_cong_type	default_cong_type;
 };
 
 enum hns_roce_device_state {
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 42e28586cefa..21532f213b0f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -2209,11 +2209,12 @@  static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
 	caps->max_wqes = 1 << le16_to_cpu(resp_c->sq_depth);
 
 	caps->num_srqs = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_SRQS);
-	caps->cong_type = hr_reg_read(resp_d, PF_CAPS_D_CONG_TYPE);
+	caps->cong_cap = hr_reg_read(resp_d, PF_CAPS_D_CONG_CAP);
 	caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth);
 	caps->ceqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_CEQ_DEPTH);
 	caps->num_comp_vectors = hr_reg_read(resp_d, PF_CAPS_D_NUM_CEQS);
 	caps->aeqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_AEQ_DEPTH);
+	caps->default_cong_type = hr_reg_read(resp_d, PF_CAPS_D_DEFAULT_ALG);
 	caps->reserved_pds = hr_reg_read(resp_d, PF_CAPS_D_RSV_PDS);
 	caps->num_uars = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_UARS);
 	caps->reserved_qps = hr_reg_read(resp_d, PF_CAPS_D_RSV_QPS);
@@ -4737,14 +4738,8 @@  enum {
 static int check_cong_type(struct ib_qp *ibqp,
 			   struct hns_roce_congestion_algorithm *cong_alg)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 
-	if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_GSI)
-		hr_qp->cong_type = CONG_TYPE_DCQCN;
-	else
-		hr_qp->cong_type = hr_dev->caps.cong_type;
-
 	/* different congestion types match different configurations */
 	switch (hr_qp->cong_type) {
 	case CONG_TYPE_DCQCN:
@@ -4772,9 +4767,6 @@  static int check_cong_type(struct ib_qp *ibqp,
 		cong_alg->wnd_mode_sel = WND_LIMIT;
 		break;
 	default:
-		ibdev_warn(&hr_dev->ib_dev,
-			   "invalid type(%u) for congestion selection.\n",
-			   hr_qp->cong_type);
 		hr_qp->cong_type = CONG_TYPE_DCQCN;
 		cong_alg->alg_sel = CONG_DCQCN;
 		cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL;
@@ -4807,7 +4799,7 @@  static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
 		return ret;
 
 	hr_reg_write(context, QPC_CONG_ALGO_TMPL_ID, hr_dev->cong_algo_tmpl_id +
-		     hr_qp->cong_type * HNS_ROCE_CONG_SIZE);
+		     ilog2(hr_qp->cong_type) * HNS_ROCE_CONG_SIZE);
 	hr_reg_clear(qpc_mask, QPC_CONG_ALGO_TMPL_ID);
 	hr_reg_write(&context->ext, QPCEX_CONG_ALG_SEL, cong_field.alg_sel);
 	hr_reg_clear(&qpc_mask->ext, QPCEX_CONG_ALG_SEL);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index cd97cbee682a..359a74672ba1 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -1214,12 +1214,13 @@  struct hns_roce_query_pf_caps_d {
 #define PF_CAPS_D_RQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(21, 20)
 #define PF_CAPS_D_EX_SGE_HOP_NUM PF_CAPS_D_FIELD_LOC(23, 22)
 #define PF_CAPS_D_SQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(25, 24)
-#define PF_CAPS_D_CONG_TYPE PF_CAPS_D_FIELD_LOC(29, 26)
+#define PF_CAPS_D_CONG_CAP PF_CAPS_D_FIELD_LOC(29, 26)
 #define PF_CAPS_D_CEQ_DEPTH PF_CAPS_D_FIELD_LOC(85, 64)
 #define PF_CAPS_D_NUM_CEQS PF_CAPS_D_FIELD_LOC(95, 86)
 #define PF_CAPS_D_AEQ_DEPTH PF_CAPS_D_FIELD_LOC(117, 96)
 #define PF_CAPS_D_AEQ_ARM_ST PF_CAPS_D_FIELD_LOC(119, 118)
 #define PF_CAPS_D_CEQ_ARM_ST PF_CAPS_D_FIELD_LOC(121, 120)
+#define PF_CAPS_D_DEFAULT_ALG PF_CAPS_D_FIELD_LOC(127, 122)
 #define PF_CAPS_D_RSV_PDS PF_CAPS_D_FIELD_LOC(147, 128)
 #define PF_CAPS_D_NUM_UARS PF_CAPS_D_FIELD_LOC(155, 148)
 #define PF_CAPS_D_RSV_QPS PF_CAPS_D_FIELD_LOC(179, 160)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index b55fe6911f9f..e5b678814f58 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -394,6 +394,9 @@  static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
 			resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS;
 	}
 
+	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
+		resp.congest_type  = hr_dev->caps.cong_cap;
+
 	ret = hns_roce_uar_alloc(hr_dev, &context->uar);
 	if (ret)
 		goto error_out;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 31b147210688..e22911d6b6a9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -1004,6 +1004,70 @@  static void free_kernel_wrid(struct hns_roce_qp *hr_qp)
 	kfree(hr_qp->sq.wrid);
 }
 
+static void default_congest_type(struct hns_roce_dev *hr_dev,
+				 struct hns_roce_qp *hr_qp)
+{
+	struct hns_roce_caps *caps = &hr_dev->caps;
+
+	if (hr_qp->ibqp.qp_type == IB_QPT_UD ||
+	    hr_qp->ibqp.qp_type == IB_QPT_GSI)
+		hr_qp->cong_type = CONG_TYPE_DCQCN;
+	else
+		hr_qp->cong_type = 1 << caps->default_cong_type;
+}
+
+static int set_congest_type(struct hns_roce_qp *hr_qp,
+			    struct hns_roce_ib_create_qp *ucmd)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device);
+
+	switch (ucmd->cong_type_flags) {
+	case HNS_ROCE_CREATE_QP_FLAGS_DCQCN:
+		hr_qp->cong_type = CONG_TYPE_DCQCN;
+		break;
+	case HNS_ROCE_CREATE_QP_FLAGS_LDCP:
+		hr_qp->cong_type = CONG_TYPE_LDCP;
+		break;
+	case HNS_ROCE_CREATE_QP_FLAGS_HC3:
+		hr_qp->cong_type = CONG_TYPE_HC3;
+		break;
+	case HNS_ROCE_CREATE_QP_FLAGS_DIP:
+		hr_qp->cong_type = CONG_TYPE_DIP;
+		break;
+	default:
+		hr_qp->cong_type = 0;
+	}
+
+	if (!(hr_qp->cong_type & hr_dev->caps.cong_cap)) {
+		ibdev_err_ratelimited(&hr_dev->ib_dev,
+				      "Unsupported congest type 0x%x, cong_cap = 0x%x.\n",
+				      hr_qp->cong_type, hr_dev->caps.cong_cap);
+		return -EOPNOTSUPP;
+	}
+
+	if (hr_qp->ibqp.qp_type == IB_QPT_UD &&
+	    !(hr_qp->cong_type & CONG_TYPE_DCQCN)) {
+		ibdev_err_ratelimited(&hr_dev->ib_dev,
+				      "Only DCQCN supported for UD. Unsupported congest type 0x%x.\n",
+				      hr_qp->cong_type);
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int set_congest_param(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_qp *hr_qp,
+			     struct hns_roce_ib_create_qp *ucmd)
+{
+	if (ucmd->comp_mask & HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE)
+		return set_congest_type(hr_qp, ucmd);
+
+	default_congest_type(hr_dev, hr_qp);
+
+	return 0;
+}
+
 static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 			struct ib_qp_init_attr *init_attr,
 			struct ib_udata *udata,
@@ -1026,6 +1090,9 @@  static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 		return ret;
 	}
 
+	if (init_attr->qp_type == IB_QPT_XRC_TGT)
+		default_congest_type(hr_dev, hr_qp);
+
 	if (udata) {
 		ret = ib_copy_from_udata(ucmd, udata,
 					 min(udata->inlen, sizeof(*ucmd)));
@@ -1043,6 +1110,10 @@  static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 			ibdev_err(ibdev,
 				  "failed to set user SQ size, ret = %d.\n",
 				  ret);
+
+		ret = set_congest_param(hr_dev, hr_qp, ucmd);
+		if (ret)
+			return ret;
 	} else {
 		if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
 			hr_qp->config = HNS_ROCE_EXSGE_FLAGS;
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index c996e151081e..757095a6c6fc 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -81,6 +81,9 @@  struct hns_roce_ib_create_qp {
 	__u8    sq_no_prefetch;
 	__u8    reserved[5];
 	__aligned_u64 sdb_addr;
+	__aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
+	__aligned_u64 create_flags;
+	__aligned_u64 cong_type_flags;
 };
 
 enum hns_roce_qp_cap_flags {
@@ -107,6 +110,17 @@  enum {
 	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
 };
 
+enum hns_roce_congest_type_flags {
+	HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
+	HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
+	HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
+	HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
+};
+
+enum hns_roce_create_qp_comp_mask {
+	HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
+};
+
 struct hns_roce_ib_alloc_ucontext_resp {
 	__u32	qp_tab_size;
 	__u32	cqe_size;
@@ -114,6 +128,9 @@  struct hns_roce_ib_alloc_ucontext_resp {
 	__u32	reserved;
 	__u32	config;
 	__u32	max_inline_data;
+	__u8	reserved0;
+	__u8	congest_type;
+	__u8	reserved1[6];
 };
 
 struct hns_roce_ib_alloc_ucontext {