diff mbox

[for-next,05/20] RDMA/hns: Add command queue support for hip08 RoCE driver

Message ID 1504084998-64397-6-git-send-email-xavier.huwei@huawei.com (mailing list archive)
State Accepted
Headers show

Commit Message

Wei Hu (Xavier) Aug. 30, 2017, 9:23 a.m. UTC
The command queue is the configuration queue. The software
configures hardware by filling the commands into command
queues. It includes command send queue and receive queue.

In hip08 RoCE engine, It supports to configure and query
registers by command queue.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Shaobo Xu <xushaobo2@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_common.h |  13 ++
 drivers/infiniband/hw/hns/hns_roce_device.h |   2 +
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 281 +++++++++++++++++++++++++++-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  | 111 +++++++++++
 drivers/infiniband/hw/hns/hns_roce_main.c   |  14 ++
 5 files changed, 420 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/hw/hns/hns_roce_hw_v2.h

Comments

Doug Ledford Sept. 25, 2017, 5:06 p.m. UTC | #1
On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:

So, we set the send timeout limit here:

> +	/* Setup Tx write back timeout */
> +	priv->cmq.tx_timeout = HNS_ROCE_CMQ_TX_TIMEOUT;
> +

[ snip ]

> +int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
> +		      struct hns_roce_cmq_desc *desc, int num)
> +{
> +	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv
> *)hr_dev->priv;
> +	struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
> +	struct hns_roce_cmq_desc *desc_to_use;
> +	bool complete = false;
> +	u32 timeout = 0;
> +	int handle = 0;
> +	u16 desc_ret;
> +	int ret = 0;
> +	int ntc;
> +
> +	spin_lock_bh(&csq->lock);

and we take a bh lock here...

> +	if (num > hns_roce_cmq_space(csq)) {
> +		spin_unlock_bh(&csq->lock);
> +		return -EBUSY;
> +	}
> +
> +	/*
> +	 * Record the location of desc in the cmq for this time
> +	 * which will be use for hardware to write back
> +	 */
> +	ntc = csq->next_to_use;
> +
> +	while (handle < num) {
> +		desc_to_use = &csq->desc[csq->next_to_use];
> +		*desc_to_use = desc[handle];
> +		dev_dbg(hr_dev->dev, "set cmq desc:\n");
> +		csq->next_to_use++;
> +		if (csq->next_to_use == csq->desc_num)
> +			csq->next_to_use = 0;
> +		handle++;
> +	}
> +
> +	/* Write to hardware */
> +	roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, csq->next_to_use);
> +
> +	/*
> +	 * If the command is sync, wait for the firmware to write
> back,
> +	 * if multi descriptors to be sent, use the first one to
> check
> +	 */
> +	if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
> +		do {
> +			if (hns_roce_cmq_csq_done(hr_dev))
> +				break;
> +			usleep_range(1000, 2000);
> +			timeout++;
> +		} while (timeout < priv->cmq.tx_timeout);
> +	}

then we spin here for a maximum amount of time between 200 and 400ms,
so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
this CPU.  That seems excessive to me.  If we are going to spin that
long, can you find a way to allocate/reserve your resources, send the
command, then drop the bh lock while you spin, and retake it before you
complete once the spinning is done?

> +#define HNS_ROCE_CMQ_TX_TIMEOUT			200

or you could reduce the size of this define...
Leon Romanovsky Sept. 25, 2017, 5:18 p.m. UTC | #2
On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
>
> So, we set the send timeout limit here:
>
> > +	/* Setup Tx write back timeout */
> > +	priv->cmq.tx_timeout = HNS_ROCE_CMQ_TX_TIMEOUT;
> > +
>
> [ snip ]
>
> > +int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
> > +		      struct hns_roce_cmq_desc *desc, int num)
> > +{
> > +	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv
> > *)hr_dev->priv;
> > +	struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
> > +	struct hns_roce_cmq_desc *desc_to_use;
> > +	bool complete = false;
> > +	u32 timeout = 0;
> > +	int handle = 0;
> > +	u16 desc_ret;
> > +	int ret = 0;
> > +	int ntc;
> > +
> > +	spin_lock_bh(&csq->lock);
>
> and we take a bh lock here...
>
> > +	if (num > hns_roce_cmq_space(csq)) {
> > +		spin_unlock_bh(&csq->lock);
> > +		return -EBUSY;
> > +	}
> > +
> > +	/*
> > +	 * Record the location of desc in the cmq for this time
> > +	 * which will be use for hardware to write back
> > +	 */
> > +	ntc = csq->next_to_use;
> > +
> > +	while (handle < num) {
> > +		desc_to_use = &csq->desc[csq->next_to_use];
> > +		*desc_to_use = desc[handle];
> > +		dev_dbg(hr_dev->dev, "set cmq desc:\n");
> > +		csq->next_to_use++;
> > +		if (csq->next_to_use == csq->desc_num)
> > +			csq->next_to_use = 0;
> > +		handle++;
> > +	}
> > +
> > +	/* Write to hardware */
> > +	roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, csq->next_to_use);
> > +
> > +	/*
> > +	 * If the command is sync, wait for the firmware to write
> > back,
> > +	 * if multi descriptors to be sent, use the first one to
> > check
> > +	 */
> > +	if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
> > +		do {
> > +			if (hns_roce_cmq_csq_done(hr_dev))
> > +				break;
> > +			usleep_range(1000, 2000);
> > +			timeout++;
> > +		} while (timeout < priv->cmq.tx_timeout);
> > +	}
>
> then we spin here for a maximum amount of time between 200 and 400ms,
> so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
> this CPU.  That seems excessive to me.  If we are going to spin that
> long, can you find a way to allocate/reserve your resources, send the
> command, then drop the bh lock while you spin, and retake it before you
> complete once the spinning is done?

They don't allocate anything in this loop, but checking the pointers are
the same, see hns_roce_cmq_csq_done.

>
> > +#define HNS_ROCE_CMQ_TX_TIMEOUT			200
>
> or you could reduce the size of this define...
>
> --
> Doug Ledford <dledford@redhat.com>
>     GPG KeyID: B826A3330E572FDD
>     Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Sept. 25, 2017, 5:36 p.m. UTC | #3
On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
> > On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
> > 
> > > +	/*
> > > +	 * If the command is sync, wait for the firmware to
> > > write
> > > back,
> > > +	 * if multi descriptors to be sent, use the first one to
> > > check
> > > +	 */
> > > +	if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
> > > +		do {
> > > +			if (hns_roce_cmq_csq_done(hr_dev))
> > > +				break;
> > > +			usleep_range(1000, 2000);
> > > +			timeout++;
> > > +		} while (timeout < priv->cmq.tx_timeout);
> > > +	}
> > 
> > then we spin here for a maximum amount of time between 200 and
> > 400ms,
> > so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
> > this CPU.  That seems excessive to me.  If we are going to spin
> > that
> > long, can you find a way to allocate/reserve your resources, send
> > the
> > command, then drop the bh lock while you spin, and retake it before
> > you
> > complete once the spinning is done?
> 
> They don't allocate anything in this loop, but checking the pointers
> are
> the same, see hns_roce_cmq_csq_done.

I'm not sure I understand your intended implication of your comment.  I
wasn't concerned about them allocating anything, only that if the
hardware is hung, then this loop will hang out for 1/4 to 1/2 a second
and hold up all bottom half processing on this CPU in the meantime. 
That's the sort of things that provides poor overall system behavior.

Now, since they are really only checking to see if the hardware has
gotten around to their particular command, and their command is part of
a ring structure, it's possible to record the original head command,
and our new head command, and then release the spin_lock_bh around the
entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
check that head is not in the range old_head:new_head.  That would
protect you in case something in the bottom half processing queued up
some more commands and from one sleep to the next the head jumped from
something other than the new_head to something past new_head, so that
head == priv->cmq.csq.next_to_use ends up being perpetually false. 
But, that's just from a quick read of the code, I could easily be
missing something here...

> > 
> > > +#define HNS_ROCE_CMQ_TX_TIMEOUT			200
> > 
> > or you could reduce the size of this define...
> > 
> > --
> > Doug Ledford <dledford@redhat.com>
> >     GPG KeyID: B826A3330E572FDD
> >     Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57
> > 2FDD
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-
> > rdma" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky Sept. 26, 2017, 5:15 a.m. UTC | #4
On Mon, Sep 25, 2017 at 01:36:55PM -0400, Doug Ledford wrote:
> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
> > On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
> > > On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
> > >
> > > > +	/*
> > > > +	 * If the command is sync, wait for the firmware to
> > > > write
> > > > back,
> > > > +	 * if multi descriptors to be sent, use the first one to
> > > > check
> > > > +	 */
> > > > +	if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
> > > > +		do {
> > > > +			if (hns_roce_cmq_csq_done(hr_dev))
> > > > +				break;
> > > > +			usleep_range(1000, 2000);
> > > > +			timeout++;
> > > > +		} while (timeout < priv->cmq.tx_timeout);
> > > > +	}
> > >
> > > then we spin here for a maximum amount of time between 200 and
> > > 400ms,
> > > so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
> > > this CPU.  That seems excessive to me.  If we are going to spin
> > > that
> > > long, can you find a way to allocate/reserve your resources, send
> > > the
> > > command, then drop the bh lock while you spin, and retake it before
> > > you
> > > complete once the spinning is done?
> >
> > They don't allocate anything in this loop, but checking the pointers
> > are
> > the same, see hns_roce_cmq_csq_done.
>
> I'm not sure I understand your intended implication of your comment.  I
> wasn't concerned about them allocating anything, only that if the
> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second
> and hold up all bottom half processing on this CPU in the meantime.
> That's the sort of things that provides poor overall system behavior.
>
> Now, since they are really only checking to see if the hardware has
> gotten around to their particular command, and their command is part of
> a ring structure, it's possible to record the original head command,
> and our new head command, and then release the spin_lock_bh around the
> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
> check that head is not in the range old_head:new_head.  That would
> protect you in case something in the bottom half processing queued up
> some more commands and from one sleep to the next the head jumped from
> something other than the new_head to something past new_head, so that
> head == priv->cmq.csq.next_to_use ends up being perpetually false.
> But, that's just from a quick read of the code, I could easily be
> missing something here...

Thanks,
I got your point, the phrase "can you find a way to allocate/reserve
your resources," confused me back then.

>
> > >
> > > > +#define HNS_ROCE_CMQ_TX_TIMEOUT			200
> > >
> > > or you could reduce the size of this define...
> > >
> > > --
> > > Doug Ledford <dledford@redhat.com>
> > >     GPG KeyID: B826A3330E572FDD
> > >     Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57
> > > 2FDD
> > >
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-
> > > rdma" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> Doug Ledford <dledford@redhat.com>
>     GPG KeyID: B826A3330E572FDD
>     Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD
>
Wei Hu (Xavier) Sept. 26, 2017, 1:13 p.m. UTC | #5
On 2017/9/26 1:36, Doug Ledford wrote:
> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
>>>
>>>> +	/*
>>>> +	 * If the command is sync, wait for the firmware to
>>>> write
>>>> back,
>>>> +	 * if multi descriptors to be sent, use the first one to
>>>> check
>>>> +	 */
>>>> +	if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
>>>> +		do {
>>>> +			if (hns_roce_cmq_csq_done(hr_dev))
>>>> +				break;
>>>> +			usleep_range(1000, 2000);
>>>> +			timeout++;
>>>> +		} while (timeout < priv->cmq.tx_timeout);
>>>> +	}
>>> then we spin here for a maximum amount of time between 200 and
>>> 400ms,
>>> so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
>>> this CPU.  That seems excessive to me.  If we are going to spin
>>> that
>>> long, can you find a way to allocate/reserve your resources, send
>>> the
>>> command, then drop the bh lock while you spin, and retake it before
>>> you
>>> complete once the spinning is done?
>> They don't allocate anything in this loop, but checking the pointers
>> are
>> the same, see hns_roce_cmq_csq_done.
> I'm not sure I understand your intended implication of your comment.  I
> wasn't concerned about them allocating anything, only that if the
> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second
> and hold up all bottom half processing on this CPU in the meantime.
> That's the sort of things that provides poor overall system behavior.
>
> Now, since they are really only checking to see if the hardware has
> gotten around to their particular command, and their command is part of
> a ring structure, it's possible to record the original head command,
> and our new head command, and then release the spin_lock_bh around the
> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
> check that head is not in the range old_head:new_head.  That would
> protect you in case something in the bottom half processing queued up
> some more commands and from one sleep to the next the head jumped from
> something other than the new_head to something past new_head, so that
> head == priv->cmq.csq.next_to_use ends up being perpetually false.
> But, that's just from a quick read of the code, I could easily be
> missing something here...
Hi, Doug
     Driver issues the cmds in cmq, and firmware gets and processes them.
     The firmware process only one cmd at the same time, and it will take
     about serveral to 200 us in one cmd currently, so the driver need
     not to use stream mode to issue cmd.

     Regards
Wei Hu
>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT			200
>>> or you could reduce the size of this define...
>>>
>>> --
>>> Doug Ledford <dledford@redhat.com>
>>>      GPG KeyID: B826A3330E572FDD
>>>      Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57
>>> 2FDD
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-
>>> rdma" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wei Hu (Xavier) Sept. 26, 2017, 3:24 p.m. UTC | #6
On 2017/9/26 21:13, Wei Hu (Xavier) wrote:
>
>
> On 2017/9/26 1:36, Doug Ledford wrote:
>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
>>>>
>>>>> +    /*
>>>>> +     * If the command is sync, wait for the firmware to
>>>>> write
>>>>> back,
>>>>> +     * if multi descriptors to be sent, use the first one to
>>>>> check
>>>>> +     */
>>>>> +    if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
>>>>> +        do {
>>>>> +            if (hns_roce_cmq_csq_done(hr_dev))
>>>>> +                break;
>>>>> +            usleep_range(1000, 2000);
>>>>> +            timeout++;
>>>>> +        } while (timeout < priv->cmq.tx_timeout);
>>>>> +    }
>>>> then we spin here for a maximum amount of time between 200 and
>>>> 400ms,
>>>> so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
>>>> this CPU.  That seems excessive to me.  If we are going to spin
>>>> that
>>>> long, can you find a way to allocate/reserve your resources, send
>>>> the
>>>> command, then drop the bh lock while you spin, and retake it before
>>>> you
>>>> complete once the spinning is done?
>>> They don't allocate anything in this loop, but checking the pointers
>>> are
>>> the same, see hns_roce_cmq_csq_done.
>> I'm not sure I understand your intended implication of your comment.  I
>> wasn't concerned about them allocating anything, only that if the
>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second
>> and hold up all bottom half processing on this CPU in the meantime.
>> That's the sort of things that provides poor overall system behavior.
>>
>> Now, since they are really only checking to see if the hardware has
>> gotten around to their particular command, and their command is part of
>> a ring structure, it's possible to record the original head command,
>> and our new head command, and then release the spin_lock_bh around the
>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
>> check that head is not in the range old_head:new_head.  That would
>> protect you in case something in the bottom half processing queued up
>> some more commands and from one sleep to the next the head jumped from
>> something other than the new_head to something past new_head, so that
>> head == priv->cmq.csq.next_to_use ends up being perpetually false.
>> But, that's just from a quick read of the code, I could easily be
>> missing something here...
> Hi, Doug
>     Driver issues the cmds in cmq, and firmware gets and processes them.
>     The firmware process only one cmd at the same time, and it will take
>     about serveral to 200 us in one cmd currently, so the driver need
>     not to use stream mode to issue cmd.
>
>     Regards
> Wei Hu
Hi, Doug
     We can replace usleep_range(1000, 2000); with the following statement:
         udelay(1);
     to avoid spinning too long time and using usleep_range function in
     spin_lock_bh locked region.

     Can you give some suggestions?

     Regards
Wei Hu
>>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT            200
>>>> or you could reduce the size of this define...
>>>>
>>>> -- 
>>>> Doug Ledford <dledford@redhat.com>
>>>>      GPG KeyID: B826A3330E572FDD
>>>>      Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57
>>>> 2FDD
>>>>
>>>> -- 
>>>> To unsubscribe from this list: send the line "unsubscribe linux-
>>>> rdma" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky Sept. 26, 2017, 3:51 p.m. UTC | #7
On Tue, Sep 26, 2017 at 11:24:41PM +0800, Wei Hu (Xavier) wrote:
>
>
> On 2017/9/26 21:13, Wei Hu (Xavier) wrote:
> >
> >
> > On 2017/9/26 1:36, Doug Ledford wrote:
> > > On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
> > > > On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
> > > > > On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
> > > > >
> > > > > > +    /*
> > > > > > +     * If the command is sync, wait for the firmware to
> > > > > > write
> > > > > > back,
> > > > > > +     * if multi descriptors to be sent, use the first one to
> > > > > > check
> > > > > > +     */
> > > > > > +    if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
> > > > > > +        do {
> > > > > > +            if (hns_roce_cmq_csq_done(hr_dev))
> > > > > > +                break;
> > > > > > +            usleep_range(1000, 2000);
> > > > > > +            timeout++;
> > > > > > +        } while (timeout < priv->cmq.tx_timeout);
> > > > > > +    }
> > > > > then we spin here for a maximum amount of time between 200 and
> > > > > 400ms,
> > > > > so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
> > > > > this CPU.  That seems excessive to me.  If we are going to spin
> > > > > that
> > > > > long, can you find a way to allocate/reserve your resources, send
> > > > > the
> > > > > command, then drop the bh lock while you spin, and retake it before
> > > > > you
> > > > > complete once the spinning is done?
> > > > They don't allocate anything in this loop, but checking the pointers
> > > > are
> > > > the same, see hns_roce_cmq_csq_done.
> > > I'm not sure I understand your intended implication of your comment.  I
> > > wasn't concerned about them allocating anything, only that if the
> > > hardware is hung, then this loop will hang out for 1/4 to 1/2 a second
> > > and hold up all bottom half processing on this CPU in the meantime.
> > > That's the sort of things that provides poor overall system behavior.
> > >
> > > Now, since they are really only checking to see if the hardware has
> > > gotten around to their particular command, and their command is part of
> > > a ring structure, it's possible to record the original head command,
> > > and our new head command, and then release the spin_lock_bh around the
> > > entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
> > > check that head is not in the range old_head:new_head.  That would
> > > protect you in case something in the bottom half processing queued up
> > > some more commands and from one sleep to the next the head jumped from
> > > something other than the new_head to something past new_head, so that
> > > head == priv->cmq.csq.next_to_use ends up being perpetually false.
> > > But, that's just from a quick read of the code, I could easily be
> > > missing something here...
> > Hi, Doug
> >     Driver issues the cmds in cmq, and firmware gets and processes them.
> >     The firmware process only one cmd at the same time, and it will take
> >     about serveral to 200 us in one cmd currently, so the driver need
> >     not to use stream mode to issue cmd.
> >
> >     Regards
> > Wei Hu
> Hi, Doug
>     We can replace usleep_range(1000, 2000); with the following statement:
>         udelay(1);
>     to avoid spinning too long time and using usleep_range function in
>     spin_lock_bh locked region.
>
>     Can you give some suggestions?

He already suggested number of options, while the simple one was to reduce
HNS_ROCE_CMQ_TX_TIMEOUT from 200 to be something sane (for example 5).

Thanks
>
>     Regards
> Wei Hu
> > > > > > +#define HNS_ROCE_CMQ_TX_TIMEOUT            200
> > > > > or you could reduce the size of this define...
> > > > >
> > > > > --
> > > > > Doug Ledford <dledford@redhat.com>
> > > > >      GPG KeyID: B826A3330E572FDD
> > > > >      Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57
> > > > > 2FDD
> > > > >
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-
> > > > > rdma" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html
> >
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Wei Hu (Xavier) Sept. 26, 2017, 4:13 p.m. UTC | #8
On 2017/9/26 23:51, Leon Romanovsky wrote:
> On Tue, Sep 26, 2017 at 11:24:41PM +0800, Wei Hu (Xavier) wrote:
>>
>> On 2017/9/26 21:13, Wei Hu (Xavier) wrote:
>>>
>>> On 2017/9/26 1:36, Doug Ledford wrote:
>>>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
>>>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
>>>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
>>>>>>
>>>>>>> +    /*
>>>>>>> +     * If the command is sync, wait for the firmware to
>>>>>>> write
>>>>>>> back,
>>>>>>> +     * if multi descriptors to be sent, use the first one to
>>>>>>> check
>>>>>>> +     */
>>>>>>> +    if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
>>>>>>> +        do {
>>>>>>> +            if (hns_roce_cmq_csq_done(hr_dev))
>>>>>>> +                break;
>>>>>>> +            usleep_range(1000, 2000);
>>>>>>> +            timeout++;
>>>>>>> +        } while (timeout < priv->cmq.tx_timeout);
>>>>>>> +    }
>>>>>> then we spin here for a maximum amount of time between 200 and
>>>>>> 400ms,
>>>>>> so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
>>>>>> this CPU.  That seems excessive to me.  If we are going to spin
>>>>>> that
>>>>>> long, can you find a way to allocate/reserve your resources, send
>>>>>> the
>>>>>> command, then drop the bh lock while you spin, and retake it before
>>>>>> you
>>>>>> complete once the spinning is done?
>>>>> They don't allocate anything in this loop, but checking the pointers
>>>>> are
>>>>> the same, see hns_roce_cmq_csq_done.
>>>> I'm not sure I understand your intended implication of your comment.  I
>>>> wasn't concerned about them allocating anything, only that if the
>>>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second
>>>> and hold up all bottom half processing on this CPU in the meantime.
>>>> That's the sort of things that provides poor overall system behavior.
>>>>
>>>> Now, since they are really only checking to see if the hardware has
>>>> gotten around to their particular command, and their command is part of
>>>> a ring structure, it's possible to record the original head command,
>>>> and our new head command, and then release the spin_lock_bh around the
>>>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
>>>> check that head is not in the range old_head:new_head.  That would
>>>> protect you in case something in the bottom half processing queued up
>>>> some more commands and from one sleep to the next the head jumped from
>>>> something other than the new_head to something past new_head, so that
>>>> head == priv->cmq.csq.next_to_use ends up being perpetually false.
>>>> But, that's just from a quick read of the code, I could easily be
>>>> missing something here...
>>> Hi, Doug
>>>      Driver issues the cmds in cmq, and firmware gets and processes them.
>>>      The firmware process only one cmd at the same time, and it will take
>>>      about serveral to 200 us in one cmd currently, so the driver need
>>>      not to use stream mode to issue cmd.
>>>
>>>      Regards
>>> Wei Hu
>> Hi, Doug
>>      We can replace usleep_range(1000, 2000); with the following statement:
>>          udelay(1);
>>      to avoid spinning too long time and using usleep_range function in
>>      spin_lock_bh locked region.
>>
>>      Can you give som=e suggestions?
> He already suggested number of options, while the simple one was to reduce
> HNS_ROCE_CMQ_TX_TIMEOUT from 200 to be something sane (for example 5).
>
> Thanks
Thanks, Leon
     Actually We want to use another option:
     replace usleep_range(1000, 2000); with the following statement:
          udelay(1);
     And if so, we can avoid using usleep_range function in spin_lock_bh 
spin region,
     because it probally cause calltrace by rcu.

     Best regards
Wei Hu
>>      Regards
>> Wei Hu
>>>>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT            200
>>>>>> or you could reduce the size of this define...
>>>>>>
>>>>>> --
>>>>>> Doug Ledford <dledford@redhat.com>
>>>>>>       GPG KeyID: B826A3330E572FDD
>>>>>>       Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57
>>>>>> 2FDD
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-
>>>>>> rdma" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Sept. 26, 2017, 4:18 p.m. UTC | #9
On 9/26/2017 9:13 AM, Wei Hu (Xavier) wrote:
> 
> 
> On 2017/9/26 1:36, Doug Ledford wrote:
>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
>>>>
>>>>> +    /*
>>>>> +     * If the command is sync, wait for the firmware to
>>>>> write
>>>>> back,
>>>>> +     * if multi descriptors to be sent, use the first one to
>>>>> check
>>>>> +     */
>>>>> +    if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
>>>>> +        do {
>>>>> +            if (hns_roce_cmq_csq_done(hr_dev))
>>>>> +                break;
>>>>> +            usleep_range(1000, 2000);
>>>>> +            timeout++;
>>>>> +        } while (timeout < priv->cmq.tx_timeout);
>>>>> +    }
>>>> then we spin here for a maximum amount of time between 200 and
>>>> 400ms,
>>>> so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
>>>> this CPU.  That seems excessive to me.  If we are going to spin
>>>> that
>>>> long, can you find a way to allocate/reserve your resources, send
>>>> the
>>>> command, then drop the bh lock while you spin, and retake it before
>>>> you
>>>> complete once the spinning is done?
>>> They don't allocate anything in this loop, but checking the pointers
>>> are
>>> the same, see hns_roce_cmq_csq_done.
>> I'm not sure I understand your intended implication of your comment.  I
>> wasn't concerned about them allocating anything, only that if the
>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second
>> and hold up all bottom half processing on this CPU in the meantime.
>> That's the sort of things that provides poor overall system behavior.
>>
>> Now, since they are really only checking to see if the hardware has
>> gotten around to their particular command, and their command is part of
>> a ring structure, it's possible to record the original head command,
>> and our new head command, and then release the spin_lock_bh around the
>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
>> check that head is not in the range old_head:new_head.  That would
>> protect you in case something in the bottom half processing queued up
>> some more commands and from one sleep to the next the head jumped from
>> something other than the new_head to something past new_head, so that
>> head == priv->cmq.csq.next_to_use ends up being perpetually false.
>> But, that's just from a quick read of the code, I could easily be
>> missing something here...
> Hi, Doug
>     Driver issues the cmds in cmq, and firmware gets and processes them.
>     The firmware process only one cmd at the same time, and it will take
>     about serveral to 200 us in one cmd currently, so the driver need
>     not to use stream mode to issue cmd.

I'm not sure I understand your response here.

I get that the driver issues cmds in the cmq, and that the firmware gets
them and processes them.

I get that the firmware will only work on one command at a time and only
move to the next one once the current one is complete.

I get that commands take anywhere from a few usec to a couple hundred usec.

I also get that because you are sleeping for somewhere in between 1000
and 2000 usecs, that the driver could easily finish a whole slew of
commands.  It could do 10 slow commands, or 100 or more fast commands.
What this tells me is that the only reason your current implementation
of hns_roce_cmq_csq_done() works at all is because you keep the device
locked out from any other commands being put on the queue.  As far as I
can tell, that's the only way you can guarantee that at some point you
will wake up and the head pointer will be exactly at csq->next_to_use.
Otherwise, if you didn't block them out, then you could sleep with the
head pointer before csq->next_to_use and wake up the next time with it
already well past csq->next_to_use.  Am I right about that?  While you
are waiting on this command queue, any other commands are blocked from
being placed on the command queue?

I don't understand what you mean by "so the driver need not to use
stream mode to issue cmd".

>     Regards
> Wei Hu
>>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT            200
>>>> or you could reduce the size of this define...
>>>>
>>>> -- 
>>>> Doug Ledford <dledford@redhat.com>
>>>>      GPG KeyID: B826A3330E572FDD
>>>>      Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57
>>>> 2FDD
>>>>
>>>> -- 
>>>> To unsubscribe from this list: send the line "unsubscribe linux-
>>>> rdma" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
>
Wei Hu (Xavier) Sept. 26, 2017, 9:12 p.m. UTC | #10
On 2017/9/27 0:13, Wei Hu (Xavier) wrote:
>
>
> On 2017/9/26 23:51, Leon Romanovsky wrote:
>> On Tue, Sep 26, 2017 at 11:24:41PM +0800, Wei Hu (Xavier) wrote:
>>>
>>> On 2017/9/26 21:13, Wei Hu (Xavier) wrote:
>>>>
>>>> On 2017/9/26 1:36, Doug Ledford wrote:
>>>>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
>>>>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
>>>>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
>>>>>>>
>>>>>>>> +    /*
>>>>>>>> +     * If the command is sync, wait for the firmware to
>>>>>>>> write
>>>>>>>> back,
>>>>>>>> +     * if multi descriptors to be sent, use the first one to
>>>>>>>> check
>>>>>>>> +     */
>>>>>>>> +    if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
>>>>>>>> +        do {
>>>>>>>> +            if (hns_roce_cmq_csq_done(hr_dev))
>>>>>>>> +                break;
>>>>>>>> +            usleep_range(1000, 2000);
>>>>>>>> +            timeout++;
>>>>>>>> +        } while (timeout < priv->cmq.tx_timeout);
>>>>>>>> +    }
>>>>>>> then we spin here for a maximum amount of time between 200 and
>>>>>>> 400ms,
>>>>>>> so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
>>>>>>> this CPU.  That seems excessive to me.  If we are going to spin
>>>>>>> that
>>>>>>> long, can you find a way to allocate/reserve your resources, send
>>>>>>> the
>>>>>>> command, then drop the bh lock while you spin, and retake it before
>>>>>>> you
>>>>>>> complete once the spinning is done?
>>>>>> They don't allocate anything in this loop, but checking the pointers
>>>>>> are
>>>>>> the same, see hns_roce_cmq_csq_done.
>>>>> I'm not sure I understand your intended implication of your 
>>>>> comment.  I
>>>>> wasn't concerned about them allocating anything, only that if the
>>>>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a 
>>>>> second
>>>>> and hold up all bottom half processing on this CPU in the meantime.
>>>>> That's the sort of things that provides poor overall system behavior.
>>>>>
>>>>> Now, since they are really only checking to see if the hardware has
>>>>> gotten around to their particular command, and their command is 
>>>>> part of
>>>>> a ring structure, it's possible to record the original head command,
>>>>> and our new head command, and then release the spin_lock_bh around 
>>>>> the
>>>>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
>>>>> check that head is not in the range old_head:new_head. That would
>>>>> protect you in case something in the bottom half processing queued up
>>>>> some more commands and from one sleep to the next the head jumped 
>>>>> from
>>>>> something other than the new_head to something past new_head, so that
>>>>> head == priv->cmq.csq.next_to_use ends up being perpetually false.
>>>>> But, that's just from a quick read of the code, I could easily be
>>>>> missing something here...
>>>> Hi, Doug
>>>>      Driver issues the cmds in cmq, and firmware gets and processes 
>>>> them.
>>>>      The firmware process only one cmd at the same time, and it 
>>>> will take
>>>>      about serveral to 200 us in one cmd currently, so the driver need
>>>>      not to use stream mode to issue cmd.
>>>>
>>>>      Regards
>>>> Wei Hu
>>> Hi, Doug
>>>      We can replace usleep_range(1000, 2000); with the following 
>>> statement:
>>>          udelay(1);
>>>      to avoid spinning too long time and using usleep_range function in
>>>      spin_lock_bh locked region.
>>>
>>>      Can you give som=e suggestions?
>> He already suggested number of options, while the simple one was to 
>> reduce
>> HNS_ROCE_CMQ_TX_TIMEOUT from 200 to be something sane (for example 5).
>>
>> Thanks
> Thanks, Leon
>     Actually We want to use another option:
>     replace usleep_range(1000, 2000); with the following statement:
>          udelay(1);
>     And if so, we can avoid using usleep_range function in 
> spin_lock_bh spin region,
>     because it probally cause calltrace by rcu.
>
Can we send a following up patch to fix this problem? or send patch v2?
Thanks
>     Best regards
> Wei Hu
>>>     Regards
>>> Wei Hu
>>>>>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT            200
>>>>>>> or you could reduce the size of this define...
>>>>>>>
>>>>>>> -- 
>>>>>>> Doug Ledford <dledford@redhat.com>
>>>>>>>       GPG KeyID: B826A3330E572FDD
>>>>>>>       Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 
>>>>>>> 0E57
>>>>>>> 2FDD
>>>>>>>
>>>>>>> -- 
>>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-
>>>>>>> rdma" in
>>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>
>>>> -- 
>>>> To unsubscribe from this list: send the line "unsubscribe 
>>>> linux-rdma" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wei Hu (Xavier) Sept. 27, 2017, 2:46 a.m. UTC | #11
On 2017/9/27 0:18, Doug Ledford wrote:
> On 9/26/2017 9:13 AM, Wei Hu (Xavier) wrote:
>>
>> On 2017/9/26 1:36, Doug Ledford wrote:
>>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
>>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
>>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
>>>>>
>>>>>> +    /*
>>>>>> +     * If the command is sync, wait for the firmware to
>>>>>> write
>>>>>> back,
>>>>>> +     * if multi descriptors to be sent, use the first one to
>>>>>> check
>>>>>> +     */
>>>>>> +    if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
>>>>>> +        do {
>>>>>> +            if (hns_roce_cmq_csq_done(hr_dev))
>>>>>> +                break;
>>>>>> +            usleep_range(1000, 2000);
>>>>>> +            timeout++;
>>>>>> +        } while (timeout < priv->cmq.tx_timeout);
>>>>>> +    }
>>>>> then we spin here for a maximum amount of time between 200 and
>>>>> 400ms,
>>>>> so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
>>>>> this CPU.  That seems excessive to me.  If we are going to spin
>>>>> that
>>>>> long, can you find a way to allocate/reserve your resources, send
>>>>> the
>>>>> command, then drop the bh lock while you spin, and retake it before
>>>>> you
>>>>> complete once the spinning is done?
>>>> They don't allocate anything in this loop, but checking the pointers
>>>> are
>>>> the same, see hns_roce_cmq_csq_done.
>>> I'm not sure I understand your intended implication of your comment.  I
>>> wasn't concerned about them allocating anything, only that if the
>>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second
>>> and hold up all bottom half processing on this CPU in the meantime.
>>> That's the sort of things that provides poor overall system behavior.
>>>
>>> Now, since they are really only checking to see if the hardware has
>>> gotten around to their particular command, and their command is part of
>>> a ring structure, it's possible to record the original head command,
>>> and our new head command, and then release the spin_lock_bh around the
>>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
>>> check that head is not in the range old_head:new_head.  That would
>>> protect you in case something in the bottom half processing queued up
>>> some more commands and from one sleep to the next the head jumped from
>>> something other than the new_head to something past new_head, so that
>>> head == priv->cmq.csq.next_to_use ends up being perpetually false.
>>> But, that's just from a quick read of the code, I could easily be
>>> missing something here...
>> Hi, Doug
>>      Driver issues the cmds in cmq, and firmware gets and processes them.
>>      The firmware process only one cmd at the same time, and it will take
>>      about serveral to 200 us in one cmd currently, so the driver need
>>      not to use stream mode to issue cmd.
> I'm not sure I understand your response here.
>
> I get that the driver issues cmds in the cmq, and that the firmware gets
> them and processes them.
>
> I get that the firmware will only work on one command at a time and only
> move to the next one once the current one is complete.
>
> I get that commands take anywhere from a few usec to a couple hundred usec.
>
> I also get that because you are sleeping for somewhere in between 1000
> and 2000 usecs, that the driver could easily finish a whole slew of
> commands.  It could do 10 slow commands, or 100 or more fast commands.
> What this tells me is that the only reason your current implementation
> of hns_roce_cmq_csq_done() works at all is because you keep the device
> locked out from any other commands being put on the queue.  As far as I
> can tell, that's the only way you can guarantee that at some point you
> will wake up and the head pointer will be exactly at csq->next_to_use.
> Otherwise, if you didn't block them out, then you could sleep with the
> head pointer before csq->next_to_use and wake up the next time with it
> already well past csq->next_to_use.  Am I right about that?  While you
> are waiting on this command queue, any other commands are blocked from
> being placed on the command queue?
Hi, Doug,
you are right.
And one "hns_x" ib device only has one command queue in hip08,
other commands will be blocked when waiting on the command queue.
>
> I don't understand what you mean by "so the driver need not to use
> stream mode to issue cmd".
Sorry, my expression error.
stream -> pipeline

And if you argee, after this patchset has been accepted we will send a 
following up patch :
     In hns_roce_cmq_send function, replace
         usleep_range(1000, 2000);
     with the following statement:
          udelay(1);
     And if so, we can avoid using usleep_range function in spin_lock_bh 
spin region,
     because it probally cause calltrace.

     Best regards
Wei Hu
>>      Regards
>> Wei Hu
>>>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT            200
>>>>> or you could reduce the size of this define...
>>>>>
>>>>> -- 
>>>>> Doug Ledford <dledford@redhat.com>
>>>>>       GPG KeyID: B826A3330E572FDD
>>>>>       Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57
>>>>> 2FDD
>>>>>
>>>>> -- 
>>>>> To unsubscribe from this list: send the line "unsubscribe linux-
>>>>> rdma" in
>>>>> the body of a message to majordomo@vger.kernel.org
>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Sept. 27, 2017, 12:21 p.m. UTC | #12
On 9/26/2017 10:46 PM, Wei Hu (Xavier) wrote:
> 
> 
> On 2017/9/27 0:18, Doug Ledford wrote:
>> On 9/26/2017 9:13 AM, Wei Hu (Xavier) wrote:
>>>
>>> On 2017/9/26 1:36, Doug Ledford wrote:
>>>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote:
>>>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote:
>>>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote:
>>>>>>
>>>>>>> +    /*
>>>>>>> +     * If the command is sync, wait for the firmware to
>>>>>>> write
>>>>>>> back,
>>>>>>> +     * if multi descriptors to be sent, use the first one to
>>>>>>> check
>>>>>>> +     */
>>>>>>> +    if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
>>>>>>> +        do {
>>>>>>> +            if (hns_roce_cmq_csq_done(hr_dev))
>>>>>>> +                break;
>>>>>>> +            usleep_range(1000, 2000);
>>>>>>> +            timeout++;
>>>>>>> +        } while (timeout < priv->cmq.tx_timeout);
>>>>>>> +    }
>>>>>> then we spin here for a maximum amount of time between 200 and
>>>>>> 400ms,
>>>>>> so 1/4 to 1/2 a second.  All the time we are holding the bh lock on
>>>>>> this CPU.  That seems excessive to me.  If we are going to spin
>>>>>> that
>>>>>> long, can you find a way to allocate/reserve your resources, send
>>>>>> the
>>>>>> command, then drop the bh lock while you spin, and retake it before
>>>>>> you
>>>>>> complete once the spinning is done?
>>>>> They don't allocate anything in this loop, but checking the pointers
>>>>> are
>>>>> the same, see hns_roce_cmq_csq_done.
>>>> I'm not sure I understand your intended implication of your comment.  I
>>>> wasn't concerned about them allocating anything, only that if the
>>>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second
>>>> and hold up all bottom half processing on this CPU in the meantime.
>>>> That's the sort of things that provides poor overall system behavior.
>>>>
>>>> Now, since they are really only checking to see if the hardware has
>>>> gotten around to their particular command, and their command is part of
>>>> a ring structure, it's possible to record the original head command,
>>>> and our new head command, and then release the spin_lock_bh around the
>>>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could
>>>> check that head is not in the range old_head:new_head.  That would
>>>> protect you in case something in the bottom half processing queued up
>>>> some more commands and from one sleep to the next the head jumped from
>>>> something other than the new_head to something past new_head, so that
>>>> head == priv->cmq.csq.next_to_use ends up being perpetually false.
>>>> But, that's just from a quick read of the code, I could easily be
>>>> missing something here...
>>> Hi, Doug
>>>      Driver issues the cmds in cmq, and firmware gets and processes
>>> them.
>>>      The firmware process only one cmd at the same time, and it will
>>> take
>>>      about serveral to 200 us in one cmd currently, so the driver need
>>>      not to use stream mode to issue cmd.
>> I'm not sure I understand your response here.
>>
>> I get that the driver issues cmds in the cmq, and that the firmware gets
>> them and processes them.
>>
>> I get that the firmware will only work on one command at a time and only
>> move to the next one once the current one is complete.
>>
>> I get that commands take anywhere from a few usec to a couple hundred
>> usec.
>>
>> I also get that because you are sleeping for somewhere in between 1000
>> and 2000 usecs, that the driver could easily finish a whole slew of
>> commands.  It could do 10 slow commands, or 100 or more fast commands.
>> What this tells me is that the only reason your current implementation
>> of hns_roce_cmq_csq_done() works at all is because you keep the device
>> locked out from any other commands being put on the queue.  As far as I
>> can tell, that's the only way you can guarantee that at some point you
>> will wake up and the head pointer will be exactly at csq->next_to_use.
>> Otherwise, if you didn't block them out, then you could sleep with the
>> head pointer before csq->next_to_use and wake up the next time with it
>> already well past csq->next_to_use.  Am I right about that?  While you
>> are waiting on this command queue, any other commands are blocked from
>> being placed on the command queue?
> Hi, Doug,
> you are right.
> And one "hns_x" ib device only has one command queue in hip08,
> other commands will be blocked when waiting on the command queue.
>>
>> I don't understand what you mean by "so the driver need not to use
>> stream mode to issue cmd".
> Sorry, my expression error.
> stream -> pipeline
> 
> And if you argee, after this patchset has been accepted we will send a
> following up patch :
>     In hns_roce_cmq_send function, replace
>         usleep_range(1000, 2000);
>     with the following statement:
>          udelay(1);
>     And if so, we can avoid using usleep_range function in spin_lock_bh
> spin region,
>     because it probally cause calltrace.

Ok, I'm fine with that.  I'll pull these in.
Doug Ledford Sept. 27, 2017, 12:41 p.m. UTC | #13
On Wed, 2017-09-27 at 08:21 -0400, Doug Ledford wrote:
> > And if you argee, after this patchset has been accepted we will
> send a
> > following up patch :
> >     In hns_roce_cmq_send function, replace
> >         usleep_range(1000, 2000);
> >     with the following statement:
> >          udelay(1);
> >     And if so, we can avoid using usleep_range function in
> spin_lock_bh
> > spin region,
> >     because it probally cause calltrace.
> 
> Ok, I'm fine with that.  I'll pull these in.

OK, these are in my tree, thanks.
Wei Hu (Xavier) Sept. 28, 2017, 4:34 a.m. UTC | #14
On 2017/9/27 20:41, Doug Ledford wrote:
> On Wed, 2017-09-27 at 08:21 -0400, Doug Ledford wrote:
>>> And if you argee, after this patchset has been accepted we will
>> send a
>>> following up patch :
>>>      In hns_roce_cmq_send function, replace
>>>          usleep_range(1000, 2000);
>>>      with the following statement:
>>>           udelay(1);
>>>      And if so, we can avoid using usleep_range function in
>> spin_lock_bh
>>> spin region,
>>>      because it probally cause calltrace.
>> Ok, I'm fine with that.  I'll pull these in.
> OK, these are in my tree, thanks.
Thanks a lot.

     Regards
Wei Hu

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
index 4af403e..94381c2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -362,4 +362,17 @@ 
 #define ROCEE_ECC_UCERR_ALM0_REG		0xB34
 #define ROCEE_ECC_CERR_ALM0_REG			0xB40
 
+/* V2 ROCEE REG */
+#define ROCEE_TX_CMQ_BASEADDR_L_REG		0x07000
+#define ROCEE_TX_CMQ_BASEADDR_H_REG		0x07004
+#define ROCEE_TX_CMQ_DEPTH_REG			0x07008
+#define ROCEE_TX_CMQ_TAIL_REG			0x07010
+#define ROCEE_TX_CMQ_HEAD_REG			0x07014
+
+#define ROCEE_RX_CMQ_BASEADDR_L_REG		0x07018
+#define ROCEE_RX_CMQ_BASEADDR_H_REG		0x0701c
+#define ROCEE_RX_CMQ_DEPTH_REG			0x07020
+#define ROCEE_RX_CMQ_TAIL_REG			0x07024
+#define ROCEE_RX_CMQ_HEAD_REG			0x07028
+
 #endif /* _HNS_ROCE_COMMON_H */
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 3132b68..87c2e07 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -506,6 +506,8 @@  struct hns_roce_caps {
 
 struct hns_roce_hw {
 	int (*reset)(struct hns_roce_dev *hr_dev, bool enable);
+	int (*cmq_init)(struct hns_roce_dev *hr_dev);
+	void (*cmq_exit)(struct hns_roce_dev *hr_dev);
 	void (*hw_profile)(struct hns_roce_dev *hr_dev);
 	int (*hw_init)(struct hns_roce_dev *hr_dev);
 	void (*hw_exit)(struct hns_roce_dev *hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index b35c72d..b610f3a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -41,8 +41,277 @@ 
 #include "hns_roce_device.h"
 #include "hns_roce_cmd.h"
 #include "hns_roce_hem.h"
+#include "hns_roce_hw_v2.h"
 
-static const struct hns_roce_hw hns_roce_hw_v2;
+static int hns_roce_cmq_space(struct hns_roce_v2_cmq_ring *ring)
+{
+	int ntu = ring->next_to_use;
+	int ntc = ring->next_to_clean;
+	int used = (ntu - ntc + ring->desc_num) % ring->desc_num;
+
+	return ring->desc_num - used - 1;
+}
+
+static int hns_roce_alloc_cmq_desc(struct hns_roce_dev *hr_dev,
+				   struct hns_roce_v2_cmq_ring *ring)
+{
+	int size = ring->desc_num * sizeof(struct hns_roce_cmq_desc);
+
+	ring->desc = kzalloc(size, GFP_KERNEL);
+	if (!ring->desc)
+		return -ENOMEM;
+
+	ring->desc_dma_addr = dma_map_single(hr_dev->dev, ring->desc, size,
+					     DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(hr_dev->dev, ring->desc_dma_addr)) {
+		ring->desc_dma_addr = 0;
+		kfree(ring->desc);
+		ring->desc = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void hns_roce_free_cmq_desc(struct hns_roce_dev *hr_dev,
+				   struct hns_roce_v2_cmq_ring *ring)
+{
+	dma_unmap_single(hr_dev->dev, ring->desc_dma_addr,
+			 ring->desc_num * sizeof(struct hns_roce_cmq_desc),
+			 DMA_BIDIRECTIONAL);
+	kfree(ring->desc);
+}
+
+static int hns_roce_init_cmq_ring(struct hns_roce_dev *hr_dev, bool ring_type)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ?
+					    &priv->cmq.csq : &priv->cmq.crq;
+
+	ring->flag = ring_type;
+	ring->next_to_clean = 0;
+	ring->next_to_use = 0;
+
+	return hns_roce_alloc_cmq_desc(hr_dev, ring);
+}
+
+static void hns_roce_cmq_init_regs(struct hns_roce_dev *hr_dev, bool ring_type)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ?
+					    &priv->cmq.csq : &priv->cmq.crq;
+	dma_addr_t dma = ring->desc_dma_addr;
+
+	if (ring_type == TYPE_CSQ) {
+		roce_write(hr_dev, ROCEE_TX_CMQ_BASEADDR_L_REG, (u32)dma);
+		roce_write(hr_dev, ROCEE_TX_CMQ_BASEADDR_H_REG,
+			   upper_32_bits(dma));
+		roce_write(hr_dev, ROCEE_TX_CMQ_DEPTH_REG,
+			  (ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S) |
+			   HNS_ROCE_CMQ_ENABLE);
+		roce_write(hr_dev, ROCEE_TX_CMQ_HEAD_REG, 0);
+		roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, 0);
+	} else {
+		roce_write(hr_dev, ROCEE_RX_CMQ_BASEADDR_L_REG, (u32)dma);
+		roce_write(hr_dev, ROCEE_RX_CMQ_BASEADDR_H_REG,
+			   upper_32_bits(dma));
+		roce_write(hr_dev, ROCEE_RX_CMQ_DEPTH_REG,
+			  (ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S) |
+			   HNS_ROCE_CMQ_ENABLE);
+		roce_write(hr_dev, ROCEE_RX_CMQ_HEAD_REG, 0);
+		roce_write(hr_dev, ROCEE_RX_CMQ_TAIL_REG, 0);
+	}
+}
+
+static int hns_roce_v2_cmq_init(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	int ret;
+
+	/* Setup the queue entries for command queue */
+	priv->cmq.csq.desc_num = 1024;
+	priv->cmq.crq.desc_num = 1024;
+
+	/* Setup the lock for command queue */
+	spin_lock_init(&priv->cmq.csq.lock);
+	spin_lock_init(&priv->cmq.crq.lock);
+
+	/* Setup Tx write back timeout */
+	priv->cmq.tx_timeout = HNS_ROCE_CMQ_TX_TIMEOUT;
+
+	/* Init CSQ */
+	ret = hns_roce_init_cmq_ring(hr_dev, TYPE_CSQ);
+	if (ret) {
+		dev_err(hr_dev->dev, "Init CSQ error, ret = %d.\n", ret);
+		return ret;
+	}
+
+	/* Init CRQ */
+	ret = hns_roce_init_cmq_ring(hr_dev, TYPE_CRQ);
+	if (ret) {
+		dev_err(hr_dev->dev, "Init CRQ error, ret = %d.\n", ret);
+		goto err_crq;
+	}
+
+	/* Init CSQ REG */
+	hns_roce_cmq_init_regs(hr_dev, TYPE_CSQ);
+
+	/* Init CRQ REG */
+	hns_roce_cmq_init_regs(hr_dev, TYPE_CRQ);
+
+	return 0;
+
+err_crq:
+	hns_roce_free_cmq_desc(hr_dev, &priv->cmq.csq);
+
+	return ret;
+}
+
+static void hns_roce_v2_cmq_exit(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+
+	hns_roce_free_cmq_desc(hr_dev, &priv->cmq.csq);
+	hns_roce_free_cmq_desc(hr_dev, &priv->cmq.crq);
+}
+
+void hns_roce_cmq_setup_basic_desc(struct hns_roce_cmq_desc *desc,
+				   enum hns_roce_opcode_type opcode,
+				   bool is_read)
+{
+	memset((void *)desc, 0, sizeof(struct hns_roce_cmq_desc));
+	desc->opcode = cpu_to_le16(opcode);
+	desc->flag =
+		cpu_to_le16(HNS_ROCE_CMD_FLAG_NO_INTR | HNS_ROCE_CMD_FLAG_IN);
+	if (is_read)
+		desc->flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_WR);
+	else
+		desc->flag &= cpu_to_le16(~HNS_ROCE_CMD_FLAG_WR);
+}
+
+static int hns_roce_cmq_csq_done(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	u32 head = roce_read(hr_dev, ROCEE_TX_CMQ_HEAD_REG);
+
+	return head == priv->cmq.csq.next_to_use;
+}
+
+static int hns_roce_cmq_csq_clean(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
+	struct hns_roce_cmq_desc *desc;
+	u16 ntc = csq->next_to_clean;
+	u32 head;
+	int clean = 0;
+
+	desc = &csq->desc[ntc];
+	head = roce_read(hr_dev, ROCEE_TX_CMQ_HEAD_REG);
+	while (head != ntc) {
+		memset(desc, 0, sizeof(*desc));
+		ntc++;
+		if (ntc == csq->desc_num)
+			ntc = 0;
+		desc = &csq->desc[ntc];
+		clean++;
+	}
+	csq->next_to_clean = ntc;
+
+	return clean;
+}
+
+int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
+		      struct hns_roce_cmq_desc *desc, int num)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
+	struct hns_roce_cmq_desc *desc_to_use;
+	bool complete = false;
+	u32 timeout = 0;
+	int handle = 0;
+	u16 desc_ret;
+	int ret = 0;
+	int ntc;
+
+	spin_lock_bh(&csq->lock);
+
+	if (num > hns_roce_cmq_space(csq)) {
+		spin_unlock_bh(&csq->lock);
+		return -EBUSY;
+	}
+
+	/*
+	 * Record the location of desc in the cmq for this time
+	 * which will be use for hardware to write back
+	 */
+	ntc = csq->next_to_use;
+
+	while (handle < num) {
+		desc_to_use = &csq->desc[csq->next_to_use];
+		*desc_to_use = desc[handle];
+		dev_dbg(hr_dev->dev, "set cmq desc:\n");
+		csq->next_to_use++;
+		if (csq->next_to_use == csq->desc_num)
+			csq->next_to_use = 0;
+		handle++;
+	}
+
+	/* Write to hardware */
+	roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, csq->next_to_use);
+
+	/*
+	 * If the command is sync, wait for the firmware to write back,
+	 * if multi descriptors to be sent, use the first one to check
+	 */
+	if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
+		do {
+			if (hns_roce_cmq_csq_done(hr_dev))
+				break;
+			usleep_range(1000, 2000);
+			timeout++;
+		} while (timeout < priv->cmq.tx_timeout);
+	}
+
+	if (hns_roce_cmq_csq_done(hr_dev)) {
+		complete = true;
+		handle = 0;
+		while (handle < num) {
+			/* get the result of hardware write back */
+			desc_to_use = &csq->desc[ntc];
+			desc[handle] = *desc_to_use;
+			dev_dbg(hr_dev->dev, "Get cmq desc:\n");
+			desc_ret = desc[handle].retval;
+			if (desc_ret == CMD_EXEC_SUCCESS)
+				ret = 0;
+			else
+				ret = -EIO;
+			priv->cmq.last_status = desc_ret;
+			ntc++;
+			handle++;
+			if (ntc == csq->desc_num)
+				ntc = 0;
+		}
+	}
+
+	if (!complete)
+		ret = -EAGAIN;
+
+	/* clean the command send queue */
+	handle = hns_roce_cmq_csq_clean(hr_dev);
+	if (handle != num)
+		dev_warn(hr_dev->dev, "Cleaned %d, need to clean %d\n",
+			 handle, num);
+
+	spin_unlock_bh(&csq->lock);
+
+	return ret;
+}
+
+static const struct hns_roce_hw hns_roce_hw_v2 = {
+	.cmq_init = hns_roce_v2_cmq_init,
+	.cmq_exit = hns_roce_v2_cmq_exit,
+};
 
 static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = {
 	{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0},
@@ -87,6 +356,12 @@  static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
 	if (!hr_dev)
 		return -ENOMEM;
 
+	hr_dev->priv = kzalloc(sizeof(struct hns_roce_v2_priv), GFP_KERNEL);
+	if (!hr_dev->priv) {
+		ret = -ENOMEM;
+		goto error_failed_kzalloc;
+	}
+
 	hr_dev->pci_dev = handle->pdev;
 	hr_dev->dev = &handle->pdev->dev;
 	handle->priv = hr_dev;
@@ -106,6 +381,9 @@  static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
 	return 0;
 
 error_failed_get_cfg:
+	kfree(hr_dev->priv);
+
+error_failed_kzalloc:
 	ib_dealloc_device(&hr_dev->ib_dev);
 
 	return ret;
@@ -117,6 +395,7 @@  static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
 	struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
 
 	hns_roce_exit(hr_dev);
+	kfree(hr_dev->priv);
 	ib_dealloc_device(&hr_dev->ib_dev);
 }
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
new file mode 100644
index 0000000..781ae74
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -0,0 +1,111 @@ 
+/*
+ * Copyright (c) 2016-2017 Hisilicon Limited.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _HNS_ROCE_HW_V2_H
+#define _HNS_ROCE_HW_V2_H
+
+#define HNS_ROCE_CMQ_TX_TIMEOUT			200
+
+#define HNS_ROCE_CMD_FLAG_IN_VALID_SHIFT	0
+#define HNS_ROCE_CMD_FLAG_OUT_VALID_SHIFT	1
+#define HNS_ROCE_CMD_FLAG_NEXT_SHIFT		2
+#define HNS_ROCE_CMD_FLAG_WR_OR_RD_SHIFT	3
+#define HNS_ROCE_CMD_FLAG_NO_INTR_SHIFT		4
+#define HNS_ROCE_CMD_FLAG_ERR_INTR_SHIFT	5
+
+#define HNS_ROCE_CMD_FLAG_IN		BIT(HNS_ROCE_CMD_FLAG_IN_VALID_SHIFT)
+#define HNS_ROCE_CMD_FLAG_OUT		BIT(HNS_ROCE_CMD_FLAG_OUT_VALID_SHIFT)
+#define HNS_ROCE_CMD_FLAG_NEXT		BIT(HNS_ROCE_CMD_FLAG_NEXT_SHIFT)
+#define HNS_ROCE_CMD_FLAG_WR		BIT(HNS_ROCE_CMD_FLAG_WR_OR_RD_SHIFT)
+#define HNS_ROCE_CMD_FLAG_NO_INTR	BIT(HNS_ROCE_CMD_FLAG_NO_INTR_SHIFT)
+#define HNS_ROCE_CMD_FLAG_ERR_INTR	BIT(HNS_ROCE_CMD_FLAG_ERR_INTR_SHIFT)
+
+#define HNS_ROCE_CMQ_DESC_NUM_S		3
+#define HNS_ROCE_CMQ_EN_B		16
+#define HNS_ROCE_CMQ_ENABLE		BIT(HNS_ROCE_CMQ_EN_B)
+
+/* CMQ command */
+enum hns_roce_opcode_type {
+	HNS_ROCE_OPC_QUERY_HW_VER			= 0x8000,
+	HNS_ROCE_OPC_CFG_GLOBAL_PARAM			= 0x8001,
+	HNS_ROCE_OPC_ALLOC_PF_RES			= 0x8004,
+	HNS_ROCE_OPC_QUERY_PF_RES			= 0x8400,
+	HNS_ROCE_OPC_ALLOC_VF_RES			= 0x8401,
+	HNS_ROCE_OPC_CFG_BT_ATTR			= 0x8506,
+};
+
+enum {
+	TYPE_CRQ,
+	TYPE_CSQ,
+};
+
+enum hns_roce_cmd_return_status {
+	CMD_EXEC_SUCCESS	= 0,
+	CMD_NO_AUTH		= 1,
+	CMD_NOT_EXEC		= 2,
+	CMD_QUEUE_FULL		= 3,
+};
+
+struct hns_roce_cmq_desc {
+	u16 opcode;
+	u16 flag;
+	u16 retval;
+	u16 rsv;
+	u32 data[6];
+};
+
+struct hns_roce_v2_cmq_ring {
+	dma_addr_t desc_dma_addr;
+	struct hns_roce_cmq_desc *desc;
+	u32 head;
+	u32 tail;
+
+	u16 buf_size;
+	u16 desc_num;
+	int next_to_use;
+	int next_to_clean;
+	u8 flag;
+	spinlock_t lock; /* command queue lock */
+};
+
+struct hns_roce_v2_cmq {
+	struct hns_roce_v2_cmq_ring csq;
+	struct hns_roce_v2_cmq_ring crq;
+	u16 tx_timeout;
+	u16 last_status;
+};
+
+struct hns_roce_v2_priv {
+	struct hns_roce_v2_cmq cmq;
+};
+
+#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index b07d437..4f5a6fd 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -678,6 +678,14 @@  int hns_roce_init(struct hns_roce_dev *hr_dev)
 		}
 	}
 
+	if (hr_dev->hw->cmq_init) {
+		ret = hr_dev->hw->cmq_init(hr_dev);
+		if (ret) {
+			dev_err(dev, "Init RoCE Command Queue failed!\n");
+			goto error_failed_cmq_init;
+		}
+	}
+
 	hr_dev->hw->hw_profile(hr_dev);
 
 	ret = hns_roce_cmd_init(hr_dev);
@@ -750,6 +758,10 @@  int hns_roce_init(struct hns_roce_dev *hr_dev)
 	hns_roce_cmd_cleanup(hr_dev);
 
 error_failed_cmd_init:
+	if (hr_dev->hw->cmq_exit)
+		hr_dev->hw->cmq_exit(hr_dev);
+
+error_failed_cmq_init:
 	if (hr_dev->hw->reset) {
 		ret = hr_dev->hw->reset(hr_dev, false);
 		if (ret)
@@ -774,6 +786,8 @@  void hns_roce_exit(struct hns_roce_dev *hr_dev)
 	if (hr_dev->cmd_mod)
 		hns_roce_cleanup_eq_table(hr_dev);
 	hns_roce_cmd_cleanup(hr_dev);
+	if (hr_dev->hw->cmq_exit)
+		hr_dev->hw->cmq_exit(hr_dev);
 	if (hr_dev->hw->reset)
 		hr_dev->hw->reset(hr_dev, false);
 }