Message ID | 1504084998-64397-6-git-send-email-xavier.huwei@huawei.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: So, we set the send timeout limit here: > + /* Setup Tx write back timeout */ > + priv->cmq.tx_timeout = HNS_ROCE_CMQ_TX_TIMEOUT; > + [ snip ] > +int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, > + struct hns_roce_cmq_desc *desc, int num) > +{ > + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv > *)hr_dev->priv; > + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; > + struct hns_roce_cmq_desc *desc_to_use; > + bool complete = false; > + u32 timeout = 0; > + int handle = 0; > + u16 desc_ret; > + int ret = 0; > + int ntc; > + > + spin_lock_bh(&csq->lock); and we take a bh lock here... > + if (num > hns_roce_cmq_space(csq)) { > + spin_unlock_bh(&csq->lock); > + return -EBUSY; > + } > + > + /* > + * Record the location of desc in the cmq for this time > + * which will be use for hardware to write back > + */ > + ntc = csq->next_to_use; > + > + while (handle < num) { > + desc_to_use = &csq->desc[csq->next_to_use]; > + *desc_to_use = desc[handle]; > + dev_dbg(hr_dev->dev, "set cmq desc:\n"); > + csq->next_to_use++; > + if (csq->next_to_use == csq->desc_num) > + csq->next_to_use = 0; > + handle++; > + } > + > + /* Write to hardware */ > + roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, csq->next_to_use); > + > + /* > + * If the command is sync, wait for the firmware to write > back, > + * if multi descriptors to be sent, use the first one to > check > + */ > + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { > + do { > + if (hns_roce_cmq_csq_done(hr_dev)) > + break; > + usleep_range(1000, 2000); > + timeout++; > + } while (timeout < priv->cmq.tx_timeout); > + } then we spin here for a maximum amount of time between 200 and 400ms, so 1/4 to 1/2 a second. All the time we are holding the bh lock on this CPU. That seems excessive to me. If we are going to spin that long, can you find a way to allocate/reserve your resources, send the command, then drop the bh lock while you spin, and retake it before you complete once the spinning is done? > +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 or you could reduce the size of this define...
On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: > On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: > > So, we set the send timeout limit here: > > > + /* Setup Tx write back timeout */ > > + priv->cmq.tx_timeout = HNS_ROCE_CMQ_TX_TIMEOUT; > > + > > [ snip ] > > > +int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, > > + struct hns_roce_cmq_desc *desc, int num) > > +{ > > + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv > > *)hr_dev->priv; > > + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; > > + struct hns_roce_cmq_desc *desc_to_use; > > + bool complete = false; > > + u32 timeout = 0; > > + int handle = 0; > > + u16 desc_ret; > > + int ret = 0; > > + int ntc; > > + > > + spin_lock_bh(&csq->lock); > > and we take a bh lock here... > > > + if (num > hns_roce_cmq_space(csq)) { > > + spin_unlock_bh(&csq->lock); > > + return -EBUSY; > > + } > > + > > + /* > > + * Record the location of desc in the cmq for this time > > + * which will be use for hardware to write back > > + */ > > + ntc = csq->next_to_use; > > + > > + while (handle < num) { > > + desc_to_use = &csq->desc[csq->next_to_use]; > > + *desc_to_use = desc[handle]; > > + dev_dbg(hr_dev->dev, "set cmq desc:\n"); > > + csq->next_to_use++; > > + if (csq->next_to_use == csq->desc_num) > > + csq->next_to_use = 0; > > + handle++; > > + } > > + > > + /* Write to hardware */ > > + roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, csq->next_to_use); > > + > > + /* > > + * If the command is sync, wait for the firmware to write > > back, > > + * if multi descriptors to be sent, use the first one to > > check > > + */ > > + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { > > + do { > > + if (hns_roce_cmq_csq_done(hr_dev)) > > + break; > > + usleep_range(1000, 2000); > > + timeout++; > > + } while (timeout < priv->cmq.tx_timeout); > > + } > > then we spin here for a maximum amount of time between 200 and 400ms, > so 1/4 to 1/2 a second. All the time we are holding the bh lock on > this CPU. That seems excessive to me. If we are going to spin that > long, can you find a way to allocate/reserve your resources, send the > command, then drop the bh lock while you spin, and retake it before you > complete once the spinning is done? They don't allocate anything in this loop, but checking the pointers are the same, see hns_roce_cmq_csq_done. > > > +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 > > or you could reduce the size of this define... > > -- > Doug Ledford <dledford@redhat.com> > GPG KeyID: B826A3330E572FDD > Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 2FDD > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: > On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: > > On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: > > > > > + /* > > > + * If the command is sync, wait for the firmware to > > > write > > > back, > > > + * if multi descriptors to be sent, use the first one to > > > check > > > + */ > > > + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { > > > + do { > > > + if (hns_roce_cmq_csq_done(hr_dev)) > > > + break; > > > + usleep_range(1000, 2000); > > > + timeout++; > > > + } while (timeout < priv->cmq.tx_timeout); > > > + } > > > > then we spin here for a maximum amount of time between 200 and > > 400ms, > > so 1/4 to 1/2 a second. All the time we are holding the bh lock on > > this CPU. That seems excessive to me. If we are going to spin > > that > > long, can you find a way to allocate/reserve your resources, send > > the > > command, then drop the bh lock while you spin, and retake it before > > you > > complete once the spinning is done? > > They don't allocate anything in this loop, but checking the pointers > are > the same, see hns_roce_cmq_csq_done. I'm not sure I understand your intended implication of your comment. I wasn't concerned about them allocating anything, only that if the hardware is hung, then this loop will hang out for 1/4 to 1/2 a second and hold up all bottom half processing on this CPU in the meantime. That's the sort of things that provides poor overall system behavior. Now, since they are really only checking to see if the hardware has gotten around to their particular command, and their command is part of a ring structure, it's possible to record the original head command, and our new head command, and then release the spin_lock_bh around the entire do{ }while construct, and in hns_roce_cmd_csq_done() you could check that head is not in the range old_head:new_head. That would protect you in case something in the bottom half processing queued up some more commands and from one sleep to the next the head jumped from something other than the new_head to something past new_head, so that head == priv->cmq.csq.next_to_use ends up being perpetually false. But, that's just from a quick read of the code, I could easily be missing something here... > > > > > +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 > > > > or you could reduce the size of this define... > > > > -- > > Doug Ledford <dledford@redhat.com> > > GPG KeyID: B826A3330E572FDD > > Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 > > 2FDD > > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux- > > rdma" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Sep 25, 2017 at 01:36:55PM -0400, Doug Ledford wrote: > On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: > > On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: > > > On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: > > > > > > > + /* > > > > + * If the command is sync, wait for the firmware to > > > > write > > > > back, > > > > + * if multi descriptors to be sent, use the first one to > > > > check > > > > + */ > > > > + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { > > > > + do { > > > > + if (hns_roce_cmq_csq_done(hr_dev)) > > > > + break; > > > > + usleep_range(1000, 2000); > > > > + timeout++; > > > > + } while (timeout < priv->cmq.tx_timeout); > > > > + } > > > > > > then we spin here for a maximum amount of time between 200 and > > > 400ms, > > > so 1/4 to 1/2 a second. All the time we are holding the bh lock on > > > this CPU. That seems excessive to me. If we are going to spin > > > that > > > long, can you find a way to allocate/reserve your resources, send > > > the > > > command, then drop the bh lock while you spin, and retake it before > > > you > > > complete once the spinning is done? > > > > They don't allocate anything in this loop, but checking the pointers > > are > > the same, see hns_roce_cmq_csq_done. > > I'm not sure I understand your intended implication of your comment. I > wasn't concerned about them allocating anything, only that if the > hardware is hung, then this loop will hang out for 1/4 to 1/2 a second > and hold up all bottom half processing on this CPU in the meantime. > That's the sort of things that provides poor overall system behavior. > > Now, since they are really only checking to see if the hardware has > gotten around to their particular command, and their command is part of > a ring structure, it's possible to record the original head command, > and our new head command, and then release the spin_lock_bh around the > entire do{ }while construct, and in hns_roce_cmd_csq_done() you could > check that head is not in the range old_head:new_head. That would > protect you in case something in the bottom half processing queued up > some more commands and from one sleep to the next the head jumped from > something other than the new_head to something past new_head, so that > head == priv->cmq.csq.next_to_use ends up being perpetually false. > But, that's just from a quick read of the code, I could easily be > missing something here... Thanks, I got your point, the phrase "can you find a way to allocate/reserve your resources," confused me back then. > > > > > > > > +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 > > > > > > or you could reduce the size of this define... > > > > > > -- > > > Doug Ledford <dledford@redhat.com> > > > GPG KeyID: B826A3330E572FDD > > > Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 > > > 2FDD > > > > > > -- > > > To unsubscribe from this list: send the line "unsubscribe linux- > > > rdma" in > > > the body of a message to majordomo@vger.kernel.org > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > Doug Ledford <dledford@redhat.com> > GPG KeyID: B826A3330E572FDD > Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 2FDD >
On 2017/9/26 1:36, Doug Ledford wrote: > On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: >> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: >>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: >>> >>>> + /* >>>> + * If the command is sync, wait for the firmware to >>>> write >>>> back, >>>> + * if multi descriptors to be sent, use the first one to >>>> check >>>> + */ >>>> + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { >>>> + do { >>>> + if (hns_roce_cmq_csq_done(hr_dev)) >>>> + break; >>>> + usleep_range(1000, 2000); >>>> + timeout++; >>>> + } while (timeout < priv->cmq.tx_timeout); >>>> + } >>> then we spin here for a maximum amount of time between 200 and >>> 400ms, >>> so 1/4 to 1/2 a second. All the time we are holding the bh lock on >>> this CPU. That seems excessive to me. If we are going to spin >>> that >>> long, can you find a way to allocate/reserve your resources, send >>> the >>> command, then drop the bh lock while you spin, and retake it before >>> you >>> complete once the spinning is done? >> They don't allocate anything in this loop, but checking the pointers >> are >> the same, see hns_roce_cmq_csq_done. > I'm not sure I understand your intended implication of your comment. I > wasn't concerned about them allocating anything, only that if the > hardware is hung, then this loop will hang out for 1/4 to 1/2 a second > and hold up all bottom half processing on this CPU in the meantime. > That's the sort of things that provides poor overall system behavior. > > Now, since they are really only checking to see if the hardware has > gotten around to their particular command, and their command is part of > a ring structure, it's possible to record the original head command, > and our new head command, and then release the spin_lock_bh around the > entire do{ }while construct, and in hns_roce_cmd_csq_done() you could > check that head is not in the range old_head:new_head. That would > protect you in case something in the bottom half processing queued up > some more commands and from one sleep to the next the head jumped from > something other than the new_head to something past new_head, so that > head == priv->cmq.csq.next_to_use ends up being perpetually false. > But, that's just from a quick read of the code, I could easily be > missing something here... Hi, Doug Driver issues the cmds in cmq, and firmware gets and processes them. The firmware process only one cmd at the same time, and it will take about serveral to 200 us in one cmd currently, so the driver need not to use stream mode to issue cmd. Regards Wei Hu >>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 >>> or you could reduce the size of this define... >>> >>> -- >>> Doug Ledford <dledford@redhat.com> >>> GPG KeyID: B826A3330E572FDD >>> Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 >>> 2FDD >>> >>> -- >>> To unsubscribe from this list: send the line "unsubscribe linux- >>> rdma" in >>> the body of a message to majordomo@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2017/9/26 21:13, Wei Hu (Xavier) wrote: > > > On 2017/9/26 1:36, Doug Ledford wrote: >> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: >>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: >>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: >>>> >>>>> + /* >>>>> + * If the command is sync, wait for the firmware to >>>>> write >>>>> back, >>>>> + * if multi descriptors to be sent, use the first one to >>>>> check >>>>> + */ >>>>> + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { >>>>> + do { >>>>> + if (hns_roce_cmq_csq_done(hr_dev)) >>>>> + break; >>>>> + usleep_range(1000, 2000); >>>>> + timeout++; >>>>> + } while (timeout < priv->cmq.tx_timeout); >>>>> + } >>>> then we spin here for a maximum amount of time between 200 and >>>> 400ms, >>>> so 1/4 to 1/2 a second. All the time we are holding the bh lock on >>>> this CPU. That seems excessive to me. If we are going to spin >>>> that >>>> long, can you find a way to allocate/reserve your resources, send >>>> the >>>> command, then drop the bh lock while you spin, and retake it before >>>> you >>>> complete once the spinning is done? >>> They don't allocate anything in this loop, but checking the pointers >>> are >>> the same, see hns_roce_cmq_csq_done. >> I'm not sure I understand your intended implication of your comment. I >> wasn't concerned about them allocating anything, only that if the >> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second >> and hold up all bottom half processing on this CPU in the meantime. >> That's the sort of things that provides poor overall system behavior. >> >> Now, since they are really only checking to see if the hardware has >> gotten around to their particular command, and their command is part of >> a ring structure, it's possible to record the original head command, >> and our new head command, and then release the spin_lock_bh around the >> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could >> check that head is not in the range old_head:new_head. That would >> protect you in case something in the bottom half processing queued up >> some more commands and from one sleep to the next the head jumped from >> something other than the new_head to something past new_head, so that >> head == priv->cmq.csq.next_to_use ends up being perpetually false. >> But, that's just from a quick read of the code, I could easily be >> missing something here... > Hi, Doug > Driver issues the cmds in cmq, and firmware gets and processes them. > The firmware process only one cmd at the same time, and it will take > about serveral to 200 us in one cmd currently, so the driver need > not to use stream mode to issue cmd. > > Regards > Wei Hu Hi, Doug We can replace usleep_range(1000, 2000); with the following statement: udelay(1); to avoid spinning too long time and using usleep_range function in spin_lock_bh locked region. Can you give some suggestions? Regards Wei Hu >>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 >>>> or you could reduce the size of this define... >>>> >>>> -- >>>> Doug Ledford <dledford@redhat.com> >>>> GPG KeyID: B826A3330E572FDD >>>> Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 >>>> 2FDD >>>> >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe linux- >>>> rdma" in >>>> the body of a message to majordomo@vger.kernel.org >>>> More majordomo info at http://vger.kernel.org/majordomo-info.html > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Sep 26, 2017 at 11:24:41PM +0800, Wei Hu (Xavier) wrote: > > > On 2017/9/26 21:13, Wei Hu (Xavier) wrote: > > > > > > On 2017/9/26 1:36, Doug Ledford wrote: > > > On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: > > > > On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: > > > > > On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: > > > > > > > > > > > + /* > > > > > > + * If the command is sync, wait for the firmware to > > > > > > write > > > > > > back, > > > > > > + * if multi descriptors to be sent, use the first one to > > > > > > check > > > > > > + */ > > > > > > + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { > > > > > > + do { > > > > > > + if (hns_roce_cmq_csq_done(hr_dev)) > > > > > > + break; > > > > > > + usleep_range(1000, 2000); > > > > > > + timeout++; > > > > > > + } while (timeout < priv->cmq.tx_timeout); > > > > > > + } > > > > > then we spin here for a maximum amount of time between 200 and > > > > > 400ms, > > > > > so 1/4 to 1/2 a second. All the time we are holding the bh lock on > > > > > this CPU. That seems excessive to me. If we are going to spin > > > > > that > > > > > long, can you find a way to allocate/reserve your resources, send > > > > > the > > > > > command, then drop the bh lock while you spin, and retake it before > > > > > you > > > > > complete once the spinning is done? > > > > They don't allocate anything in this loop, but checking the pointers > > > > are > > > > the same, see hns_roce_cmq_csq_done. > > > I'm not sure I understand your intended implication of your comment. I > > > wasn't concerned about them allocating anything, only that if the > > > hardware is hung, then this loop will hang out for 1/4 to 1/2 a second > > > and hold up all bottom half processing on this CPU in the meantime. > > > That's the sort of things that provides poor overall system behavior. > > > > > > Now, since they are really only checking to see if the hardware has > > > gotten around to their particular command, and their command is part of > > > a ring structure, it's possible to record the original head command, > > > and our new head command, and then release the spin_lock_bh around the > > > entire do{ }while construct, and in hns_roce_cmd_csq_done() you could > > > check that head is not in the range old_head:new_head. That would > > > protect you in case something in the bottom half processing queued up > > > some more commands and from one sleep to the next the head jumped from > > > something other than the new_head to something past new_head, so that > > > head == priv->cmq.csq.next_to_use ends up being perpetually false. > > > But, that's just from a quick read of the code, I could easily be > > > missing something here... > > Hi, Doug > > Driver issues the cmds in cmq, and firmware gets and processes them. > > The firmware process only one cmd at the same time, and it will take > > about serveral to 200 us in one cmd currently, so the driver need > > not to use stream mode to issue cmd. > > > > Regards > > Wei Hu > Hi, Doug > We can replace usleep_range(1000, 2000); with the following statement: > udelay(1); > to avoid spinning too long time and using usleep_range function in > spin_lock_bh locked region. > > Can you give some suggestions? He already suggested number of options, while the simple one was to reduce HNS_ROCE_CMQ_TX_TIMEOUT from 200 to be something sane (for example 5). Thanks > > Regards > Wei Hu > > > > > > +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 > > > > > or you could reduce the size of this define... > > > > > > > > > > -- > > > > > Doug Ledford <dledford@redhat.com> > > > > > GPG KeyID: B826A3330E572FDD > > > > > Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 > > > > > 2FDD > > > > > > > > > > -- > > > > > To unsubscribe from this list: send the line "unsubscribe linux- > > > > > rdma" in > > > > > the body of a message to majordomo@vger.kernel.org > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html >
On 2017/9/26 23:51, Leon Romanovsky wrote: > On Tue, Sep 26, 2017 at 11:24:41PM +0800, Wei Hu (Xavier) wrote: >> >> On 2017/9/26 21:13, Wei Hu (Xavier) wrote: >>> >>> On 2017/9/26 1:36, Doug Ledford wrote: >>>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: >>>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: >>>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: >>>>>> >>>>>>> + /* >>>>>>> + * If the command is sync, wait for the firmware to >>>>>>> write >>>>>>> back, >>>>>>> + * if multi descriptors to be sent, use the first one to >>>>>>> check >>>>>>> + */ >>>>>>> + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { >>>>>>> + do { >>>>>>> + if (hns_roce_cmq_csq_done(hr_dev)) >>>>>>> + break; >>>>>>> + usleep_range(1000, 2000); >>>>>>> + timeout++; >>>>>>> + } while (timeout < priv->cmq.tx_timeout); >>>>>>> + } >>>>>> then we spin here for a maximum amount of time between 200 and >>>>>> 400ms, >>>>>> so 1/4 to 1/2 a second. All the time we are holding the bh lock on >>>>>> this CPU. That seems excessive to me. If we are going to spin >>>>>> that >>>>>> long, can you find a way to allocate/reserve your resources, send >>>>>> the >>>>>> command, then drop the bh lock while you spin, and retake it before >>>>>> you >>>>>> complete once the spinning is done? >>>>> They don't allocate anything in this loop, but checking the pointers >>>>> are >>>>> the same, see hns_roce_cmq_csq_done. >>>> I'm not sure I understand your intended implication of your comment. I >>>> wasn't concerned about them allocating anything, only that if the >>>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second >>>> and hold up all bottom half processing on this CPU in the meantime. >>>> That's the sort of things that provides poor overall system behavior. >>>> >>>> Now, since they are really only checking to see if the hardware has >>>> gotten around to their particular command, and their command is part of >>>> a ring structure, it's possible to record the original head command, >>>> and our new head command, and then release the spin_lock_bh around the >>>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could >>>> check that head is not in the range old_head:new_head. That would >>>> protect you in case something in the bottom half processing queued up >>>> some more commands and from one sleep to the next the head jumped from >>>> something other than the new_head to something past new_head, so that >>>> head == priv->cmq.csq.next_to_use ends up being perpetually false. >>>> But, that's just from a quick read of the code, I could easily be >>>> missing something here... >>> Hi, Doug >>> Driver issues the cmds in cmq, and firmware gets and processes them. >>> The firmware process only one cmd at the same time, and it will take >>> about serveral to 200 us in one cmd currently, so the driver need >>> not to use stream mode to issue cmd. >>> >>> Regards >>> Wei Hu >> Hi, Doug >> We can replace usleep_range(1000, 2000); with the following statement: >> udelay(1); >> to avoid spinning too long time and using usleep_range function in >> spin_lock_bh locked region. >> >> Can you give som=e suggestions? > He already suggested number of options, while the simple one was to reduce > HNS_ROCE_CMQ_TX_TIMEOUT from 200 to be something sane (for example 5). > > Thanks Thanks, Leon Actually We want to use another option: replace usleep_range(1000, 2000); with the following statement: udelay(1); And if so, we can avoid using usleep_range function in spin_lock_bh spin region, because it probally cause calltrace by rcu. Best regards Wei Hu >> Regards >> Wei Hu >>>>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 >>>>>> or you could reduce the size of this define... >>>>>> >>>>>> -- >>>>>> Doug Ledford <dledford@redhat.com> >>>>>> GPG KeyID: B826A3330E572FDD >>>>>> Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 >>>>>> 2FDD >>>>>> >>>>>> -- >>>>>> To unsubscribe from this list: send the line "unsubscribe linux- >>>>>> rdma" in >>>>>> the body of a message to majordomo@vger.kernel.org >>>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html >>> >>> -- >>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >>> the body of a message to majordomo@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 9/26/2017 9:13 AM, Wei Hu (Xavier) wrote: > > > On 2017/9/26 1:36, Doug Ledford wrote: >> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: >>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: >>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: >>>> >>>>> + /* >>>>> + * If the command is sync, wait for the firmware to >>>>> write >>>>> back, >>>>> + * if multi descriptors to be sent, use the first one to >>>>> check >>>>> + */ >>>>> + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { >>>>> + do { >>>>> + if (hns_roce_cmq_csq_done(hr_dev)) >>>>> + break; >>>>> + usleep_range(1000, 2000); >>>>> + timeout++; >>>>> + } while (timeout < priv->cmq.tx_timeout); >>>>> + } >>>> then we spin here for a maximum amount of time between 200 and >>>> 400ms, >>>> so 1/4 to 1/2 a second. All the time we are holding the bh lock on >>>> this CPU. That seems excessive to me. If we are going to spin >>>> that >>>> long, can you find a way to allocate/reserve your resources, send >>>> the >>>> command, then drop the bh lock while you spin, and retake it before >>>> you >>>> complete once the spinning is done? >>> They don't allocate anything in this loop, but checking the pointers >>> are >>> the same, see hns_roce_cmq_csq_done. >> I'm not sure I understand your intended implication of your comment. I >> wasn't concerned about them allocating anything, only that if the >> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second >> and hold up all bottom half processing on this CPU in the meantime. >> That's the sort of things that provides poor overall system behavior. >> >> Now, since they are really only checking to see if the hardware has >> gotten around to their particular command, and their command is part of >> a ring structure, it's possible to record the original head command, >> and our new head command, and then release the spin_lock_bh around the >> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could >> check that head is not in the range old_head:new_head. That would >> protect you in case something in the bottom half processing queued up >> some more commands and from one sleep to the next the head jumped from >> something other than the new_head to something past new_head, so that >> head == priv->cmq.csq.next_to_use ends up being perpetually false. >> But, that's just from a quick read of the code, I could easily be >> missing something here... > Hi, Doug > Driver issues the cmds in cmq, and firmware gets and processes them. > The firmware process only one cmd at the same time, and it will take > about serveral to 200 us in one cmd currently, so the driver need > not to use stream mode to issue cmd. I'm not sure I understand your response here. I get that the driver issues cmds in the cmq, and that the firmware gets them and processes them. I get that the firmware will only work on one command at a time and only move to the next one once the current one is complete. I get that commands take anywhere from a few usec to a couple hundred usec. I also get that because you are sleeping for somewhere in between 1000 and 2000 usecs, that the driver could easily finish a whole slew of commands. It could do 10 slow commands, or 100 or more fast commands. What this tells me is that the only reason your current implementation of hns_roce_cmq_csq_done() works at all is because you keep the device locked out from any other commands being put on the queue. As far as I can tell, that's the only way you can guarantee that at some point you will wake up and the head pointer will be exactly at csq->next_to_use. Otherwise, if you didn't block them out, then you could sleep with the head pointer before csq->next_to_use and wake up the next time with it already well past csq->next_to_use. Am I right about that? While you are waiting on this command queue, any other commands are blocked from being placed on the command queue? I don't understand what you mean by "so the driver need not to use stream mode to issue cmd". > Regards > Wei Hu >>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 >>>> or you could reduce the size of this define... >>>> >>>> -- >>>> Doug Ledford <dledford@redhat.com> >>>> GPG KeyID: B826A3330E572FDD >>>> Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 >>>> 2FDD >>>> >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe linux- >>>> rdma" in >>>> the body of a message to majordomo@vger.kernel.org >>>> More majordomo info at http://vger.kernel.org/majordomo-info.html > >
On 2017/9/27 0:13, Wei Hu (Xavier) wrote: > > > On 2017/9/26 23:51, Leon Romanovsky wrote: >> On Tue, Sep 26, 2017 at 11:24:41PM +0800, Wei Hu (Xavier) wrote: >>> >>> On 2017/9/26 21:13, Wei Hu (Xavier) wrote: >>>> >>>> On 2017/9/26 1:36, Doug Ledford wrote: >>>>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: >>>>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: >>>>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: >>>>>>> >>>>>>>> + /* >>>>>>>> + * If the command is sync, wait for the firmware to >>>>>>>> write >>>>>>>> back, >>>>>>>> + * if multi descriptors to be sent, use the first one to >>>>>>>> check >>>>>>>> + */ >>>>>>>> + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { >>>>>>>> + do { >>>>>>>> + if (hns_roce_cmq_csq_done(hr_dev)) >>>>>>>> + break; >>>>>>>> + usleep_range(1000, 2000); >>>>>>>> + timeout++; >>>>>>>> + } while (timeout < priv->cmq.tx_timeout); >>>>>>>> + } >>>>>>> then we spin here for a maximum amount of time between 200 and >>>>>>> 400ms, >>>>>>> so 1/4 to 1/2 a second. All the time we are holding the bh lock on >>>>>>> this CPU. That seems excessive to me. If we are going to spin >>>>>>> that >>>>>>> long, can you find a way to allocate/reserve your resources, send >>>>>>> the >>>>>>> command, then drop the bh lock while you spin, and retake it before >>>>>>> you >>>>>>> complete once the spinning is done? >>>>>> They don't allocate anything in this loop, but checking the pointers >>>>>> are >>>>>> the same, see hns_roce_cmq_csq_done. >>>>> I'm not sure I understand your intended implication of your >>>>> comment. I >>>>> wasn't concerned about them allocating anything, only that if the >>>>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a >>>>> second >>>>> and hold up all bottom half processing on this CPU in the meantime. >>>>> That's the sort of things that provides poor overall system behavior. >>>>> >>>>> Now, since they are really only checking to see if the hardware has >>>>> gotten around to their particular command, and their command is >>>>> part of >>>>> a ring structure, it's possible to record the original head command, >>>>> and our new head command, and then release the spin_lock_bh around >>>>> the >>>>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could >>>>> check that head is not in the range old_head:new_head. That would >>>>> protect you in case something in the bottom half processing queued up >>>>> some more commands and from one sleep to the next the head jumped >>>>> from >>>>> something other than the new_head to something past new_head, so that >>>>> head == priv->cmq.csq.next_to_use ends up being perpetually false. >>>>> But, that's just from a quick read of the code, I could easily be >>>>> missing something here... >>>> Hi, Doug >>>> Driver issues the cmds in cmq, and firmware gets and processes >>>> them. >>>> The firmware process only one cmd at the same time, and it >>>> will take >>>> about serveral to 200 us in one cmd currently, so the driver need >>>> not to use stream mode to issue cmd. >>>> >>>> Regards >>>> Wei Hu >>> Hi, Doug >>> We can replace usleep_range(1000, 2000); with the following >>> statement: >>> udelay(1); >>> to avoid spinning too long time and using usleep_range function in >>> spin_lock_bh locked region. >>> >>> Can you give som=e suggestions? >> He already suggested number of options, while the simple one was to >> reduce >> HNS_ROCE_CMQ_TX_TIMEOUT from 200 to be something sane (for example 5). >> >> Thanks > Thanks, Leon > Actually We want to use another option: > replace usleep_range(1000, 2000); with the following statement: > udelay(1); > And if so, we can avoid using usleep_range function in > spin_lock_bh spin region, > because it probally cause calltrace by rcu. > Can we send a following up patch to fix this problem? or send patch v2? Thanks > Best regards > Wei Hu >>> Regards >>> Wei Hu >>>>>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 >>>>>>> or you could reduce the size of this define... >>>>>>> >>>>>>> -- >>>>>>> Doug Ledford <dledford@redhat.com> >>>>>>> GPG KeyID: B826A3330E572FDD >>>>>>> Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 >>>>>>> 0E57 >>>>>>> 2FDD >>>>>>> >>>>>>> -- >>>>>>> To unsubscribe from this list: send the line "unsubscribe linux- >>>>>>> rdma" in >>>>>>> the body of a message to majordomo@vger.kernel.org >>>>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html >>>> >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe >>>> linux-rdma" in >>>> the body of a message to majordomo@vger.kernel.org >>>> More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2017/9/27 0:18, Doug Ledford wrote: > On 9/26/2017 9:13 AM, Wei Hu (Xavier) wrote: >> >> On 2017/9/26 1:36, Doug Ledford wrote: >>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: >>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: >>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: >>>>> >>>>>> + /* >>>>>> + * If the command is sync, wait for the firmware to >>>>>> write >>>>>> back, >>>>>> + * if multi descriptors to be sent, use the first one to >>>>>> check >>>>>> + */ >>>>>> + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { >>>>>> + do { >>>>>> + if (hns_roce_cmq_csq_done(hr_dev)) >>>>>> + break; >>>>>> + usleep_range(1000, 2000); >>>>>> + timeout++; >>>>>> + } while (timeout < priv->cmq.tx_timeout); >>>>>> + } >>>>> then we spin here for a maximum amount of time between 200 and >>>>> 400ms, >>>>> so 1/4 to 1/2 a second. All the time we are holding the bh lock on >>>>> this CPU. That seems excessive to me. If we are going to spin >>>>> that >>>>> long, can you find a way to allocate/reserve your resources, send >>>>> the >>>>> command, then drop the bh lock while you spin, and retake it before >>>>> you >>>>> complete once the spinning is done? >>>> They don't allocate anything in this loop, but checking the pointers >>>> are >>>> the same, see hns_roce_cmq_csq_done. >>> I'm not sure I understand your intended implication of your comment. I >>> wasn't concerned about them allocating anything, only that if the >>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second >>> and hold up all bottom half processing on this CPU in the meantime. >>> That's the sort of things that provides poor overall system behavior. >>> >>> Now, since they are really only checking to see if the hardware has >>> gotten around to their particular command, and their command is part of >>> a ring structure, it's possible to record the original head command, >>> and our new head command, and then release the spin_lock_bh around the >>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could >>> check that head is not in the range old_head:new_head. That would >>> protect you in case something in the bottom half processing queued up >>> some more commands and from one sleep to the next the head jumped from >>> something other than the new_head to something past new_head, so that >>> head == priv->cmq.csq.next_to_use ends up being perpetually false. >>> But, that's just from a quick read of the code, I could easily be >>> missing something here... >> Hi, Doug >> Driver issues the cmds in cmq, and firmware gets and processes them. >> The firmware process only one cmd at the same time, and it will take >> about serveral to 200 us in one cmd currently, so the driver need >> not to use stream mode to issue cmd. > I'm not sure I understand your response here. > > I get that the driver issues cmds in the cmq, and that the firmware gets > them and processes them. > > I get that the firmware will only work on one command at a time and only > move to the next one once the current one is complete. > > I get that commands take anywhere from a few usec to a couple hundred usec. > > I also get that because you are sleeping for somewhere in between 1000 > and 2000 usecs, that the driver could easily finish a whole slew of > commands. It could do 10 slow commands, or 100 or more fast commands. > What this tells me is that the only reason your current implementation > of hns_roce_cmq_csq_done() works at all is because you keep the device > locked out from any other commands being put on the queue. As far as I > can tell, that's the only way you can guarantee that at some point you > will wake up and the head pointer will be exactly at csq->next_to_use. > Otherwise, if you didn't block them out, then you could sleep with the > head pointer before csq->next_to_use and wake up the next time with it > already well past csq->next_to_use. Am I right about that? While you > are waiting on this command queue, any other commands are blocked from > being placed on the command queue? Hi, Doug, you are right. And one "hns_x" ib device only has one command queue in hip08, other commands will be blocked when waiting on the command queue. > > I don't understand what you mean by "so the driver need not to use > stream mode to issue cmd". Sorry, my expression error. stream -> pipeline And if you argee, after this patchset has been accepted we will send a following up patch : In hns_roce_cmq_send function, replace usleep_range(1000, 2000); with the following statement: udelay(1); And if so, we can avoid using usleep_range function in spin_lock_bh spin region, because it probally cause calltrace. Best regards Wei Hu >> Regards >> Wei Hu >>>>>> +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 >>>>> or you could reduce the size of this define... >>>>> >>>>> -- >>>>> Doug Ledford <dledford@redhat.com> >>>>> GPG KeyID: B826A3330E572FDD >>>>> Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 >>>>> 2FDD >>>>> >>>>> -- >>>>> To unsubscribe from this list: send the line "unsubscribe linux- >>>>> rdma" in >>>>> the body of a message to majordomo@vger.kernel.org >>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html >> > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 9/26/2017 10:46 PM, Wei Hu (Xavier) wrote: > > > On 2017/9/27 0:18, Doug Ledford wrote: >> On 9/26/2017 9:13 AM, Wei Hu (Xavier) wrote: >>> >>> On 2017/9/26 1:36, Doug Ledford wrote: >>>> On Mon, 2017-09-25 at 20:18 +0300, Leon Romanovsky wrote: >>>>> On Mon, Sep 25, 2017 at 01:06:53PM -0400, Doug Ledford wrote: >>>>>> On Wed, 2017-08-30 at 17:23 +0800, Wei Hu (Xavier) wrote: >>>>>> >>>>>>> + /* >>>>>>> + * If the command is sync, wait for the firmware to >>>>>>> write >>>>>>> back, >>>>>>> + * if multi descriptors to be sent, use the first one to >>>>>>> check >>>>>>> + */ >>>>>>> + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { >>>>>>> + do { >>>>>>> + if (hns_roce_cmq_csq_done(hr_dev)) >>>>>>> + break; >>>>>>> + usleep_range(1000, 2000); >>>>>>> + timeout++; >>>>>>> + } while (timeout < priv->cmq.tx_timeout); >>>>>>> + } >>>>>> then we spin here for a maximum amount of time between 200 and >>>>>> 400ms, >>>>>> so 1/4 to 1/2 a second. All the time we are holding the bh lock on >>>>>> this CPU. That seems excessive to me. If we are going to spin >>>>>> that >>>>>> long, can you find a way to allocate/reserve your resources, send >>>>>> the >>>>>> command, then drop the bh lock while you spin, and retake it before >>>>>> you >>>>>> complete once the spinning is done? >>>>> They don't allocate anything in this loop, but checking the pointers >>>>> are >>>>> the same, see hns_roce_cmq_csq_done. >>>> I'm not sure I understand your intended implication of your comment. I >>>> wasn't concerned about them allocating anything, only that if the >>>> hardware is hung, then this loop will hang out for 1/4 to 1/2 a second >>>> and hold up all bottom half processing on this CPU in the meantime. >>>> That's the sort of things that provides poor overall system behavior. >>>> >>>> Now, since they are really only checking to see if the hardware has >>>> gotten around to their particular command, and their command is part of >>>> a ring structure, it's possible to record the original head command, >>>> and our new head command, and then release the spin_lock_bh around the >>>> entire do{ }while construct, and in hns_roce_cmd_csq_done() you could >>>> check that head is not in the range old_head:new_head. That would >>>> protect you in case something in the bottom half processing queued up >>>> some more commands and from one sleep to the next the head jumped from >>>> something other than the new_head to something past new_head, so that >>>> head == priv->cmq.csq.next_to_use ends up being perpetually false. >>>> But, that's just from a quick read of the code, I could easily be >>>> missing something here... >>> Hi, Doug >>> Driver issues the cmds in cmq, and firmware gets and processes >>> them. >>> The firmware process only one cmd at the same time, and it will >>> take >>> about serveral to 200 us in one cmd currently, so the driver need >>> not to use stream mode to issue cmd. >> I'm not sure I understand your response here. >> >> I get that the driver issues cmds in the cmq, and that the firmware gets >> them and processes them. >> >> I get that the firmware will only work on one command at a time and only >> move to the next one once the current one is complete. >> >> I get that commands take anywhere from a few usec to a couple hundred >> usec. >> >> I also get that because you are sleeping for somewhere in between 1000 >> and 2000 usecs, that the driver could easily finish a whole slew of >> commands. It could do 10 slow commands, or 100 or more fast commands. >> What this tells me is that the only reason your current implementation >> of hns_roce_cmq_csq_done() works at all is because you keep the device >> locked out from any other commands being put on the queue. As far as I >> can tell, that's the only way you can guarantee that at some point you >> will wake up and the head pointer will be exactly at csq->next_to_use. >> Otherwise, if you didn't block them out, then you could sleep with the >> head pointer before csq->next_to_use and wake up the next time with it >> already well past csq->next_to_use. Am I right about that? While you >> are waiting on this command queue, any other commands are blocked from >> being placed on the command queue? > Hi, Doug, > you are right. > And one "hns_x" ib device only has one command queue in hip08, > other commands will be blocked when waiting on the command queue. >> >> I don't understand what you mean by "so the driver need not to use >> stream mode to issue cmd". > Sorry, my expression error. > stream -> pipeline > > And if you argee, after this patchset has been accepted we will send a > following up patch : > In hns_roce_cmq_send function, replace > usleep_range(1000, 2000); > with the following statement: > udelay(1); > And if so, we can avoid using usleep_range function in spin_lock_bh > spin region, > because it probally cause calltrace. Ok, I'm fine with that. I'll pull these in.
On Wed, 2017-09-27 at 08:21 -0400, Doug Ledford wrote: > > And if you argee, after this patchset has been accepted we will > send a > > following up patch : > > In hns_roce_cmq_send function, replace > > usleep_range(1000, 2000); > > with the following statement: > > udelay(1); > > And if so, we can avoid using usleep_range function in > spin_lock_bh > > spin region, > > because it probally cause calltrace. > > Ok, I'm fine with that. I'll pull these in. OK, these are in my tree, thanks.
On 2017/9/27 20:41, Doug Ledford wrote: > On Wed, 2017-09-27 at 08:21 -0400, Doug Ledford wrote: >>> And if you argee, after this patchset has been accepted we will >> send a >>> following up patch : >>> In hns_roce_cmq_send function, replace >>> usleep_range(1000, 2000); >>> with the following statement: >>> udelay(1); >>> And if so, we can avoid using usleep_range function in >> spin_lock_bh >>> spin region, >>> because it probally cause calltrace. >> Ok, I'm fine with that. I'll pull these in. > OK, these are in my tree, thanks. Thanks a lot. Regards Wei Hu -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h index 4af403e..94381c2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_common.h +++ b/drivers/infiniband/hw/hns/hns_roce_common.h @@ -362,4 +362,17 @@ #define ROCEE_ECC_UCERR_ALM0_REG 0xB34 #define ROCEE_ECC_CERR_ALM0_REG 0xB40 +/* V2 ROCEE REG */ +#define ROCEE_TX_CMQ_BASEADDR_L_REG 0x07000 +#define ROCEE_TX_CMQ_BASEADDR_H_REG 0x07004 +#define ROCEE_TX_CMQ_DEPTH_REG 0x07008 +#define ROCEE_TX_CMQ_TAIL_REG 0x07010 +#define ROCEE_TX_CMQ_HEAD_REG 0x07014 + +#define ROCEE_RX_CMQ_BASEADDR_L_REG 0x07018 +#define ROCEE_RX_CMQ_BASEADDR_H_REG 0x0701c +#define ROCEE_RX_CMQ_DEPTH_REG 0x07020 +#define ROCEE_RX_CMQ_TAIL_REG 0x07024 +#define ROCEE_RX_CMQ_HEAD_REG 0x07028 + #endif /* _HNS_ROCE_COMMON_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 3132b68..87c2e07 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -506,6 +506,8 @@ struct hns_roce_caps { struct hns_roce_hw { int (*reset)(struct hns_roce_dev *hr_dev, bool enable); + int (*cmq_init)(struct hns_roce_dev *hr_dev); + void (*cmq_exit)(struct hns_roce_dev *hr_dev); void (*hw_profile)(struct hns_roce_dev *hr_dev); int (*hw_init)(struct hns_roce_dev *hr_dev); void (*hw_exit)(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b35c72d..b610f3a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -41,8 +41,277 @@ #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" +#include "hns_roce_hw_v2.h" -static const struct hns_roce_hw hns_roce_hw_v2; +static int hns_roce_cmq_space(struct hns_roce_v2_cmq_ring *ring) +{ + int ntu = ring->next_to_use; + int ntc = ring->next_to_clean; + int used = (ntu - ntc + ring->desc_num) % ring->desc_num; + + return ring->desc_num - used - 1; +} + +static int hns_roce_alloc_cmq_desc(struct hns_roce_dev *hr_dev, + struct hns_roce_v2_cmq_ring *ring) +{ + int size = ring->desc_num * sizeof(struct hns_roce_cmq_desc); + + ring->desc = kzalloc(size, GFP_KERNEL); + if (!ring->desc) + return -ENOMEM; + + ring->desc_dma_addr = dma_map_single(hr_dev->dev, ring->desc, size, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(hr_dev->dev, ring->desc_dma_addr)) { + ring->desc_dma_addr = 0; + kfree(ring->desc); + ring->desc = NULL; + return -ENOMEM; + } + + return 0; +} + +static void hns_roce_free_cmq_desc(struct hns_roce_dev *hr_dev, + struct hns_roce_v2_cmq_ring *ring) +{ + dma_unmap_single(hr_dev->dev, ring->desc_dma_addr, + ring->desc_num * sizeof(struct hns_roce_cmq_desc), + DMA_BIDIRECTIONAL); + kfree(ring->desc); +} + +static int hns_roce_init_cmq_ring(struct hns_roce_dev *hr_dev, bool ring_type) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ? + &priv->cmq.csq : &priv->cmq.crq; + + ring->flag = ring_type; + ring->next_to_clean = 0; + ring->next_to_use = 0; + + return hns_roce_alloc_cmq_desc(hr_dev, ring); +} + +static void hns_roce_cmq_init_regs(struct hns_roce_dev *hr_dev, bool ring_type) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ? + &priv->cmq.csq : &priv->cmq.crq; + dma_addr_t dma = ring->desc_dma_addr; + + if (ring_type == TYPE_CSQ) { + roce_write(hr_dev, ROCEE_TX_CMQ_BASEADDR_L_REG, (u32)dma); + roce_write(hr_dev, ROCEE_TX_CMQ_BASEADDR_H_REG, + upper_32_bits(dma)); + roce_write(hr_dev, ROCEE_TX_CMQ_DEPTH_REG, + (ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S) | + HNS_ROCE_CMQ_ENABLE); + roce_write(hr_dev, ROCEE_TX_CMQ_HEAD_REG, 0); + roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, 0); + } else { + roce_write(hr_dev, ROCEE_RX_CMQ_BASEADDR_L_REG, (u32)dma); + roce_write(hr_dev, ROCEE_RX_CMQ_BASEADDR_H_REG, + upper_32_bits(dma)); + roce_write(hr_dev, ROCEE_RX_CMQ_DEPTH_REG, + (ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S) | + HNS_ROCE_CMQ_ENABLE); + roce_write(hr_dev, ROCEE_RX_CMQ_HEAD_REG, 0); + roce_write(hr_dev, ROCEE_RX_CMQ_TAIL_REG, 0); + } +} + +static int hns_roce_v2_cmq_init(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + int ret; + + /* Setup the queue entries for command queue */ + priv->cmq.csq.desc_num = 1024; + priv->cmq.crq.desc_num = 1024; + + /* Setup the lock for command queue */ + spin_lock_init(&priv->cmq.csq.lock); + spin_lock_init(&priv->cmq.crq.lock); + + /* Setup Tx write back timeout */ + priv->cmq.tx_timeout = HNS_ROCE_CMQ_TX_TIMEOUT; + + /* Init CSQ */ + ret = hns_roce_init_cmq_ring(hr_dev, TYPE_CSQ); + if (ret) { + dev_err(hr_dev->dev, "Init CSQ error, ret = %d.\n", ret); + return ret; + } + + /* Init CRQ */ + ret = hns_roce_init_cmq_ring(hr_dev, TYPE_CRQ); + if (ret) { + dev_err(hr_dev->dev, "Init CRQ error, ret = %d.\n", ret); + goto err_crq; + } + + /* Init CSQ REG */ + hns_roce_cmq_init_regs(hr_dev, TYPE_CSQ); + + /* Init CRQ REG */ + hns_roce_cmq_init_regs(hr_dev, TYPE_CRQ); + + return 0; + +err_crq: + hns_roce_free_cmq_desc(hr_dev, &priv->cmq.csq); + + return ret; +} + +static void hns_roce_v2_cmq_exit(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + + hns_roce_free_cmq_desc(hr_dev, &priv->cmq.csq); + hns_roce_free_cmq_desc(hr_dev, &priv->cmq.crq); +} + +void hns_roce_cmq_setup_basic_desc(struct hns_roce_cmq_desc *desc, + enum hns_roce_opcode_type opcode, + bool is_read) +{ + memset((void *)desc, 0, sizeof(struct hns_roce_cmq_desc)); + desc->opcode = cpu_to_le16(opcode); + desc->flag = + cpu_to_le16(HNS_ROCE_CMD_FLAG_NO_INTR | HNS_ROCE_CMD_FLAG_IN); + if (is_read) + desc->flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_WR); + else + desc->flag &= cpu_to_le16(~HNS_ROCE_CMD_FLAG_WR); +} + +static int hns_roce_cmq_csq_done(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + u32 head = roce_read(hr_dev, ROCEE_TX_CMQ_HEAD_REG); + + return head == priv->cmq.csq.next_to_use; +} + +static int hns_roce_cmq_csq_clean(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; + struct hns_roce_cmq_desc *desc; + u16 ntc = csq->next_to_clean; + u32 head; + int clean = 0; + + desc = &csq->desc[ntc]; + head = roce_read(hr_dev, ROCEE_TX_CMQ_HEAD_REG); + while (head != ntc) { + memset(desc, 0, sizeof(*desc)); + ntc++; + if (ntc == csq->desc_num) + ntc = 0; + desc = &csq->desc[ntc]; + clean++; + } + csq->next_to_clean = ntc; + + return clean; +} + +int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, int num) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; + struct hns_roce_cmq_desc *desc_to_use; + bool complete = false; + u32 timeout = 0; + int handle = 0; + u16 desc_ret; + int ret = 0; + int ntc; + + spin_lock_bh(&csq->lock); + + if (num > hns_roce_cmq_space(csq)) { + spin_unlock_bh(&csq->lock); + return -EBUSY; + } + + /* + * Record the location of desc in the cmq for this time + * which will be use for hardware to write back + */ + ntc = csq->next_to_use; + + while (handle < num) { + desc_to_use = &csq->desc[csq->next_to_use]; + *desc_to_use = desc[handle]; + dev_dbg(hr_dev->dev, "set cmq desc:\n"); + csq->next_to_use++; + if (csq->next_to_use == csq->desc_num) + csq->next_to_use = 0; + handle++; + } + + /* Write to hardware */ + roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, csq->next_to_use); + + /* + * If the command is sync, wait for the firmware to write back, + * if multi descriptors to be sent, use the first one to check + */ + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { + do { + if (hns_roce_cmq_csq_done(hr_dev)) + break; + usleep_range(1000, 2000); + timeout++; + } while (timeout < priv->cmq.tx_timeout); + } + + if (hns_roce_cmq_csq_done(hr_dev)) { + complete = true; + handle = 0; + while (handle < num) { + /* get the result of hardware write back */ + desc_to_use = &csq->desc[ntc]; + desc[handle] = *desc_to_use; + dev_dbg(hr_dev->dev, "Get cmq desc:\n"); + desc_ret = desc[handle].retval; + if (desc_ret == CMD_EXEC_SUCCESS) + ret = 0; + else + ret = -EIO; + priv->cmq.last_status = desc_ret; + ntc++; + handle++; + if (ntc == csq->desc_num) + ntc = 0; + } + } + + if (!complete) + ret = -EAGAIN; + + /* clean the command send queue */ + handle = hns_roce_cmq_csq_clean(hr_dev); + if (handle != num) + dev_warn(hr_dev->dev, "Cleaned %d, need to clean %d\n", + handle, num); + + spin_unlock_bh(&csq->lock); + + return ret; +} + +static const struct hns_roce_hw hns_roce_hw_v2 = { + .cmq_init = hns_roce_v2_cmq_init, + .cmq_exit = hns_roce_v2_cmq_exit, +}; static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0}, @@ -87,6 +356,12 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) if (!hr_dev) return -ENOMEM; + hr_dev->priv = kzalloc(sizeof(struct hns_roce_v2_priv), GFP_KERNEL); + if (!hr_dev->priv) { + ret = -ENOMEM; + goto error_failed_kzalloc; + } + hr_dev->pci_dev = handle->pdev; hr_dev->dev = &handle->pdev->dev; handle->priv = hr_dev; @@ -106,6 +381,9 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) return 0; error_failed_get_cfg: + kfree(hr_dev->priv); + +error_failed_kzalloc: ib_dealloc_device(&hr_dev->ib_dev); return ret; @@ -117,6 +395,7 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; hns_roce_exit(hr_dev); + kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h new file mode 100644 index 0000000..781ae74 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _HNS_ROCE_HW_V2_H +#define _HNS_ROCE_HW_V2_H + +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 + +#define HNS_ROCE_CMD_FLAG_IN_VALID_SHIFT 0 +#define HNS_ROCE_CMD_FLAG_OUT_VALID_SHIFT 1 +#define HNS_ROCE_CMD_FLAG_NEXT_SHIFT 2 +#define HNS_ROCE_CMD_FLAG_WR_OR_RD_SHIFT 3 +#define HNS_ROCE_CMD_FLAG_NO_INTR_SHIFT 4 +#define HNS_ROCE_CMD_FLAG_ERR_INTR_SHIFT 5 + +#define HNS_ROCE_CMD_FLAG_IN BIT(HNS_ROCE_CMD_FLAG_IN_VALID_SHIFT) +#define HNS_ROCE_CMD_FLAG_OUT BIT(HNS_ROCE_CMD_FLAG_OUT_VALID_SHIFT) +#define HNS_ROCE_CMD_FLAG_NEXT BIT(HNS_ROCE_CMD_FLAG_NEXT_SHIFT) +#define HNS_ROCE_CMD_FLAG_WR BIT(HNS_ROCE_CMD_FLAG_WR_OR_RD_SHIFT) +#define HNS_ROCE_CMD_FLAG_NO_INTR BIT(HNS_ROCE_CMD_FLAG_NO_INTR_SHIFT) +#define HNS_ROCE_CMD_FLAG_ERR_INTR BIT(HNS_ROCE_CMD_FLAG_ERR_INTR_SHIFT) + +#define HNS_ROCE_CMQ_DESC_NUM_S 3 +#define HNS_ROCE_CMQ_EN_B 16 +#define HNS_ROCE_CMQ_ENABLE BIT(HNS_ROCE_CMQ_EN_B) + +/* CMQ command */ +enum hns_roce_opcode_type { + HNS_ROCE_OPC_QUERY_HW_VER = 0x8000, + HNS_ROCE_OPC_CFG_GLOBAL_PARAM = 0x8001, + HNS_ROCE_OPC_ALLOC_PF_RES = 0x8004, + HNS_ROCE_OPC_QUERY_PF_RES = 0x8400, + HNS_ROCE_OPC_ALLOC_VF_RES = 0x8401, + HNS_ROCE_OPC_CFG_BT_ATTR = 0x8506, +}; + +enum { + TYPE_CRQ, + TYPE_CSQ, +}; + +enum hns_roce_cmd_return_status { + CMD_EXEC_SUCCESS = 0, + CMD_NO_AUTH = 1, + CMD_NOT_EXEC = 2, + CMD_QUEUE_FULL = 3, +}; + +struct hns_roce_cmq_desc { + u16 opcode; + u16 flag; + u16 retval; + u16 rsv; + u32 data[6]; +}; + +struct hns_roce_v2_cmq_ring { + dma_addr_t desc_dma_addr; + struct hns_roce_cmq_desc *desc; + u32 head; + u32 tail; + + u16 buf_size; + u16 desc_num; + int next_to_use; + int next_to_clean; + u8 flag; + spinlock_t lock; /* command queue lock */ +}; + +struct hns_roce_v2_cmq { + struct hns_roce_v2_cmq_ring csq; + struct hns_roce_v2_cmq_ring crq; + u16 tx_timeout; + u16 last_status; +}; + +struct hns_roce_v2_priv { + struct hns_roce_v2_cmq cmq; +}; + +#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index b07d437..4f5a6fd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -678,6 +678,14 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) } } + if (hr_dev->hw->cmq_init) { + ret = hr_dev->hw->cmq_init(hr_dev); + if (ret) { + dev_err(dev, "Init RoCE Command Queue failed!\n"); + goto error_failed_cmq_init; + } + } + hr_dev->hw->hw_profile(hr_dev); ret = hns_roce_cmd_init(hr_dev); @@ -750,6 +758,10 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) hns_roce_cmd_cleanup(hr_dev); error_failed_cmd_init: + if (hr_dev->hw->cmq_exit) + hr_dev->hw->cmq_exit(hr_dev); + +error_failed_cmq_init: if (hr_dev->hw->reset) { ret = hr_dev->hw->reset(hr_dev, false); if (ret) @@ -774,6 +786,8 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev) if (hr_dev->cmd_mod) hns_roce_cleanup_eq_table(hr_dev); hns_roce_cmd_cleanup(hr_dev); + if (hr_dev->hw->cmq_exit) + hr_dev->hw->cmq_exit(hr_dev); if (hr_dev->hw->reset) hr_dev->hw->reset(hr_dev, false); }