Message ID | 1526544173-106587-5-git-send-email-xavier.huwei@huawei.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Thu, May 17, 2018 at 04:02:52PM +0800, Wei Hu (Xavier) wrote: > diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c > index 86ef15f..e1c44a6 100644 > +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c > @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, > int ret = 0; > int ntc; > > + if (hr_dev->is_reset) > + return 0; > + > spin_lock_bh(&csq->lock); > > if (num > hns_roce_cmq_space(csq)) { > @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) > return 0; > > error_failed_get_cfg: > + handle->priv = NULL; > kfree(hr_dev->priv); > > error_failed_kzalloc: > @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, > { > struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; > > + if (!hr_dev) > + return; > + > hns_roce_exit(hr_dev); > + handle->priv = NULL; > kfree(hr_dev->priv); > ib_dealloc_device(&hr_dev->ib_dev); > } Why are these hunks here? If init fails then uninit should not be called, so why meddle with priv? Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2018/5/17 23:14, Jason Gunthorpe wrote: > On Thu, May 17, 2018 at 04:02:52PM +0800, Wei Hu (Xavier) wrote: >> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >> index 86ef15f..e1c44a6 100644 >> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >> @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, >> int ret = 0; >> int ntc; >> >> + if (hr_dev->is_reset) >> + return 0; >> + >> spin_lock_bh(&csq->lock); >> >> if (num > hns_roce_cmq_space(csq)) { >> @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) >> return 0; >> >> error_failed_get_cfg: >> + handle->priv = NULL; >> kfree(hr_dev->priv); >> >> error_failed_kzalloc: >> @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, >> { >> struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; >> >> + if (!hr_dev) >> + return; >> + >> hns_roce_exit(hr_dev); >> + handle->priv = NULL; >> kfree(hr_dev->priv); >> ib_dealloc_device(&hr_dev->ib_dev); >> } > Why are these hunks here? If init fails then uninit should not be > called, so why meddle with priv? In hns_roce_hw_v2_init_instance function, we evaluate handle->priv with hr_dev, We want clear the value in hns_roce_hw_v2_uninit_instance function. So we can ensure no problem in RoCE driver. static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) { struct hns_roce_dev *hr_dev; int ret; hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev)); if (!hr_dev) return -ENOMEM; ...// other code handle->priv = hr_dev; ....// other code return 0; error_xxx: handle->priv = NULL; ...// other code error_yyyy: ib_dealloc_device(&hr_dev->ib_dev); return ret; } static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, bool reset) { struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; if (!hr_dev) return; hns_roce_exit(hr_dev); handle->priv = NULL; kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); } > > Jason > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, May 18, 2018 at 11:28:11AM +0800, Wei Hu (Xavier) wrote: > > > On 2018/5/17 23:14, Jason Gunthorpe wrote: > > On Thu, May 17, 2018 at 04:02:52PM +0800, Wei Hu (Xavier) wrote: > >> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c > >> index 86ef15f..e1c44a6 100644 > >> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c > >> @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, > >> int ret = 0; > >> int ntc; > >> > >> + if (hr_dev->is_reset) > >> + return 0; > >> + > >> spin_lock_bh(&csq->lock); > >> > >> if (num > hns_roce_cmq_space(csq)) { > >> @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) > >> return 0; > >> > >> error_failed_get_cfg: > >> + handle->priv = NULL; > >> kfree(hr_dev->priv); > >> > >> error_failed_kzalloc: > >> @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, > >> { > >> struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; > >> > >> + if (!hr_dev) > >> + return; > >> + > >> hns_roce_exit(hr_dev); > >> + handle->priv = NULL; > >> kfree(hr_dev->priv); > >> ib_dealloc_device(&hr_dev->ib_dev); > >> } > > Why are these hunks here? If init fails then uninit should not be > > called, so why meddle with priv? > In hns_roce_hw_v2_init_instance function, we evaluate handle->priv with > hr_dev, > We want clear the value in hns_roce_hw_v2_uninit_instance function. > So we can ensure no problem in RoCE driver. What problem could happen? I keep removing unnecessary sets to null and checks of null, so please don't add them if they cannot happen. Eg uninit should never be called with a null priv, that is a serious logic mis-design someplace if it happens. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2018/5/18 12:15, Jason Gunthorpe wrote: > On Fri, May 18, 2018 at 11:28:11AM +0800, Wei Hu (Xavier) wrote: >> >> On 2018/5/17 23:14, Jason Gunthorpe wrote: >>> On Thu, May 17, 2018 at 04:02:52PM +0800, Wei Hu (Xavier) wrote: >>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >>>> index 86ef15f..e1c44a6 100644 >>>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >>>> @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, >>>> int ret = 0; >>>> int ntc; >>>> >>>> + if (hr_dev->is_reset) >>>> + return 0; >>>> + >>>> spin_lock_bh(&csq->lock); >>>> >>>> if (num > hns_roce_cmq_space(csq)) { >>>> @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) >>>> return 0; >>>> >>>> error_failed_get_cfg: >>>> + handle->priv = NULL; >>>> kfree(hr_dev->priv); >>>> >>>> error_failed_kzalloc: >>>> @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, >>>> { >>>> struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; >>>> >>>> + if (!hr_dev) >>>> + return; >>>> + >>>> hns_roce_exit(hr_dev); >>>> + handle->priv = NULL; >>>> kfree(hr_dev->priv); >>>> ib_dealloc_device(&hr_dev->ib_dev); >>>> } >>> Why are these hunks here? If init fails then uninit should not be >>> called, so why meddle with priv? >> In hns_roce_hw_v2_init_instance function, we evaluate handle->priv with >> hr_dev, >> We want clear the value in hns_roce_hw_v2_uninit_instance function. >> So we can ensure no problem in RoCE driver. > What problem could happen? > > I keep removing unnecessary sets to null and checks of null, so please > don't add them if they cannot happen. > > Eg uninit should never be called with a null priv, that is a serious > logic mis-design someplace if it happens. > > Jason NIC driver call the registered reset_notify() function to finish the part of RoCE reset process. In RoCE driver, when hnae3_reset_notify_type is HNAE3_UNINIT_CLIENT, we call hns_roce_hw_v2_uninit_instance(handle, false) to release the resources. when hnae3_reset_notify_type is HNAE3_INIT_CLIENT, we call hns_roce_hw_v2_init_instance. if hns_roce_hw_v2_init_instance failed, we should ensure no problem in the other callback function registered by RoCE driver. The related RoCE driver: static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle) { msleep(100); hns_roce_hw_v2_uninit_instance(handle, false); return 0; } static int hns_roce_hw_v2_reset_notify(struct hnae3_handle *handle, enum hnae3_reset_notify_type type) { int ret = 0; switch (type) { case HNAE3_DOWN_CLIENT: ret = hns_roce_hw_v2_reset_notify_down(handle); break; case HNAE3_INIT_CLIENT: ret = hns_roce_hw_v2_init_instance(handle); break; case HNAE3_UNINIT_CLIENT: ret = hns_roce_hw_v2_reset_notify_uninit(handle); break; default: break; } return ret; } The related NIC driver: static int hclge_notify_roce_client(struct hclge_dev *hdev, enum hnae3_reset_notify_type type) { struct hnae3_client *client = hdev->roce_client; struct hnae3_handle *handle; int ret = 0; u16 i; if (!client) return 0; if (!client->ops->reset_notify) return -EOPNOTSUPP; for (i = 0; i < hdev->num_vmdq_vport + 1; i++) { handle = &hdev->vport[i].roce; ret = client->ops->reset_notify(handle, type); if (ret) { dev_err(&hdev->pdev->dev, "notify roce client failed %d", ret); return ret; } } return ret; } static void hclge_reset(struct hclge_dev *hdev) { struct hnae3_handle *handle; /* perform reset of the stack & ae device for a client */ handle = &hdev->vport[0].nic; hclge_notify_roce_client(hdev, HNAE3_DOWN_CLIENT); hclge_notify_roce_client(hdev, HNAE3_UNINIT_CLIENT); rtnl_lock(); hclge_notify_client(hdev, HNAE3_DOWN_CLIENT); if (!hclge_reset_wait(hdev)) { hclge_notify_client(hdev, HNAE3_UNINIT_CLIENT); hclge_reset_ae_dev(hdev->ae_dev); hclge_notify_client(hdev, HNAE3_INIT_CLIENT); hclge_clear_reset_cause(hdev); } else { /* schedule again to check pending resets later */ set_bit(hdev->reset_type, &hdev->reset_pending); hclge_reset_task_schedule(hdev); } hclge_notify_client(hdev, HNAE3_UP_CLIENT); handle->last_reset_time = jiffies; rtnl_unlock(); hclge_notify_roce_client(hdev, HNAE3_INIT_CLIENT); hclge_notify_roce_client(hdev, HNAE3_UP_CLIENT); } Thanks, Jason > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, May 18, 2018 at 03:23:00PM +0800, Wei Hu (Xavier) wrote: > > > On 2018/5/18 12:15, Jason Gunthorpe wrote: > > On Fri, May 18, 2018 at 11:28:11AM +0800, Wei Hu (Xavier) wrote: > >> > >> On 2018/5/17 23:14, Jason Gunthorpe wrote: > >>> On Thu, May 17, 2018 at 04:02:52PM +0800, Wei Hu (Xavier) wrote: > >>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c > >>>> index 86ef15f..e1c44a6 100644 > >>>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c > >>>> @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, > >>>> int ret = 0; > >>>> int ntc; > >>>> > >>>> + if (hr_dev->is_reset) > >>>> + return 0; > >>>> + > >>>> spin_lock_bh(&csq->lock); > >>>> > >>>> if (num > hns_roce_cmq_space(csq)) { > >>>> @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) > >>>> return 0; > >>>> > >>>> error_failed_get_cfg: > >>>> + handle->priv = NULL; > >>>> kfree(hr_dev->priv); > >>>> > >>>> error_failed_kzalloc: > >>>> @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, > >>>> { > >>>> struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; > >>>> > >>>> + if (!hr_dev) > >>>> + return; > >>>> + > >>>> hns_roce_exit(hr_dev); > >>>> + handle->priv = NULL; > >>>> kfree(hr_dev->priv); > >>>> ib_dealloc_device(&hr_dev->ib_dev); > >>>> } > >>> Why are these hunks here? If init fails then uninit should not be > >>> called, so why meddle with priv? > >> In hns_roce_hw_v2_init_instance function, we evaluate handle->priv with > >> hr_dev, > >> We want clear the value in hns_roce_hw_v2_uninit_instance function. > >> So we can ensure no problem in RoCE driver. > > What problem could happen? > > > > I keep removing unnecessary sets to null and checks of null, so please > > don't add them if they cannot happen. > > > > Eg uninit should never be called with a null priv, that is a serious > > logic mis-design someplace if it happens. > > > > Jason > NIC driver call the registered reset_notify() function to finish the > part of RoCE reset process. > In RoCE driver, when hnae3_reset_notify_type is HNAE3_UNINIT_CLIENT, > we call hns_roce_hw_v2_uninit_instance(handle, false) to release the > resources. > when hnae3_reset_notify_type is HNAE3_INIT_CLIENT, we call > hns_roce_hw_v2_init_instance. > if hns_roce_hw_v2_init_instance failed, we should ensure no problem in > the other callback > function registered by RoCE driver. Don't design things like this. init/uninit are paired - do not call something uninit if it can be called after init fails, or better, arrange to prevent that so things are sane. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2018/5/23 4:26, Jason Gunthorpe wrote: > On Fri, May 18, 2018 at 03:23:00PM +0800, Wei Hu (Xavier) wrote: >> >> On 2018/5/18 12:15, Jason Gunthorpe wrote: >>> On Fri, May 18, 2018 at 11:28:11AM +0800, Wei Hu (Xavier) wrote: >>>> On 2018/5/17 23:14, Jason Gunthorpe wrote: >>>>> On Thu, May 17, 2018 at 04:02:52PM +0800, Wei Hu (Xavier) wrote: >>>>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >>>>>> index 86ef15f..e1c44a6 100644 >>>>>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >>>>>> @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, >>>>>> int ret = 0; >>>>>> int ntc; >>>>>> >>>>>> + if (hr_dev->is_reset) >>>>>> + return 0; >>>>>> + >>>>>> spin_lock_bh(&csq->lock); >>>>>> >>>>>> if (num > hns_roce_cmq_space(csq)) { >>>>>> @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) >>>>>> return 0; >>>>>> >>>>>> error_failed_get_cfg: >>>>>> + handle->priv = NULL; >>>>>> kfree(hr_dev->priv); >>>>>> >>>>>> error_failed_kzalloc: >>>>>> @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, >>>>>> { >>>>>> struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; >>>>>> >>>>>> + if (!hr_dev) >>>>>> + return; >>>>>> + >>>>>> hns_roce_exit(hr_dev); >>>>>> + handle->priv = NULL; >>>>>> kfree(hr_dev->priv); >>>>>> ib_dealloc_device(&hr_dev->ib_dev); >>>>>> } >>>>> Why are these hunks here? If init fails then uninit should not be >>>>> called, so why meddle with priv? >>>> In hns_roce_hw_v2_init_instance function, we evaluate handle->priv with >>>> hr_dev, >>>> We want clear the value in hns_roce_hw_v2_uninit_instance function. >>>> So we can ensure no problem in RoCE driver. >>> What problem could happen? >>> >>> I keep removing unnecessary sets to null and checks of null, so please >>> don't add them if they cannot happen. >>> >>> Eg uninit should never be called with a null priv, that is a serious >>> logic mis-design someplace if it happens. >>> >>> Jason >> NIC driver call the registered reset_notify() function to finish the >> part of RoCE reset process. >> In RoCE driver, when hnae3_reset_notify_type is HNAE3_UNINIT_CLIENT, >> we call hns_roce_hw_v2_uninit_instance(handle, false) to release the >> resources. >> when hnae3_reset_notify_type is HNAE3_INIT_CLIENT, we call >> hns_roce_hw_v2_init_instance. >> if hns_roce_hw_v2_init_instance failed, we should ensure no problem in >> the other callback >> function registered by RoCE driver. > Don't design things like this. > > init/uninit are paired - do not call something uninit if it can be > called after init fails, or better, arrange to prevent that so things > are sane. > > Jason > > . The current RoCE driver registered 3 callback function to NIC driver as belows: 1.init_instance/uninit_instance are paired. 2.In reset_notify function, RoCE dirver still call init_instance/uninit_instance function. but NIC driver does not perceive the behavior. We need to judge in RoCE driver. static const struct hnae3_client_ops hns_roce_hw_v2_ops = { .init_instance = hns_roce_hw_v2_init_instance, .uninit_instance = hns_roce_hw_v2_uninit_instance, .reset_notify = hns_roce_hw_v2_reset_notify, }; Wei Hu > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, May 23, 2018 at 10:54:54AM +0800, Wei Hu (Xavier) wrote: > > > On 2018/5/23 4:26, Jason Gunthorpe wrote: > > On Fri, May 18, 2018 at 03:23:00PM +0800, Wei Hu (Xavier) wrote: > >> > >> On 2018/5/18 12:15, Jason Gunthorpe wrote: > >>> On Fri, May 18, 2018 at 11:28:11AM +0800, Wei Hu (Xavier) wrote: > >>>> On 2018/5/17 23:14, Jason Gunthorpe wrote: > >>>>> On Thu, May 17, 2018 at 04:02:52PM +0800, Wei Hu (Xavier) wrote: > >>>>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c > >>>>>> index 86ef15f..e1c44a6 100644 > >>>>>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c > >>>>>> @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, > >>>>>> int ret = 0; > >>>>>> int ntc; > >>>>>> > >>>>>> + if (hr_dev->is_reset) > >>>>>> + return 0; > >>>>>> + > >>>>>> spin_lock_bh(&csq->lock); > >>>>>> > >>>>>> if (num > hns_roce_cmq_space(csq)) { > >>>>>> @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) > >>>>>> return 0; > >>>>>> > >>>>>> error_failed_get_cfg: > >>>>>> + handle->priv = NULL; > >>>>>> kfree(hr_dev->priv); > >>>>>> > >>>>>> error_failed_kzalloc: > >>>>>> @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, > >>>>>> { > >>>>>> struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; > >>>>>> > >>>>>> + if (!hr_dev) > >>>>>> + return; > >>>>>> + > >>>>>> hns_roce_exit(hr_dev); > >>>>>> + handle->priv = NULL; > >>>>>> kfree(hr_dev->priv); > >>>>>> ib_dealloc_device(&hr_dev->ib_dev); > >>>>>> } > >>>>> Why are these hunks here? If init fails then uninit should not be > >>>>> called, so why meddle with priv? > >>>> In hns_roce_hw_v2_init_instance function, we evaluate handle->priv with > >>>> hr_dev, > >>>> We want clear the value in hns_roce_hw_v2_uninit_instance function. > >>>> So we can ensure no problem in RoCE driver. > >>> What problem could happen? > >>> > >>> I keep removing unnecessary sets to null and checks of null, so please > >>> don't add them if they cannot happen. > >>> > >>> Eg uninit should never be called with a null priv, that is a serious > >>> logic mis-design someplace if it happens. > >>> > >>> Jason > >> NIC driver call the registered reset_notify() function to finish the > >> part of RoCE reset process. > >> In RoCE driver, when hnae3_reset_notify_type is HNAE3_UNINIT_CLIENT, > >> we call hns_roce_hw_v2_uninit_instance(handle, false) to release the > >> resources. > >> when hnae3_reset_notify_type is HNAE3_INIT_CLIENT, we call > >> hns_roce_hw_v2_init_instance. > >> if hns_roce_hw_v2_init_instance failed, we should ensure no problem in > >> the other callback > >> function registered by RoCE driver. > > Don't design things like this. > > > > init/uninit are paired - do not call something uninit if it can be > > called after init fails, or better, arrange to prevent that so things > > are sane. > > > > Jason > > > > . > The current RoCE driver registered 3 callback function to NIC driver as > belows: > 1.init_instance/uninit_instance are paired. > 2.In reset_notify function, RoCE dirver still call > init_instance/uninit_instance function. > but NIC driver does not perceive the behavior. We need to judge in RoCE > driver. fix the nic driver Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2018/5/23 10:54, Wei Hu (Xavier) wrote: > > On 2018/5/23 4:26, Jason Gunthorpe wrote: >> On Fri, May 18, 2018 at 03:23:00PM +0800, Wei Hu (Xavier) wrote: >>> On 2018/5/18 12:15, Jason Gunthorpe wrote: >>>> On Fri, May 18, 2018 at 11:28:11AM +0800, Wei Hu (Xavier) wrote: >>>>> On 2018/5/17 23:14, Jason Gunthorpe wrote: >>>>>> On Thu, May 17, 2018 at 04:02:52PM +0800, Wei Hu (Xavier) wrote: >>>>>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >>>>>>> index 86ef15f..e1c44a6 100644 >>>>>>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >>>>>>> @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, >>>>>>> int ret = 0; >>>>>>> int ntc; >>>>>>> >>>>>>> + if (hr_dev->is_reset) >>>>>>> + return 0; >>>>>>> + >>>>>>> spin_lock_bh(&csq->lock); >>>>>>> >>>>>>> if (num > hns_roce_cmq_space(csq)) { >>>>>>> @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) >>>>>>> return 0; >>>>>>> >>>>>>> error_failed_get_cfg: >>>>>>> + handle->priv = NULL; >>>>>>> kfree(hr_dev->priv); >>>>>>> >>>>>>> error_failed_kzalloc: >>>>>>> @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, >>>>>>> { >>>>>>> struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; >>>>>>> >>>>>>> + if (!hr_dev) >>>>>>> + return; >>>>>>> + >>>>>>> hns_roce_exit(hr_dev); >>>>>>> + handle->priv = NULL; >>>>>>> kfree(hr_dev->priv); >>>>>>> ib_dealloc_device(&hr_dev->ib_dev); >>>>>>> } >>>>>> Why are these hunks here? If init fails then uninit should not be >>>>>> called, so why meddle with priv? >>>>> In hns_roce_hw_v2_init_instance function, we evaluate handle->priv with >>>>> hr_dev, >>>>> We want clear the value in hns_roce_hw_v2_uninit_instance function. >>>>> So we can ensure no problem in RoCE driver. >>>> What problem could happen? >>>> >>>> I keep removing unnecessary sets to null and checks of null, so please >>>> don't add them if they cannot happen. >>>> >>>> Eg uninit should never be called with a null priv, that is a serious >>>> logic mis-design someplace if it happens. >>>> >>>> Jason >>> NIC driver call the registered reset_notify() function to finish the >>> part of RoCE reset process. >>> In RoCE driver, when hnae3_reset_notify_type is HNAE3_UNINIT_CLIENT, >>> we call hns_roce_hw_v2_uninit_instance(handle, false) to release the >>> resources. >>> when hnae3_reset_notify_type is HNAE3_INIT_CLIENT, we call >>> hns_roce_hw_v2_init_instance. >>> if hns_roce_hw_v2_init_instance failed, we should ensure no problem in >>> the other callback >>> function registered by RoCE driver. >> Don't design things like this. >> >> init/uninit are paired - do not call something uninit if it can be >> called after init fails, or better, arrange to prevent that so things >> are sane. >> >> Jason >> >> . > The current RoCE driver registered 3 callback function to NIC driver as > belows: > 1.init_instance/uninit_instance are paired. > 2.In reset_notify function, RoCE dirver still call > init_instance/uninit_instance function. > but NIC driver does not perceive the behavior. We need to judge in RoCE > driver. > > static const struct hnae3_client_ops hns_roce_hw_v2_ops = { > .init_instance = hns_roce_hw_v2_init_instance, > .uninit_instance = hns_roce_hw_v2_uninit_instance, > .reset_notify = hns_roce_hw_v2_reset_notify, > }; struct hnae3_handle is defined in NIC driver, and handle->priv is used for RoCE driver, NIC driver will not use this member handle->priv. struct hnae3_handle { struct hnae3_client *client; struct pci_dev *pdev; void *priv; struct hnae3_ae_algo *ae_algo; /* the class who provides this handle */ u64 flags; /* Indicate the capabilities for this handle*/ unsigned long last_reset_time; enum hnae3_reset_type reset_level; union { struct net_device *netdev; /* first member */ struct hnae3_knic_private_info kinfo; struct hnae3_unic_private_info uinfo; struct hnae3_roce_private_info rinfo; }; u32 numa_node_mask; /* for multi-chip support */ }; > Wei Hu > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2018/5/23 11:47, Jason Gunthorpe wrote: > On Wed, May 23, 2018 at 10:54:54AM +0800, Wei Hu (Xavier) wrote: >> >> On 2018/5/23 4:26, Jason Gunthorpe wrote: >>> On Fri, May 18, 2018 at 03:23:00PM +0800, Wei Hu (Xavier) wrote: >>>> On 2018/5/18 12:15, Jason Gunthorpe wrote: >>>>> On Fri, May 18, 2018 at 11:28:11AM +0800, Wei Hu (Xavier) wrote: >>>>>> On 2018/5/17 23:14, Jason Gunthorpe wrote: >>>>>>> On Thu, May 17, 2018 at 04:02:52PM +0800, Wei Hu (Xavier) wrote: >>>>>>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >>>>>>>> index 86ef15f..e1c44a6 100644 >>>>>>>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c >>>>>>>> @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, >>>>>>>> int ret = 0; >>>>>>>> int ntc; >>>>>>>> >>>>>>>> + if (hr_dev->is_reset) >>>>>>>> + return 0; >>>>>>>> + >>>>>>>> spin_lock_bh(&csq->lock); >>>>>>>> >>>>>>>> if (num > hns_roce_cmq_space(csq)) { >>>>>>>> @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) >>>>>>>> return 0; >>>>>>>> >>>>>>>> error_failed_get_cfg: >>>>>>>> + handle->priv = NULL; >>>>>>>> kfree(hr_dev->priv); >>>>>>>> >>>>>>>> error_failed_kzalloc: >>>>>>>> @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, >>>>>>>> { >>>>>>>> struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; >>>>>>>> >>>>>>>> + if (!hr_dev) >>>>>>>> + return; >>>>>>>> + >>>>>>>> hns_roce_exit(hr_dev); >>>>>>>> + handle->priv = NULL; >>>>>>>> kfree(hr_dev->priv); >>>>>>>> ib_dealloc_device(&hr_dev->ib_dev); >>>>>>>> } >>>>>>> Why are these hunks here? If init fails then uninit should not be >>>>>>> called, so why meddle with priv? >>>>>> In hns_roce_hw_v2_init_instance function, we evaluate handle->priv with >>>>>> hr_dev, >>>>>> We want clear the value in hns_roce_hw_v2_uninit_instance function. >>>>>> So we can ensure no problem in RoCE driver. >>>>> What problem could happen? >>>>> >>>>> I keep removing unnecessary sets to null and checks of null, so please >>>>> don't add them if they cannot happen. >>>>> >>>>> Eg uninit should never be called with a null priv, that is a serious >>>>> logic mis-design someplace if it happens. >>>>> >>>>> Jason >>>> NIC driver call the registered reset_notify() function to finish the >>>> part of RoCE reset process. >>>> In RoCE driver, when hnae3_reset_notify_type is HNAE3_UNINIT_CLIENT, >>>> we call hns_roce_hw_v2_uninit_instance(handle, false) to release the >>>> resources. >>>> when hnae3_reset_notify_type is HNAE3_INIT_CLIENT, we call >>>> hns_roce_hw_v2_init_instance. >>>> if hns_roce_hw_v2_init_instance failed, we should ensure no problem in >>>> the other callback >>>> function registered by RoCE driver. >>> Don't design things like this. >>> >>> init/uninit are paired - do not call something uninit if it can be >>> called after init fails, or better, arrange to prevent that so things >>> are sane. >>> >>> Jason >>> >>> . >> The current RoCE driver registered 3 callback function to NIC driver as >> belows: >> 1.init_instance/uninit_instance are paired. >> 2.In reset_notify function, RoCE dirver still call >> init_instance/uninit_instance function. >> but NIC driver does not perceive the behavior. We need to judge in RoCE >> driver. Hi, Jason I will send v2, thanks. Regards Wei Hu > fix the nic driver > > Jason > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index 9ebe839..a0ba19d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -176,6 +176,9 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, u8 op_modifier, u16 op, unsigned long timeout) { + if (hr_dev->is_reset) + return 0; + if (hr_dev->cmd.use_events) return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, in_modifier, op_modifier, op, diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 412297d4..da8512b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -774,6 +774,8 @@ struct hns_roce_dev { const char *irq_names[HNS_ROCE_MAX_IRQ_NUM]; spinlock_t sm_lock; spinlock_t bt_cmd_lock; + bool active; + bool is_reset; struct hns_roce_ib_iboe iboe; struct list_head pgdir_list; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 86ef15f..e1c44a6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -774,6 +774,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, int ret = 0; int ntc; + if (hr_dev->is_reset) + return 0; + spin_lock_bh(&csq->lock); if (num > hns_roce_cmq_space(csq)) { @@ -4790,6 +4793,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) return 0; error_failed_get_cfg: + handle->priv = NULL; kfree(hr_dev->priv); error_failed_kzalloc: @@ -4803,14 +4807,70 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, { struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + if (!hr_dev) + return; + hns_roce_exit(hr_dev); + handle->priv = NULL; kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); } +static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) +{ + struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + struct ib_event event; + + if (!hr_dev) { + dev_err(&handle->pdev->dev, + "Input parameter handle->priv is NULL!\n"); + return -EINVAL; + } + + hr_dev->active = false; + hr_dev->is_reset = true; + + event.event = IB_EVENT_DEVICE_FATAL; + event.device = &hr_dev->ib_dev; + event.element.port_num = 1; + ib_dispatch_event(&event); + + return 0; +} + +static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle) +{ + msleep(100); + hns_roce_hw_v2_uninit_instance(handle, false); + return 0; +} + +static int hns_roce_hw_v2_reset_notify(struct hnae3_handle *handle, + enum hnae3_reset_notify_type type) +{ + int ret = 0; + + switch (type) { + case HNAE3_DOWN_CLIENT: + ret = hns_roce_hw_v2_reset_notify_down(handle); + break; + case HNAE3_INIT_CLIENT: + ret = hns_roce_hw_v2_init_instance(handle); + break; + case HNAE3_UNINIT_CLIENT: + ret = hns_roce_hw_v2_reset_notify_uninit(handle); + break; + default: + break; + } + + return ret; +} + static const struct hnae3_client_ops hns_roce_hw_v2_ops = { .init_instance = hns_roce_hw_v2_init_instance, .uninit_instance = hns_roce_hw_v2_uninit_instance, + .reset_notify = hns_roce_hw_v2_reset_notify, }; static struct hnae3_client hns_roce_hw_v2_client = { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 7fafe9d..56ee172 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -336,6 +336,9 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, struct hns_roce_ib_alloc_ucontext_resp resp = {}; struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + if (!hr_dev->active) + return ERR_PTR(-EAGAIN); + resp.qp_tab_size = hr_dev->caps.num_qps; context = kmalloc(sizeof(*context), GFP_KERNEL); @@ -461,6 +464,7 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) { struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; + hr_dev->active = false; unregister_netdevice_notifier(&iboe->nb); ib_unregister_device(&hr_dev->ib_dev); } @@ -573,6 +577,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) goto error_failed_setup_mtu_mac; } + hr_dev->active = true; return 0; error_failed_setup_mtu_mac: @@ -765,6 +770,7 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) return ret; } } + hr_dev->is_reset = false; if (hr_dev->hw->cmq_init) { ret = hr_dev->hw->cmq_init(hr_dev); @@ -864,6 +870,7 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) void hns_roce_exit(struct hns_roce_dev *hr_dev) { hns_roce_unregister_device(hr_dev); + if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); hns_roce_cleanup_bitmap(hr_dev);
This patch added reset process for RoCE in hip08. Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com> --- drivers/infiniband/hw/hns/hns_roce_cmd.c | 3 ++ drivers/infiniband/hw/hns/hns_roce_device.h | 2 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 60 +++++++++++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_main.c | 7 ++++ 4 files changed, 72 insertions(+)