From patchwork Tue Apr 23 14:06:31 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Lijun Ou X-Patchwork-Id: 10913153 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id B36FF112C for ; Tue, 23 Apr 2019 14:06:55 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id A29ED287F5 for ; Tue, 23 Apr 2019 14:06:55 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 937622881E; Tue, 23 Apr 2019 14:06:55 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id D2DF8287F5 for ; Tue, 23 Apr 2019 14:06:54 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727500AbfDWOGy (ORCPT ); Tue, 23 Apr 2019 10:06:54 -0400 Received: from szxga05-in.huawei.com ([45.249.212.191]:7235 "EHLO huawei.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1727180AbfDWOGy (ORCPT ); Tue, 23 Apr 2019 10:06:54 -0400 Received: from DGGEMS409-HUB.china.huawei.com (unknown [172.30.72.60]) by Forcepoint Email with ESMTP id 539583D15485504D59C2; Tue, 23 Apr 2019 22:06:51 +0800 (CST) Received: from linux-ioko.site (10.71.200.31) by DGGEMS409-HUB.china.huawei.com (10.3.19.209) with Microsoft SMTP Server id 14.3.439.0; Tue, 23 Apr 2019 22:06:43 +0800 From: Lijun Ou To: , CC: , , Subject: [PATCH V2 for-next] RDMA/hns: Add support function clear when removing module Date: Tue, 23 Apr 2019 22:06:31 +0800 Message-ID: <1556028391-107055-1-git-send-email-oulijun@huawei.com> X-Mailer: git-send-email 1.9.1 MIME-Version: 1.0 X-Originating-IP: [10.71.200.31] X-CFilter-Loop: Reflected Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP If carried out unloading operation when application is running, the hardware is too late to release some hardware resource that remain in it. Under these circumstances, it maybe happen error if loaded hns driver and run app again. In order to resolve it, the hardware adds a function to stop this function and clear the residual hardware resources in the function. Signed-off-by: chenglang Signed-off-by: Lijun Ou --- V1->V2: 1. rewrite the commit. --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 164 ++++++++++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 27 ++++- 3 files changed, 191 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 563cf39..2e35469 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -990,6 +990,7 @@ struct hns_roce_dev { void *priv; struct workqueue_struct *irq_workq; const struct hns_roce_dfx_hw *dfx; + u32 func_num; }; static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f155d2d..efaf4ee 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1129,6 +1129,165 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev) return 0; } +static bool hns_roce_func_clr_chk_rst(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hnae3_handle *handle = priv->handle; + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + unsigned long reset_cnt; + bool sw_resetting; + bool hw_resetting; + + reset_cnt = ops->ae_dev_reset_cnt(handle); + hw_resetting = ops->get_hw_reset_stat(handle); + sw_resetting = ops->ae_dev_resetting(handle); + + if (reset_cnt != hr_dev->reset_cnt || hw_resetting || sw_resetting) + return true; + + return false; +} + +static void hns_roce_func_clr_rst_prc(struct hns_roce_dev *hr_dev, int retval, + int flag) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hnae3_handle *handle = priv->handle; + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + unsigned long instance_stage; + unsigned long reset_cnt; + unsigned long end; + bool sw_resetting; + bool hw_resetting; + + instance_stage = handle->rinfo.instance_state; + reset_cnt = ops->ae_dev_reset_cnt(handle); + hw_resetting = ops->get_hw_reset_stat(handle); + sw_resetting = ops->ae_dev_resetting(handle); + + if (reset_cnt != hr_dev->reset_cnt) { + hr_dev->dis_db = true; + hr_dev->is_reset = true; + dev_info(hr_dev->dev, "Func clear success after reset.\n"); + } else if (hw_resetting) { + hr_dev->dis_db = true; + + dev_warn(hr_dev->dev, + "Func clear is pending, device in resetting state.\n"); + end = msecs_to_jiffies(HNS_ROCE_V2_HW_RST_TIMEOUT) + jiffies; + while (time_before(jiffies, end)) { + if (!ops->get_hw_reset_stat(handle)) { + hr_dev->is_reset = true; + dev_info(hr_dev->dev, + "Func clear success after reset.\n"); + return; + } + msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT); + } + + dev_warn(hr_dev->dev, "Func clear failed.\n"); + } else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT) { + hr_dev->dis_db = true; + + dev_warn(hr_dev->dev, + "Func clear is pending, device in resetting state.\n"); + end = msecs_to_jiffies(HNS_ROCE_V2_HW_RST_TIMEOUT) + jiffies; + while (time_before(jiffies, end)) { + if (ops->ae_dev_reset_cnt(handle) != + hr_dev->reset_cnt) { + hr_dev->is_reset = true; + dev_info(hr_dev->dev, + "Func clear success after sw reset\n"); + return; + } + msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT); + } + + dev_warn(hr_dev->dev, "Func clear failed because of unfinished sw reset\n"); + } else { + if (retval && !flag) + dev_warn(hr_dev->dev, + "Func clear read failed, ret = %d.\n", retval); + + dev_warn(hr_dev->dev, "Func clear failed.\n"); + } +} + +static void hns_roce_query_func_num(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_pf_func_num *resp; + struct hns_roce_cmq_desc desc; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_VF_NUM, true); + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) { + dev_err(hr_dev->dev, "Query vf count fail, ret = %d.\n", + ret); + return; + } + + resp = (struct hns_roce_pf_func_num *)desc.data; + hr_dev->func_num = resp->pf_own_func_num; +} + +static void hns_roce_clear_func(struct hns_roce_dev *hr_dev, int vf_id) +{ + bool fclr_write_fail_flag = false; + struct hns_roce_func_clear *resp; + struct hns_roce_cmq_desc desc; + unsigned long end; + int ret = 0; + + if (hns_roce_func_clr_chk_rst(hr_dev)) + goto out; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, false); + resp = (struct hns_roce_func_clear *)desc.data; + resp->rst_funcid_en = vf_id; + + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) { + fclr_write_fail_flag = true; + dev_err(hr_dev->dev, "Func clear write failed, ret = %d.\n", + ret); + goto out; + } + + end = msecs_to_jiffies(HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS) + jiffies; + + msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL); + while (time_before(jiffies, end)) { + if (hns_roce_func_clr_chk_rst(hr_dev)) + goto out; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, + true); + resp->rst_funcid_en = vf_id; + + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) { + msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT); + continue; + } + + if (roce_get_bit(resp->func_done, FUNC_CLEAR_RST_FUN_DONE_S)) { + if (vf_id == 0) + hr_dev->is_reset = true; + return; + } + } + +out: + dev_err(hr_dev->dev, "Func clear read vf_id %d fail.\n", vf_id); + hns_roce_func_clr_rst_prc(hr_dev, ret, fclr_write_fail_flag); +} + +static void hns_roce_function_clear(struct hns_roce_dev *hr_dev) +{ + hns_roce_clear_func(hr_dev, 0); +} + static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev) { struct hns_roce_query_fw_info *resp; @@ -1479,6 +1638,8 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) return ret; } + hns_roce_query_func_num(hr_dev); + if (hr_dev->pci_dev->revision == 0x21) { ret = hns_roce_query_pf_timer_resource(hr_dev); if (ret) { @@ -1890,6 +2051,9 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) { struct hns_roce_v2_priv *priv = hr_dev->priv; + if (hr_dev->pci_dev->revision == 0x21) + hns_roce_function_clear(hr_dev); + hns_roce_free_link_table(hr_dev, &priv->tpq); hns_roce_free_link_table(hr_dev, &priv->tsq); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index edfdbe2..3f2c85f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -96,7 +96,10 @@ #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2 #define HNS_ROCE_V2_RSV_QPS 8 -#define HNS_ROCE_V2_HW_RST_TIMEOUT 1000 +/* Time out for hardware to complete reset */ +#define HNS_ROCE_V2_HW_RST_TIMEOUT 1000 + +#define HNS_ROCE_V2_HW_RST_COMPLETION_WAIT 20 #define HNS_ROCE_CONTEXT_HOP_NUM 1 #define HNS_ROCE_SCCC_HOP_NUM 1 @@ -236,11 +239,13 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_CFG_EXT_LLM = 0x8403, HNS_ROCE_OPC_CFG_TMOUT_LLM = 0x8404, HNS_ROCE_OPC_QUERY_PF_TIMER_RES = 0x8406, + HNS_ROCE_OPC_QUERY_VF_NUM = 0x8407, HNS_ROCE_OPC_CFG_SGID_TB = 0x8500, HNS_ROCE_OPC_CFG_SMAC_TB = 0x8501, HNS_ROCE_OPC_POST_MB = 0x8504, HNS_ROCE_OPC_QUERY_MB_ST = 0x8505, HNS_ROCE_OPC_CFG_BT_ATTR = 0x8506, + HNS_ROCE_OPC_FUNC_CLEAR = 0x8508, HNS_ROCE_OPC_CLR_SCCC = 0x8509, HNS_ROCE_OPC_QUERY_SCCC = 0x850a, HNS_ROCE_OPC_RESET_SCCC = 0x850b, @@ -1226,6 +1231,26 @@ struct hns_roce_query_fw_info { __le32 rsv[5]; }; +struct hns_roce_func_clear { + __le32 rst_funcid_en; + __le32 func_done; + __le32 rsv[4]; +}; + +struct hns_roce_pf_func_num { + __le32 pf_own_func_num; + __le32 func_done; + __le32 rsv[4]; +}; + +#define FUNC_CLEAR_RST_FUN_EN_S 8 + +#define FUNC_CLEAR_RST_FUN_DONE_S 0 + +#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS (512 * 100) +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL 40 +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT 20 + struct hns_roce_cfg_llm_a { __le32 base_addr_l; __le32 base_addr_h;