Home Home > GIT Browse > openSUSE-15.1
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Thomas Bogendoerfer <tbogendoerfer@suse.de> 2019-06-07 13:27:44 +0200
committer Thomas Bogendoerfer <tbogendoerfer@suse.de> 2019-06-07 13:28:55 +0200
commit abad277c79e7488b513f7358b8f91e5d2cfaccf2 (patch)
tree 84d19f24e162225777f15e597591a606a0c3d109
parent 68307a8c88cc092b6fdf1888cecb93bb3c131733 (diff)
RDMA/hns: Fix the chip hanging caused by sending mailbox&CMQ
during reset (bsc#1104427 FATE#326416 bsc#1137232).
-rw-r--r-- patches.drivers/RDMA-hns-Fix-the-chip-hanging-caused-by-sending-mail.patch 285
-rw-r--r-- series.conf 1
2 files changed, 286 insertions, 0 deletions
diff --git a/patches.drivers/RDMA-hns-Fix-the-chip-hanging-caused-by-sending-mail.patch b/patches.drivers/RDMA-hns-Fix-the-chip-hanging-caused-by-sending-mail.patch
new file mode 100644
index 0000000000..529996624b
--- /dev/null
+++ b/patches.drivers/RDMA-hns-Fix-the-chip-hanging-caused-by-sending-mail.patch
@@ -0,0 +1,285 @@
+From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>
+Date: Sun, 3 Feb 2019 20:43:14 +0800
+Subject: RDMA/hns: Fix the chip hanging caused by sending mailbox&CMQ during
+ reset
+Patch-mainline: v5.1-rc1
+Git-commit: 6a04aed6afaefd5fd396f23da184298135f31e37
+References: bsc#1104427 FATE#326416 bsc#1137232
+
+On the hip08 chip, there is a possibility of the chip hanging and of errors
+when sending mailbox & doorbell during reset. We can fix it by prohibiting
+mailbox and doorbell while a reset is in progress or after one has occurred,
+to ensure that the hardware can work normally.
+
+Fixes: a04ff739f2a9 ("RDMA/hns: Add command queue support for hip08 RoCE driver")
+Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
+Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
+Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
+---
+ drivers/infiniband/hw/hns/hns_roce_cmd.c | 32 ++++--
+ drivers/infiniband/hw/hns/hns_roce_device.h | 7 +
+ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 139 ++++++++++++++++++++++++++--
+ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2
+ 4 files changed, 167 insertions(+), 13 deletions(-)
+
+--- a/drivers/infiniband/hw/hns/hns_roce_cmd.c
++++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c
+@@ -176,17 +176,33 @@ int hns_roce_cmd_mbox(struct hns_roce_de
+ unsigned long in_modifier, u8 op_modifier, u16 op,
+ unsigned long timeout)
+ {
+- if (hr_dev->is_reset)
+- return 0;
++ int ret;
++
++ if (hr_dev->hw->rst_prc_mbox) {
++ ret = hr_dev->hw->rst_prc_mbox(hr_dev);
++ if (ret == CMD_RST_PRC_SUCCESS)
++ return 0;
++ else if (ret == CMD_RST_PRC_EBUSY)
++ return -EBUSY;
++ }
+
+ if (hr_dev->cmd.use_events)
+- return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
+- in_modifier, op_modifier, op,
+- timeout);
++ ret = hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
++ in_modifier, op_modifier, op,
++ timeout);
+ else
+- return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
+- in_modifier, op_modifier, op,
+- timeout);
++ ret = hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
++ in_modifier, op_modifier, op,
++ timeout);
++
++ if (ret == CMD_RST_PRC_EBUSY)
++ return -EBUSY;
++
++ if (ret && (hr_dev->hw->rst_prc_mbox &&
++ hr_dev->hw->rst_prc_mbox(hr_dev) == CMD_RST_PRC_SUCCESS))
++ return 0;
++
++ return ret;
+ }
+ EXPORT_SYMBOL_GPL(hns_roce_cmd_mbox);
+
+--- a/drivers/infiniband/hw/hns/hns_roce_device.h
++++ b/drivers/infiniband/hw/hns/hns_roce_device.h
+@@ -237,6 +237,12 @@ enum {
+ HNS_ROCE_RST_DIRECT_RETURN = 0,
+ };
+
++enum {
++ CMD_RST_PRC_OTHERS,
++ CMD_RST_PRC_SUCCESS,
++ CMD_RST_PRC_EBUSY,
++};
++
+ #define HNS_ROCE_CMD_SUCCESS 1
+
+ #define HNS_ROCE_PORT_DOWN 0
+@@ -875,6 +881,7 @@ struct hns_roce_hw {
+ u64 out_param, u32 in_modifier, u8 op_modifier, u16 op,
+ u16 token, int event);
+ int (*chk_mbox)(struct hns_roce_dev *hr_dev, unsigned long timeout);
++ int (*rst_prc_mbox)(struct hns_roce_dev *hr_dev);
+ int (*set_gid)(struct hns_roce_dev *hr_dev, u8 port, int gid_index,
+ const union ib_gid *gid, const struct ib_gid_attr *attr);
+ int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr);
+--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
++++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+@@ -712,6 +712,110 @@ out:
+ return ret;
+ }
+
++static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
++ unsigned long instance_stage,
++ unsigned long reset_stage)
++{
++ /* When hardware reset has been completed once or more, we should stop
++ * sending mailbox&cmq to hardware. If now in .init_instance()
++ * function, we should exit with error. If now at HNAE3_INIT_CLIENT
++ * stage of soft reset process, we should exit with error, and then
++ * HNAE3_INIT_CLIENT related process can rollback the operation like
+ * notifying hardware to free resources, HNAE3_INIT_CLIENT related
++ * process will exit with error to notify NIC driver to reschedule soft
++ * reset process once again.
++ */
++ hr_dev->is_reset = true;
++
++ if (reset_stage == HNS_ROCE_STATE_RST_INIT ||
++ instance_stage == HNS_ROCE_STATE_INIT)
++ return CMD_RST_PRC_EBUSY;
++
++ return CMD_RST_PRC_SUCCESS;
++}
++
++static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
++ unsigned long instance_stage,
++ unsigned long reset_stage)
++{
++ struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
++ struct hnae3_handle *handle = priv->handle;
++ const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
++
++ /* When hardware reset is detected, we should stop sending mailbox&cmq
++ * to hardware. If now in .init_instance() function, we should
++ * exit with error. If now at HNAE3_INIT_CLIENT stage of soft reset
++ * process, we should exit with error, and then HNAE3_INIT_CLIENT
+ * related process can rollback the operation like notifying hardware to
++ * free resources, HNAE3_INIT_CLIENT related process will exit with
++ * error to notify NIC driver to reschedule soft reset process once
++ * again.
++ */
++ if (!ops->get_hw_reset_stat(handle))
++ hr_dev->is_reset = true;
++
++ if (!hr_dev->is_reset || reset_stage == HNS_ROCE_STATE_RST_INIT ||
++ instance_stage == HNS_ROCE_STATE_INIT)
++ return CMD_RST_PRC_EBUSY;
++
++ return CMD_RST_PRC_SUCCESS;
++}
++
++static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
++{
++ struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
++ struct hnae3_handle *handle = priv->handle;
++ const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
++
++ /* When software reset is detected at .init_instance() function, we
++ * should stop sending mailbox&cmq to hardware, and exit with
++ * error.
++ */
++ if (ops->ae_dev_reset_cnt(handle) != hr_dev->reset_cnt)
++ hr_dev->is_reset = true;
++
++ return CMD_RST_PRC_EBUSY;
++}
++
++static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev)
++{
++ struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
++ struct hnae3_handle *handle = priv->handle;
++ const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
++ unsigned long instance_stage; /* the current instance stage */
++ unsigned long reset_stage; /* the current reset stage */
++ unsigned long reset_cnt;
++ bool sw_resetting;
++ bool hw_resetting;
++
++ if (hr_dev->is_reset)
++ return CMD_RST_PRC_SUCCESS;
++
++ /* Get information about reset from NIC driver or RoCE driver itself,
++ * the meaning of the following variables from NIC driver are described
++ * as below:
++ * reset_cnt -- The count value of completed hardware reset.
++ * hw_resetting -- Whether hardware device is resetting now.
++ * sw_resetting -- Whether NIC's software reset process is running now.
++ */
++ instance_stage = handle->rinfo.instance_state;
++ reset_stage = handle->rinfo.reset_state;
++ reset_cnt = ops->ae_dev_reset_cnt(handle);
++ hw_resetting = ops->get_hw_reset_stat(handle);
++ sw_resetting = ops->ae_dev_resetting(handle);
++
++ if (reset_cnt != hr_dev->reset_cnt)
++ return hns_roce_v2_cmd_hw_reseted(hr_dev, instance_stage,
++ reset_stage);
++ else if (hw_resetting)
++ return hns_roce_v2_cmd_hw_resetting(hr_dev, instance_stage,
++ reset_stage);
++ else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT)
++ return hns_roce_v2_cmd_sw_resetting(hr_dev);
++
++ return 0;
++}
++
+ static int hns_roce_cmq_space(struct hns_roce_v2_cmq_ring *ring)
+ {
+ int ntu = ring->next_to_use;
+@@ -892,8 +996,8 @@ static int hns_roce_cmq_csq_clean(struct
+ return clean;
+ }
+
+-static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
+- struct hns_roce_cmq_desc *desc, int num)
++static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
++ struct hns_roce_cmq_desc *desc, int num)
+ {
+ struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+ struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
+@@ -905,9 +1009,6 @@ static int hns_roce_cmq_send(struct hns_
+ int ret = 0;
+ int ntc;
+
+- if (hr_dev->is_reset)
+- return 0;
+-
+ spin_lock_bh(&csq->lock);
+
+ if (num > hns_roce_cmq_space(csq)) {
+@@ -982,6 +1083,30 @@ static int hns_roce_cmq_send(struct hns_
+ return ret;
+ }
+
++int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
++ struct hns_roce_cmq_desc *desc, int num)
++{
++ int retval;
++ int ret;
++
++ ret = hns_roce_v2_rst_process_cmd(hr_dev);
++ if (ret == CMD_RST_PRC_SUCCESS)
++ return 0;
++ if (ret == CMD_RST_PRC_EBUSY)
++ return ret;
++
++ ret = __hns_roce_cmq_send(hr_dev, desc, num);
++ if (ret) {
++ retval = hns_roce_v2_rst_process_cmd(hr_dev);
++ if (retval == CMD_RST_PRC_SUCCESS)
++ return 0;
++ else if (retval == CMD_RST_PRC_EBUSY)
++ return retval;
++ }
++
++ return ret;
++}
++
+ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
+ {
+ struct hns_roce_query_version *resp;
+@@ -1857,6 +1982,9 @@ static int hns_roce_v2_chk_mbox(struct h
+
+ status = hns_roce_v2_cmd_complete(hr_dev);
+ if (status != 0x1) {
++ if (status == CMD_RST_PRC_EBUSY)
++ return status;
++
+ dev_err(dev, "mailbox status 0x%x!\n", status);
+ return -EBUSY;
+ }
+@@ -5961,6 +6089,7 @@ static const struct hns_roce_hw hns_roce
+ .hw_exit = hns_roce_v2_exit,
+ .post_mbox = hns_roce_v2_post_mbox,
+ .chk_mbox = hns_roce_v2_chk_mbox,
++ .rst_prc_mbox = hns_roce_v2_rst_process_cmd,
+ .set_gid = hns_roce_v2_set_gid,
+ .set_mac = hns_roce_v2_set_mac,
+ .write_mtpt = hns_roce_v2_write_mtpt,
+--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
++++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+@@ -96,6 +96,8 @@
+ #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2
+ #define HNS_ROCE_V2_RSV_QPS 8
+
++#define HNS_ROCE_V2_HW_RST_TIMEOUT 1000
++
+ #define HNS_ROCE_CONTEXT_HOP_NUM 1
+ #define HNS_ROCE_SCCC_HOP_NUM 1
+ #define HNS_ROCE_MTT_HOP_NUM 1
diff --git a/series.conf b/series.conf
index bc472afea7..4ae9b262e8 100644
--- a/series.conf
+++ b/series.conf
@@ -45481,6 +45481,7 @@
patches.drivers/RDMA-hns-Remove-set-but-not-used-variable-rst.patch
patches.drivers/RDMA-hns-Make-some-function-static.patch
patches.drivers/RDMA-hns-Fix-the-Oops-during-rmmod-or-insmod-ko-when.patch
+ patches.drivers/RDMA-hns-Fix-the-chip-hanging-caused-by-sending-mail.patch
patches.drivers/iw_cxgb-kzalloc-the-iwcm-verbs-struct.patch
patches.drivers/IB-mlx5-Do-not-use-hw_access_flags-for-be-and-CPU-da.patch
patches.drivers/RDMA-bnxt_re-Add-chip-context-to-identify-57500-seri.patch