Home Home > GIT Browse > openSUSE-15.1
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Bogendoerfer <tbogendoerfer@suse.de>2019-06-07 13:27:44 +0200
committerThomas Bogendoerfer <tbogendoerfer@suse.de>2019-06-07 13:28:49 +0200
commit68307a8c88cc092b6fdf1888cecb93bb3c131733 (patch)
tree4db9301e2f8abfe2ac33d5c9edc6ff0f6037512e
parent227e1e14d4821dc6f8b042f51ec2a52eebe5f42e (diff)
RDMA/hns: Fix the Oops during rmmod or insmod ko when reset
occurs (bsc#1104427 FATE#326416 bsc#1137232).
-rw-r--r--patches.drivers/RDMA-hns-Fix-the-Oops-during-rmmod-or-insmod-ko-when.patch307
-rw-r--r--series.conf1
2 files changed, 308 insertions, 0 deletions
diff --git a/patches.drivers/RDMA-hns-Fix-the-Oops-during-rmmod-or-insmod-ko-when.patch b/patches.drivers/RDMA-hns-Fix-the-Oops-during-rmmod-or-insmod-ko-when.patch
new file mode 100644
index 0000000000..65227a37ae
--- /dev/null
+++ b/patches.drivers/RDMA-hns-Fix-the-Oops-during-rmmod-or-insmod-ko-when.patch
@@ -0,0 +1,307 @@
+From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>
+Date: Sun, 3 Feb 2019 20:43:13 +0800
+Subject: RDMA/hns: Fix the Oops during rmmod or insmod ko when reset occurs
+Patch-mainline: v5.1-rc1
+Git-commit: d061effc36f7bd38a12912977a37a50ac9140d11
+References: bsc#1104427 FATE#326416 bsc#1137232
+
+In the reset process, the hns3 NIC driver notifies the RoCE driver to
+perform reset related processing by calling the .reset_notify() interface
+registered by the RoCE driver in hip08 SoC.
+
+In the current version, if a reset occurs simultaneously during the
+execution of rmmod or insmod ko, there may be Oops error as below:
+
+ Internal error: Oops: 86000007 [#1] PREEMPT SMP
+ Modules linked in: hns_roce(O) hns3(O) hclge(O) hnae3(O) [last unloaded: hns_roce_hw_v2]
+ CPU: 0 PID: 14 Comm: kworker/0:1 Tainted: G O 4.19.0-ge00d540 #1
+ Hardware name: Huawei Technologies Co., Ltd.
+ Workqueue: events hclge_reset_service_task [hclge]
+ pstate: 60c00009 (nZCv daif +PAN +UAO)
+ pc : 0xffff00000100b0b8
+ lr : 0xffff00000100aea0
+ sp : ffff000009afbab0
+ x29: ffff000009afbab0 x28: 0000000000000800
+ x27: 0000000000007ff0 x26: ffff80002f90c004
+ x25: 00000000000007ff x24: ffff000008f97000
+ x23: ffff80003efee0a8 x22: 0000000000001000
+ x21: ffff80002f917ff0 x20: ffff8000286ea070
+ x19: 0000000000000800 x18: 0000000000000400
+ x17: 00000000c4d3225d x16: 00000000000021b8
+ x15: 0000000000000400 x14: 0000000000000400
+ x13: 0000000000000000 x12: ffff80003fac6e30
+ x11: 0000800036303000 x10: 0000000000000001
+ x9 : 0000000000000000 x8 : ffff80003016d000
+ x7 : 0000000000000000 x6 : 000000000000003f
+ x5 : 0000000000000040 x4 : 0000000000000000
+ x3 : 0000000000000004 x2 : 00000000000007ff
+ x1 : 0000000000000000 x0 : 0000000000000000
+ Process kworker/0:1 (pid: 14, stack limit = 0x00000000af8f0ad9)
+ Call trace:
+ 0xffff00000100b0b8
+ 0xffff00000100b3a0
+ hns_roce_init+0x624/0xc88 [hns_roce]
+ 0xffff000001002df8
+ 0xffff000001006960
+ hclge_notify_roce_client+0x74/0xe0 [hclge]
+ hclge_reset_service_task+0xa58/0xbc0 [hclge]
+ process_one_work+0x1e4/0x458
+ worker_thread+0x40/0x450
+ kthread+0x12c/0x130
+ ret_from_fork+0x10/0x18
+ Code: bad PC value
+
+In the reset process, we will release the resources firstly, and after the
+hardware reset is completed, we will reapply resources and reconfigure the
+hardware.
+
+We can solve this problem by modifying both the NIC and the RoCE
+driver. We can modify the concurrent processing in the NIC driver to avoid
+calling the .reset_notify and .uninit_instance ops at the same time. And
+we need to modify the RoCE driver to record the reset stage and the
+driver's init/uninit state, and check the state in the .reset_notify,
+.init_instance. and uninit_instance functions to avoid NULL pointer
+operation.
+
+Fixes: cb7a94c9c808 ("RDMA/hns: Add reset process for RoCE in hip08")
+Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
+Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
+Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
+---
+ drivers/infiniband/hw/hns/hns_roce_device.h | 21 +++++
+ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 103 ++++++++++++++++++++++++----
+ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1
+ 3 files changed, 112 insertions(+), 13 deletions(-)
+
+--- a/drivers/infiniband/hw/hns/hns_roce_device.h
++++ b/drivers/infiniband/hw/hns/hns_roce_device.h
+@@ -217,6 +217,26 @@ enum {
+ HNS_ROCE_DB_PER_PAGE = PAGE_SIZE / 4
+ };
+
++enum hns_roce_reset_stage {
++ HNS_ROCE_STATE_NON_RST,
++ HNS_ROCE_STATE_RST_BEF_DOWN,
++ HNS_ROCE_STATE_RST_DOWN,
++ HNS_ROCE_STATE_RST_UNINIT,
++ HNS_ROCE_STATE_RST_INIT,
++ HNS_ROCE_STATE_RST_INITED,
++};
++
++enum hns_roce_instance_state {
++ HNS_ROCE_STATE_NON_INIT,
++ HNS_ROCE_STATE_INIT,
++ HNS_ROCE_STATE_INITED,
++ HNS_ROCE_STATE_UNINIT,
++};
++
++enum {
++ HNS_ROCE_RST_DIRECT_RETURN = 0,
++};
++
+ #define HNS_ROCE_CMD_SUCCESS 1
+
+ #define HNS_ROCE_PORT_DOWN 0
+@@ -919,6 +939,7 @@ struct hns_roce_dev {
+ spinlock_t bt_cmd_lock;
+ bool active;
+ bool is_reset;
++ unsigned long reset_cnt;
+ struct hns_roce_ib_iboe iboe;
+
+ struct list_head pgdir_list;
+--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
++++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+@@ -6002,6 +6002,7 @@ MODULE_DEVICE_TABLE(pci, hns_roce_hw_v2_
+ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
+ struct hnae3_handle *handle)
+ {
++ struct hns_roce_v2_priv *priv = hr_dev->priv;
+ const struct pci_device_id *id;
+ int i;
+
+@@ -6032,10 +6033,13 @@ static int hns_roce_hw_v2_get_cfg(struct
+ hr_dev->cmd_mod = 1;
+ hr_dev->loop_idc = 0;
+
++ hr_dev->reset_cnt = handle->ae_algo->ops->ae_dev_reset_cnt(handle);
++ priv->handle = handle;
++
+ return 0;
+ }
+
+-static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
++static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
+ {
+ struct hns_roce_dev *hr_dev;
+ int ret;
+@@ -6052,7 +6056,6 @@ static int hns_roce_hw_v2_init_instance(
+
+ hr_dev->pci_dev = handle->pdev;
+ hr_dev->dev = &handle->pdev->dev;
+- handle->priv = hr_dev;
+
+ ret = hns_roce_hw_v2_get_cfg(hr_dev, handle);
+ if (ret) {
+@@ -6066,6 +6069,8 @@ static int hns_roce_hw_v2_init_instance(
+ goto error_failed_get_cfg;
+ }
+
++ handle->priv = hr_dev;
++
+ return 0;
+
+ error_failed_get_cfg:
+@@ -6077,7 +6082,7 @@ error_failed_kzalloc:
+ return ret;
+ }
+
+-static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
++static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
+ bool reset)
+ {
+ struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
+@@ -6085,24 +6090,78 @@ static void hns_roce_hw_v2_uninit_instan
+ if (!hr_dev)
+ return;
+
++ handle->priv = NULL;
+ hns_roce_exit(hr_dev);
+ kfree(hr_dev->priv);
+ ib_dealloc_device(&hr_dev->ib_dev);
+ }
+
++static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
++{
++ const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
++ struct device *dev = &handle->pdev->dev;
++ int ret;
++
++ handle->rinfo.instance_state = HNS_ROCE_STATE_INIT;
++
++ if (ops->ae_dev_resetting(handle) || ops->get_hw_reset_stat(handle)) {
++ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
++ goto reset_chk_err;
++ }
++
++ ret = __hns_roce_hw_v2_init_instance(handle);
++ if (ret) {
++ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
++ dev_err(dev, "RoCE instance init failed! ret = %d\n", ret);
++ if (ops->ae_dev_resetting(handle) ||
++ ops->get_hw_reset_stat(handle))
++ goto reset_chk_err;
++ else
++ return ret;
++ }
++
++ handle->rinfo.instance_state = HNS_ROCE_STATE_INITED;
++
++
++ return 0;
++
++reset_chk_err:
++ dev_err(dev, "Device is busy in resetting state.\n"
++ "please retry later.\n");
++
++ return -EBUSY;
++}
++
++static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
++ bool reset)
++{
++ if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
++ return;
++
++ handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
++
++ __hns_roce_hw_v2_uninit_instance(handle, reset);
++
++ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
++}
+ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
+ {
+- struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
++ struct hns_roce_dev *hr_dev;
+ struct ib_event event;
+
+- if (!hr_dev) {
+- dev_err(&handle->pdev->dev,
+- "Input parameter handle->priv is NULL!\n");
+- return -EINVAL;
++ if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) {
++ set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
++ return 0;
+ }
+
++ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_DOWN;
++ clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
++
++ hr_dev = (struct hns_roce_dev *)handle->priv;
++ if (!hr_dev)
++ return 0;
++
+ hr_dev->active = false;
+- hr_dev->is_reset = true;
+
+ event.event = IB_EVENT_DEVICE_FATAL;
+ event.device = &hr_dev->ib_dev;
+@@ -6114,17 +6173,29 @@ static int hns_roce_hw_v2_reset_notify_d
+
+ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
+ {
++ struct device *dev = &handle->pdev->dev;
+ int ret;
+
+- ret = hns_roce_hw_v2_init_instance(handle);
++ if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN,
++ &handle->rinfo.state)) {
++ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
++ return 0;
++ }
++
++ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INIT;
++
++ dev_info(&handle->pdev->dev, "In reset process RoCE client reinit.\n");
++ ret = __hns_roce_hw_v2_init_instance(handle);
+ if (ret) {
+ /* when reset notify type is HNAE3_INIT_CLIENT In reset notify
+ * callback function, RoCE Engine reinitialize. If RoCE reinit
+ * failed, we should inform NIC driver.
+ */
+ handle->priv = NULL;
+- dev_err(&handle->pdev->dev,
+- "In reset process RoCE reinit failed %d.\n", ret);
++ dev_err(dev, "In reset process RoCE reinit failed %d.\n", ret);
++ } else {
++ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
++ dev_info(dev, "Reset done, RoCE client reinit finished.\n");
+ }
+
+ return ret;
+@@ -6132,8 +6203,14 @@ static int hns_roce_hw_v2_reset_notify_i
+
+ static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle)
+ {
++ if (test_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state))
++ return 0;
++
++ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT;
++ dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n");
+ msleep(100);
+- hns_roce_hw_v2_uninit_instance(handle, false);
++ __hns_roce_hw_v2_uninit_instance(handle, false);
++
+ return 0;
+ }
+
+--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
++++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+@@ -1602,6 +1602,7 @@ struct hns_roce_link_table_entry {
+ #define HNS_ROCE_LINK_TABLE_NXT_PTR_M GENMASK(31, 20)
+
+ struct hns_roce_v2_priv {
++ struct hnae3_handle *handle;
+ struct hns_roce_v2_cmq cmq;
+ struct hns_roce_link_table tsq;
+ struct hns_roce_link_table tpq;
diff --git a/series.conf b/series.conf
index daaee4db2e..bc472afea7 100644
--- a/series.conf
+++ b/series.conf
@@ -45480,6 +45480,7 @@
patches.drivers/IB-ipoib-Make-ipoib_intercept_dev_id_attr-static.patch
patches.drivers/RDMA-hns-Remove-set-but-not-used-variable-rst.patch
patches.drivers/RDMA-hns-Make-some-function-static.patch
+ patches.drivers/RDMA-hns-Fix-the-Oops-during-rmmod-or-insmod-ko-when.patch
patches.drivers/iw_cxgb-kzalloc-the-iwcm-verbs-struct.patch
patches.drivers/IB-mlx5-Do-not-use-hw_access_flags-for-be-and-CPU-da.patch
patches.drivers/RDMA-bnxt_re-Add-chip-context-to-identify-57500-seri.patch