From ef223e58a3f5b7877bfc9f4b65dd3ba51cce5b72 Mon Sep 17 00:00:00 2001 From: Josef Schlehofer Date: Tue, 2 Aug 2022 16:07:53 +0200 Subject: [PATCH] mvebu: PCI: aardvark: Implement workaround for PCIe Completion Timeout MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Turris MOX randomly crashes up, when there is connected miniPCIe card MediaTek MT7915 with the following output: [ 71.457007] Internal error: synchronous external abort: 96000210 [#1] SMP [ 71.464021] Modules linked in: xt_connlimit pppoe ppp_async nf_conncount iptable_nat ath9k xt_state xt_nat xt_helper xt_conntrack xt_connmark xt_connbytes xt_REDIREl [ 71.464187] btintel br_netfilter bnep bluetooth ath9k_hw ath10k_pci ath10k_core ath sch_tbf sch_ingress sch_htb sch_hfsc em_u32 cls_u32 cls_tcindex cls_route cls_mg [ 71.629589] CPU: 0 PID: 1298 Comm: kworker/u5:3 Not tainted 5.4.114 #0 [ 71.636319] Hardware name: CZ.NIC Turris Mox Board (DT) [ 71.641725] Workqueue: napi_workq napi_workfn [ 71.646221] pstate: 80400085 (Nzcv daIf +PAN -UAO) [ 71.651169] pc : mt76_set_irq_mask+0x118/0x150 [mt76] [ 71.656385] lr : mt7915_init_debugfs+0x358/0x368 [mt7915e] [ 71.662038] sp : ffffffc010003cd0 [ 71.665451] x29: ffffffc010003cd0 x28: 0000000000000060 [ 71.670929] x27: ffffffc010a56f98 x26: ffffffc010c0fa9a [ 71.676407] x25: ffffffc010ba8788 x24: ffffff803e01fe00 [ 71.681885] x23: 0000000000000030 x22: ffffffc010003dc4 [ 71.687361] x21: 0000000000000000 x20: ffffff803e01fea4 [ 71.692839] x19: ffffff803cb725c0 x18: 000000002d660780 [ 71.698317] x17: 0000000000000000 x16: 0000000000000001 [ 71.703795] x15: 0000000000005ee0 x14: ffffffc010d1d000 [ 71.709272] x13: 0000000000002f70 x12: 0000000000000000 [ 71.714749] x11: 0000000000000000 x10: 0000000000000040 [ 71.720226] x9 : ffffffc010bbe980 x8 : ffffffc010bbe978 [ 71.725704] x7 : ffffff803e4003f0 x6 : 0000000000000000 [ 71.731181] x5 : ffffffc02f240000 x4 : ffffffc010003e00 [ 71.736658] x3 : 0000000000000000 x2 : ffffffc008e3f230 [ 71.742135] x1 : 00000000000d7010 x0 : ffffffc0114d7010 [ 71.747613] Call trace: [ 71.750137] mt76_set_irq_mask+0x118/0x150 [mt76] [ 71.754990] mt7915_dual_hif_set_irq_mask+0x108/0xdc0 [mt7915e] [ 71.761098] __handle_irq_event_percpu+0x6c/0x170 [ 71.765950] handle_irq_event_percpu+0x34/0x88 [ 71.770531] handle_irq_event+0x40/0xb0 [ 71.774486] handle_level_irq+0xe0/0x170 [ 71.778530] generic_handle_irq+0x24/0x38 [ 71.782667] advk_pcie_irq_handler+0x11c/0x238 [ 71.787249] __handle_irq_event_percpu+0x6c/0x170 [ 71.792099] handle_irq_event_percpu+0x34/0x88 [ 71.796680] handle_irq_event+0x40/0xb0 [ 71.800633] handle_fasteoi_irq+0xdc/0x190 [ 71.804855] generic_handle_irq+0x24/0x38 [ 71.808988] __handle_domain_irq+0x60/0xb8 [ 71.813213] gic_handle_irq+0x8c/0x198 [ 71.817077] el1_irq+0xf0/0x1c0 [ 71.820314] el1_da+0xc/0xc0 [ 71.823288] mt76_set_irq_mask+0x118/0x150 [mt76] [ 71.828141] mt7915_mac_tx_free+0x4c4/0x828 [mt7915e] [ 71.833352] mt7915_queue_rx_skb+0x5c/0xa8 [mt7915e] [ 71.838473] mt76_dma_cleanup+0x89c/0x1248 [mt76] [ 71.843329] __napi_poll+0x38/0xf8 [ 71.846835] napi_workfn+0x58/0xb0 [ 71.850342] process_one_work+0x1fc/0x390 [ 71.854475] worker_thread+0x48/0x4d0 [ 71.858252] kthread+0x120/0x128 [ 71.861581] ret_from_fork+0x10/0x1c [ 71.865273] Code: 52800000 d65f03c0 f9562c00 8b214000 (b9400000) [ 71.871560] ---[ end trace 1d4e29987011411b ]--- [ 71.876320] Kernel panic - not syncing: Fatal exception in interrupt [ 71.882875] SMP: stopping secondary CPUs [ 71.886923] Kernel Offset: disabled [ 71.890519] CPU features: 0x0002,00002008 [ 71.894649] Memory Limit: none [ 71.897799] Rebooting in 3 seconds.. Patch is awaiting upstream merge: https://lore.kernel.org/linux-pci/20220802123816.21817-1-pali@kernel.org/T/#u There was also discussion about it in the linux-pci mailing list, where can be found response from Marvell's employee regarding A3720 PCIe erratum 3.12, which seems to provide further details which help this issue: https://lore.kernel.org/linux-pci/BN9PR18MB425154FE5019DCAF2028A1D5DB8D9@BN9PR18MB4251.namprd18.prod.outlook.com/t/#u Reported-by: Ondřej Caletka [Turris MOX] Signed-off-by: Josef Schlehofer Reviewed-by: Robert Marko --- .../100-aardvark-workaround-PCIe.patch | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 target/linux/mvebu/patches-5.15/100-aardvark-workaround-PCIe.patch diff --git a/target/linux/mvebu/patches-5.15/100-aardvark-workaround-PCIe.patch b/target/linux/mvebu/patches-5.15/100-aardvark-workaround-PCIe.patch new file mode 100644 index 0000000000..975eadb80e --- /dev/null +++ b/target/linux/mvebu/patches-5.15/100-aardvark-workaround-PCIe.patch @@ -0,0 +1,81 @@ +Subject: [PATCH v2] PCI: aardvark: Implement workaround for PCIe Completion Timeout +Date: Tue, 2 Aug 2022 14:38:16 +0200 +Message-Id: <20220802123816.21817-1-pali@kernel.org> +X-Mailer: git-send-email 2.20.1 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Precedence: bulk +List-ID: +X-Mailing-List: linux-pci@vger.kernel.org + +Marvell Armada 3700 Functional Errata, Guidelines, and Restrictions +document describes in erratum 3.12 PCIe Completion Timeout (Ref #: 251), +that PCIe IP does not support a strong-ordered model for inbound posted vs. +outbound completion. + +As a workaround for this erratum, DIS_ORD_CHK flag in Debug Mux Control +register must be set. It disables the ordering check in the core between +Completions and Posted requests received from the link. + +Marvell also suggests to do full memory barrier at the beginning of +aardvark summary interrupt handler before calling interrupt handlers of +endpoint drivers in order to minimize the risk for the race condition +documented in the Erratum between the DMA done status reading and the +completion of writing to the host memory. + +More details about this issue and suggested workarounds are in discussion: +https://lore.kernel.org/linux-pci/BN9PR18MB425154FE5019DCAF2028A1D5DB8D9@BN9PR18MB4251.namprd18.prod.outlook.com/t/#u + +It was reported that enabling this workaround fixes instability issues and +"Unhandled fault" errors when using 60 GHz WiFi 802.11ad card with Qualcomm +QCA6335 chip under significant load which were caused by interrupt status +stuck in the outbound CMPLT queue traced back to this erratum. + +This workaround fixes also kernel panic triggered after some minutes of +usage 5 GHz WiFi 802.11ax card with Mediatek MT7915 chip: + + Internal error: synchronous external abort: 96000210 [#1] SMP + Kernel panic - not syncing: Fatal exception in interrupt + +Signed-off-by: Thomas Petazzoni +Signed-off-by: Pali Rohár +Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") +Cc: stable@vger.kernel.org +--- + drivers/pci/controller/pci-aardvark.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/drivers/pci/controller/pci-aardvark.c ++++ b/drivers/pci/controller/pci-aardvark.c +@@ -210,6 +210,8 @@ enum { + }; + + #define VENDOR_ID_REG (LMI_BASE_ADDR + 0x44) ++#define DEBUG_MUX_CTRL_REG (LMI_BASE_ADDR + 0x208) ++#define DIS_ORD_CHK BIT(30) + + /* PCIe core controller registers */ + #define CTRL_CORE_BASE_ADDR 0x18000 +@@ -558,6 +560,11 @@ static void advk_pcie_setup_hw(struct ad + PCIE_CORE_CTRL2_TD_ENABLE; + advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG); + ++ /* Disable ordering checks, workaround for erratum 3.12 "PCIe completion timeout" */ ++ reg = advk_readl(pcie, DEBUG_MUX_CTRL_REG); ++ reg |= DIS_ORD_CHK; ++ advk_writel(pcie, reg, DEBUG_MUX_CTRL_REG); ++ + /* Set lane X1 */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg &= ~LANE_CNT_MSK; +@@ -1580,6 +1587,9 @@ static irqreturn_t advk_pcie_irq_handler + struct advk_pcie *pcie = arg; + u32 status; + ++ /* Full memory barrier (ARM dsb sy), workaround for erratum 3.12 "PCIe completion timeout" */ ++ mb(); ++ + status = advk_readl(pcie, HOST_CTRL_INT_STATUS_REG); + if (!(status & PCIE_IRQ_CORE_INT)) + return IRQ_NONE; -- 2.30.2