From 7e484b902d269d8fd22afeccb765ccc2cd8efa19 Mon Sep 17 00:00:00 2001
From: Aleksander Jan Bajkowski
Date: Mon, 3 Jan 2022 23:40:22 +0100
Subject: [PATCH] lantiq: backport latest upstream patches

This patch includes a series of performance improvements. All patches
were accepted and should land in 5.17.

NAT Performance results on BT Home Hub 5A (kernel 5.10.89, mtu 1500):

        Down        Up
Before  539 Mbps    599 Mbps
After   624 Mbps    695 Mbps

Signed-off-by: Aleksander Jan Bajkowski
---
 ...-lantiq-dma-increase-descritor-count.patch |  28 +++
 ...tiq_xrx200-increase-napi-poll-weigth.patch |  32 +++
 ...t-lantiq_xrx200-convert-to-build_skb.patch | 206 ++++++++++++++++++
 3 files changed, 266 insertions(+)
 create mode 100644 target/linux/lantiq/patches-5.10/0713-MIPS-lantiq-dma-increase-descritor-count.patch
 create mode 100644 target/linux/lantiq/patches-5.10/0714-net-lantiq_xrx200-increase-napi-poll-weigth.patch
 create mode 100644 target/linux/lantiq/patches-5.10/0715-net-lantiq_xrx200-convert-to-build_skb.patch

diff --git a/target/linux/lantiq/patches-5.10/0713-MIPS-lantiq-dma-increase-descritor-count.patch b/target/linux/lantiq/patches-5.10/0713-MIPS-lantiq-dma-increase-descritor-count.patch
new file mode 100644
index 0000000000..37ed1d4f31
--- /dev/null
+++ b/target/linux/lantiq/patches-5.10/0713-MIPS-lantiq-dma-increase-descritor-count.patch
@@ -0,0 +1,28 @@
+From 5112e9234bbb89f8dd15c983206bd9107b8436d5 Mon Sep 17 00:00:00 2001
+From: Aleksander Jan Bajkowski
+Date: Tue, 4 Jan 2022 16:11:42 +0100
+Subject: [PATCH 713/715] MIPS: lantiq: dma: increase descritor count
+
+NAT Performance results on BT Home Hub 5A (kernel 5.10.89, mtu 1500):
+
+        Down        Up
+Before  539 Mbps    599 Mbps
+After   545 Mbps    625 Mbps
+
+Signed-off-by: Aleksander Jan Bajkowski
+Signed-off-by: Jakub Kicinski
+---
+ arch/mips/include/asm/mach-lantiq/xway/xway_dma.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/mips/include/asm/mach-lantiq/xway/xway_dma.h
++++ b/arch/mips/include/asm/mach-lantiq/xway/xway_dma.h
+@@ -8,7 +8,7 @@
+ #define LTQ_DMA_H__
+ 
+ #define LTQ_DESC_SIZE   0x08    /* each descriptor is 64bit */
+-#define LTQ_DESC_NUM    0x40    /* 64 descriptors / channel */
++#define LTQ_DESC_NUM    0xC0    /* 192 descriptors / channel */
+ 
+ #define LTQ_DMA_OWN     BIT(31) /* owner bit */
+ #define LTQ_DMA_C       BIT(30) /* complete bit */
diff --git a/target/linux/lantiq/patches-5.10/0714-net-lantiq_xrx200-increase-napi-poll-weigth.patch b/target/linux/lantiq/patches-5.10/0714-net-lantiq_xrx200-increase-napi-poll-weigth.patch
new file mode 100644
index 0000000000..1fa49f406e
--- /dev/null
+++ b/target/linux/lantiq/patches-5.10/0714-net-lantiq_xrx200-increase-napi-poll-weigth.patch
@@ -0,0 +1,32 @@
+From 768818d772d5d4ddc0c7eb2e62848929270ab7a3 Mon Sep 17 00:00:00 2001
+From: Aleksander Jan Bajkowski
+Date: Tue, 4 Jan 2022 16:11:43 +0100
+Subject: [PATCH 714/715] net: lantiq_xrx200: increase napi poll weigth
+
+NAT Performance results on BT Home Hub 5A (kernel 5.10.89, mtu 1500):
+
+        Down        Up
+Before  545 Mbps    625 Mbps
+After   577 Mbps    648 Mbps
+
+Signed-off-by: Aleksander Jan Bajkowski
+Signed-off-by: Jakub Kicinski
+---
+ drivers/net/ethernet/lantiq_xrx200.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/lantiq_xrx200.c
++++ b/drivers/net/ethernet/lantiq_xrx200.c
+@@ -606,8 +606,10 @@ static int xrx200_probe(struct platform_
+ 			PMAC_HD_CTL);
+ 
+ 	/* setup NAPI */
+-	netif_napi_add(net_dev, &priv->chan_rx.napi, xrx200_poll_rx, 32);
+-	netif_tx_napi_add(net_dev, &priv->chan_tx.napi, xrx200_tx_housekeeping, 32);
++	netif_napi_add(net_dev, &priv->chan_rx.napi, xrx200_poll_rx,
++		       NAPI_POLL_WEIGHT);
++	netif_tx_napi_add(net_dev, &priv->chan_tx.napi, xrx200_tx_housekeeping,
++			  NAPI_POLL_WEIGHT);
+ 
+ 	platform_set_drvdata(pdev, priv);
+ 
diff --git a/target/linux/lantiq/patches-5.10/0715-net-lantiq_xrx200-convert-to-build_skb.patch b/target/linux/lantiq/patches-5.10/0715-net-lantiq_xrx200-convert-to-build_skb.patch
new file mode 100644
index 0000000000..b2b014832c
--- /dev/null
+++ b/target/linux/lantiq/patches-5.10/0715-net-lantiq_xrx200-convert-to-build_skb.patch
@@ -0,0 +1,206 @@
+From e015593573b3e3f74bd8a63c05fa92902194a354 Mon Sep 17 00:00:00 2001
+From: Aleksander Jan Bajkowski
+Date: Tue, 4 Jan 2022 16:11:44 +0100
+Subject: [PATCH 715/715] net: lantiq_xrx200: convert to build_skb
+
+We can increase the efficiency of rx path by using buffers to receive
+packets then build SKBs around them just before passing into the network
+stack. In contrast, preallocating SKBs too early reduces CPU cache
+efficiency.
+
+NAT Performance results on BT Home Hub 5A (kernel 5.10.89, mtu 1500):
+
+        Down        Up
+Before  577 Mbps    648 Mbps
+After   624 Mbps    695 Mbps
+
+Signed-off-by: Aleksander Jan Bajkowski
+Signed-off-by: Jakub Kicinski
+---
+ drivers/net/ethernet/lantiq_xrx200.c | 56 ++++++++++++++++++----------
+ 1 file changed, 36 insertions(+), 20 deletions(-)
+
+--- a/drivers/net/ethernet/lantiq_xrx200.c
++++ b/drivers/net/ethernet/lantiq_xrx200.c
+@@ -63,7 +63,11 @@ struct xrx200_chan {
+ 
+ 	struct napi_struct napi;
+ 	struct ltq_dma_channel dma;
+-	struct sk_buff *skb[LTQ_DESC_NUM];
++
++	union {
++		struct sk_buff *skb[LTQ_DESC_NUM];
++		void *rx_buff[LTQ_DESC_NUM];
++	};
+ 
+ 	struct sk_buff *skb_head;
+ 	struct sk_buff *skb_tail;
+@@ -78,6 +82,7 @@ struct xrx200_priv {
+ 	struct xrx200_chan chan_rx;
+ 
+ 	u16 rx_buf_size;
++	u16 rx_skb_size;
+ 
+ 	struct net_device *net_dev;
+ 	struct device *dev;
+@@ -115,6 +120,12 @@ static int xrx200_buffer_size(int mtu)
+ 	return round_up(xrx200_max_frame_len(mtu), 4 * XRX200_DMA_BURST_LEN);
+ }
+ 
++static int xrx200_skb_size(u16 buf_size)
++{
++	return SKB_DATA_ALIGN(buf_size + NET_SKB_PAD + NET_IP_ALIGN) +
++		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
++}
++
+ /* drop all the packets from the DMA ring */
+ static void xrx200_flush_dma(struct xrx200_chan *ch)
+ {
+@@ -173,30 +184,29 @@ static int xrx200_close(struct net_devic
+ 	return 0;
+ }
+ 
+-static int xrx200_alloc_skb(struct xrx200_chan *ch)
++static int xrx200_alloc_buf(struct xrx200_chan *ch, void *(*alloc)(unsigned int size))
+ {
+-	struct sk_buff *skb = ch->skb[ch->dma.desc];
++	void *buf = ch->rx_buff[ch->dma.desc];
+ 	struct xrx200_priv *priv = ch->priv;
+ 	dma_addr_t mapping;
+ 	int ret = 0;
+ 
+-	ch->skb[ch->dma.desc] = netdev_alloc_skb_ip_align(priv->net_dev,
+-							  priv->rx_buf_size);
+-	if (!ch->skb[ch->dma.desc]) {
++	ch->rx_buff[ch->dma.desc] = alloc(priv->rx_skb_size);
++	if (!ch->rx_buff[ch->dma.desc]) {
+ 		ret = -ENOMEM;
+ 		goto skip;
+ 	}
+ 
+-	mapping = dma_map_single(priv->dev, ch->skb[ch->dma.desc]->data,
++	mapping = dma_map_single(priv->dev, ch->rx_buff[ch->dma.desc],
+ 				 priv->rx_buf_size, DMA_FROM_DEVICE);
+ 	if (unlikely(dma_mapping_error(priv->dev, mapping))) {
+-		dev_kfree_skb_any(ch->skb[ch->dma.desc]);
+-		ch->skb[ch->dma.desc] = skb;
++		skb_free_frag(ch->rx_buff[ch->dma.desc]);
++		ch->rx_buff[ch->dma.desc] = buf;
+ 		ret = -ENOMEM;
+ 		goto skip;
+ 	}
+ 
+-	ch->dma.desc_base[ch->dma.desc].addr = mapping;
++	ch->dma.desc_base[ch->dma.desc].addr = mapping + NET_SKB_PAD + NET_IP_ALIGN;
+ 	/* Make sure the address is written before we give it to HW */
+ 	wmb();
+ skip:
+@@ -210,13 +220,14 @@ static int xrx200_hw_receive(struct xrx2
+ {
+ 	struct xrx200_priv *priv = ch->priv;
+ 	struct ltq_dma_desc *desc = &ch->dma.desc_base[ch->dma.desc];
+-	struct sk_buff *skb = ch->skb[ch->dma.desc];
++	void *buf = ch->rx_buff[ch->dma.desc];
+ 	u32 ctl = desc->ctl;
+ 	int len = (ctl & LTQ_DMA_SIZE_MASK);
+ 	struct net_device *net_dev = priv->net_dev;
++	struct sk_buff *skb;
+ 	int ret;
+ 
+-	ret = xrx200_alloc_skb(ch);
++	ret = xrx200_alloc_buf(ch, napi_alloc_frag);
+ 
+ 	ch->dma.desc++;
+ 	ch->dma.desc %= LTQ_DESC_NUM;
+@@ -227,19 +238,21 @@ static int xrx200_hw_receive(struct xrx2
+ 		return ret;
+ 	}
+ 
++	skb = build_skb(buf, priv->rx_skb_size);
++	skb_reserve(skb, NET_SKB_PAD);
+ 	skb_put(skb, len);
+ 
+ 	/* add buffers to skb via skb->frag_list */
+ 	if (ctl & LTQ_DMA_SOP) {
+ 		ch->skb_head = skb;
+ 		ch->skb_tail = skb;
++		skb_reserve(skb, NET_IP_ALIGN);
+ 	} else if (ch->skb_head) {
+ 		if (ch->skb_head == ch->skb_tail)
+ 			skb_shinfo(ch->skb_tail)->frag_list = skb;
+ 		else
+ 			ch->skb_tail->next = skb;
+ 		ch->skb_tail = skb;
+-		skb_reserve(ch->skb_tail, -NET_IP_ALIGN);
+ 		ch->skb_head->len += skb->len;
+ 		ch->skb_head->data_len += skb->len;
+ 		ch->skb_head->truesize += skb->truesize;
+@@ -395,12 +408,13 @@ xrx200_change_mtu(struct net_device *net
+ 	struct xrx200_chan *ch_rx = &priv->chan_rx;
+ 	int old_mtu = net_dev->mtu;
+ 	bool running = false;
+-	struct sk_buff *skb;
++	void *buff;
+ 	int curr_desc;
+ 	int ret = 0;
+ 
+ 	net_dev->mtu = new_mtu;
+ 	priv->rx_buf_size = xrx200_buffer_size(new_mtu);
++	priv->rx_skb_size = xrx200_skb_size(priv->rx_buf_size);
+ 
+ 	if (new_mtu <= old_mtu)
+ 		return ret;
+@@ -416,14 +430,15 @@ xrx200_change_mtu(struct net_device *net
+ 
+ 	for (ch_rx->dma.desc = 0; ch_rx->dma.desc < LTQ_DESC_NUM;
+ 	     ch_rx->dma.desc++) {
+-		skb = ch_rx->skb[ch_rx->dma.desc];
+-		ret = xrx200_alloc_skb(ch_rx);
++		buff = ch_rx->rx_buff[ch_rx->dma.desc];
++		ret = xrx200_alloc_buf(ch_rx, netdev_alloc_frag);
+ 		if (ret) {
+ 			net_dev->mtu = old_mtu;
+ 			priv->rx_buf_size = xrx200_buffer_size(old_mtu);
++			priv->rx_skb_size = xrx200_skb_size(priv->rx_buf_size);
+ 			break;
+ 		}
+-		dev_kfree_skb_any(skb);
++		skb_free_frag(buff);
+ 	}
+ 
+ 	ch_rx->dma.desc = curr_desc;
+@@ -476,7 +491,7 @@ static int xrx200_dma_init(struct xrx200
+ 	ltq_dma_alloc_rx(&ch_rx->dma);
+ 	for (ch_rx->dma.desc = 0; ch_rx->dma.desc < LTQ_DESC_NUM;
+ 	     ch_rx->dma.desc++) {
+-		ret = xrx200_alloc_skb(ch_rx);
++		ret = xrx200_alloc_buf(ch_rx, netdev_alloc_frag);
+ 		if (ret)
+ 			goto rx_free;
+ 	}
+@@ -511,7 +526,7 @@ rx_ring_free:
+ 	/* free the allocated RX ring */
+ 	for (i = 0; i < LTQ_DESC_NUM; i++) {
+ 		if (priv->chan_rx.skb[i])
+-			dev_kfree_skb_any(priv->chan_rx.skb[i]);
++			skb_free_frag(priv->chan_rx.rx_buff[i]);
+ 	}
+ 
+ rx_free:
+@@ -528,7 +543,7 @@ static void xrx200_hw_cleanup(struct xrx
+ 
+ 	/* free the allocated RX ring */
+ 	for (i = 0; i < LTQ_DESC_NUM; i++)
+-		dev_kfree_skb_any(priv->chan_rx.skb[i]);
++		skb_free_frag(priv->chan_rx.rx_buff[i]);
+ }
+ 
+ static int xrx200_probe(struct platform_device *pdev)
+@@ -554,6 +569,7 @@ static int xrx200_probe(struct platform_
+ 	net_dev->min_mtu = ETH_ZLEN;
+ 	net_dev->max_mtu = XRX200_DMA_DATA_LEN - xrx200_max_frame_len(0);
+ 	priv->rx_buf_size = xrx200_buffer_size(ETH_DATA_LEN);
++	priv->rx_skb_size = xrx200_skb_size(priv->rx_buf_size);
+ 
+ 	/* load the memory ranges */
+ 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-- 
2.30.2
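Editorial addendum (not part of the commit): some context on the numbers above. In patch 0713, 0xC0 = 192, so the ring grows from 64 to 192 descriptors per channel; at LTQ_DESC_SIZE = 8 bytes that is still only 1.5 KiB of descriptor memory per channel, so the cost is dominated by the extra rx buffers, not the ring itself. In patch 0714, NAPI_POLL_WEIGHT is the kernel's default per-poll budget of 64 packets, doubling the driver's previous hardcoded 32 and letting each softirq pass drain more of the now-deeper ring.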
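Patch 0715's commit message describes the build_skb conversion only at a high level, so here is a minimal sketch of the pattern for reviewers, assuming a generic NAPI driver rather than this hardware. The example_* names are hypothetical and the buffer-lifetime details are simplified; the real implementation is the diff above in drivers/net/ethernet/lantiq_xrx200.c.

#include <linux/dma-mapping.h>
#include <linux/skbuff.h>

/* Fragment size: hardware buffer plus headroom plus skb_shared_info,
 * mirroring what xrx200_skb_size() computes in the patch.
 */
static unsigned int example_frag_size(unsigned int buf_size)
{
        return SKB_DATA_ALIGN(buf_size + NET_SKB_PAD + NET_IP_ALIGN) +
               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
}

/* Refill one rx slot with a raw page fragment instead of a
 * preallocated skb; returns the buffer and the DMA address to hand
 * to the hardware.
 */
static void *example_rx_refill(struct device *dev, unsigned int buf_size,
                               dma_addr_t *hw_addr)
{
        void *buf = napi_alloc_frag(example_frag_size(buf_size));
        dma_addr_t mapping;

        if (!buf)
                return NULL;

        /* Map headroom plus payload so the device's write window is
         * fully covered by the mapping.
         */
        mapping = dma_map_single(dev, buf,
                                 NET_SKB_PAD + NET_IP_ALIGN + buf_size,
                                 DMA_FROM_DEVICE);
        if (dma_mapping_error(dev, mapping)) {
                skb_free_frag(buf);
                return NULL;
        }
        /* The device writes past the headroom that build_skb() keeps
         * for the stack, as in xrx200_alloc_buf().
         */
        *hw_addr = mapping + NET_SKB_PAD + NET_IP_ALIGN;
        return buf;
}

/* On rx completion, wrap the filled buffer in an skb without copying;
 * this is the single-buffer (SOP) case of the patch's receive path.
 */
static struct sk_buff *example_rx_complete(void *buf, unsigned int buf_size,
                                           unsigned int len)
{
        struct sk_buff *skb = build_skb(buf, example_frag_size(buf_size));

        if (unlikely(!skb)) {
                skb_free_frag(buf);
                return NULL;
        }
        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
        skb_put(skb, len);
        return skb;
}

A complete driver would also unmap (or sync) the buffer before the CPU touches it, which is elided here. The point of the pattern is that the skb metadata is written immediately before the packet enters the stack, while it is still hot in cache, instead of at refill time; that is the cache effect the commit message credits for the 577 to 624 Mbps gain.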