lantiq: backport latest upstream patches
author Aleksander Jan Bajkowski <olek2@wp.pl>
Mon, 3 Jan 2022 22:40:22 +0000 (23:40 +0100)
committer Hauke Mehrtens <hauke@hauke-m.de>
Sun, 16 Jan 2022 19:51:14 +0000 (20:51 +0100)
This patch backports a series of performance improvements for the lantiq
DMA and xrx200 Ethernet driver. All patches were accepted upstream and
should land in Linux 5.17.
NAT Performance results on BT Home Hub 5A (kernel 5.10.89, mtu 1500):

       Down            Up
Before 539 Mbps        599 Mbps
After  624 Mbps        695 Mbps

Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
target/linux/lantiq/patches-5.10/0713-MIPS-lantiq-dma-increase-descritor-count.patch [new file with mode: 0644]
target/linux/lantiq/patches-5.10/0714-net-lantiq_xrx200-increase-napi-poll-weigth.patch [new file with mode: 0644]
target/linux/lantiq/patches-5.10/0715-net-lantiq_xrx200-convert-to-build_skb.patch [new file with mode: 0644]

diff --git a/target/linux/lantiq/patches-5.10/0713-MIPS-lantiq-dma-increase-descritor-count.patch b/target/linux/lantiq/patches-5.10/0713-MIPS-lantiq-dma-increase-descritor-count.patch
new file mode 100644 (file)
index 0000000..37ed1d4
--- /dev/null
@@ -0,0 +1,28 @@
+From 5112e9234bbb89f8dd15c983206bd9107b8436d5 Mon Sep 17 00:00:00 2001
+From: Aleksander Jan Bajkowski <olek2@wp.pl>
+Date: Tue, 4 Jan 2022 16:11:42 +0100
+Subject: [PATCH 713/715] MIPS: lantiq: dma: increase descritor count
+
+NAT Performance results on BT Home Hub 5A (kernel 5.10.89, mtu 1500):
+
+       Down            Up
+Before 539 Mbps        599 Mbps
+After  545 Mbps        625 Mbps
+
+Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+---
+ arch/mips/include/asm/mach-lantiq/xway/xway_dma.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/mips/include/asm/mach-lantiq/xway/xway_dma.h
++++ b/arch/mips/include/asm/mach-lantiq/xway/xway_dma.h
+@@ -8,7 +8,7 @@
+ #define LTQ_DMA_H__
+ #define LTQ_DESC_SIZE         0x08    /* each descriptor is 64bit */
+-#define LTQ_DESC_NUM          0x40    /* 64 descriptors / channel */
++#define LTQ_DESC_NUM          0xC0    /* 192 descriptors / channel */
+ #define LTQ_DMA_OWN           BIT(31) /* owner bit */
+ #define LTQ_DMA_C             BIT(30) /* complete bit */
diff --git a/target/linux/lantiq/patches-5.10/0714-net-lantiq_xrx200-increase-napi-poll-weigth.patch b/target/linux/lantiq/patches-5.10/0714-net-lantiq_xrx200-increase-napi-poll-weigth.patch
new file mode 100644 (file)
index 0000000..1fa49f4
--- /dev/null
@@ -0,0 +1,32 @@
+From 768818d772d5d4ddc0c7eb2e62848929270ab7a3 Mon Sep 17 00:00:00 2001
+From: Aleksander Jan Bajkowski <olek2@wp.pl>
+Date: Tue, 4 Jan 2022 16:11:43 +0100
+Subject: [PATCH 714/715] net: lantiq_xrx200: increase napi poll weigth
+
+NAT Performance results on BT Home Hub 5A (kernel 5.10.89, mtu 1500):
+
+       Down            Up
+Before 545 Mbps        625 Mbps
+After  577 Mbps        648 Mbps
+
+Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+---
+ drivers/net/ethernet/lantiq_xrx200.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/lantiq_xrx200.c
++++ b/drivers/net/ethernet/lantiq_xrx200.c
+@@ -606,8 +606,10 @@ static int xrx200_probe(struct platform_
+                        PMAC_HD_CTL);
+       /* setup NAPI */
+-      netif_napi_add(net_dev, &priv->chan_rx.napi, xrx200_poll_rx, 32);
+-      netif_tx_napi_add(net_dev, &priv->chan_tx.napi, xrx200_tx_housekeeping, 32);
++      netif_napi_add(net_dev, &priv->chan_rx.napi, xrx200_poll_rx,
++                     NAPI_POLL_WEIGHT);
++      netif_tx_napi_add(net_dev, &priv->chan_tx.napi, xrx200_tx_housekeeping,
++                        NAPI_POLL_WEIGHT);
+       platform_set_drvdata(pdev, priv);
diff --git a/target/linux/lantiq/patches-5.10/0715-net-lantiq_xrx200-convert-to-build_skb.patch b/target/linux/lantiq/patches-5.10/0715-net-lantiq_xrx200-convert-to-build_skb.patch
new file mode 100644 (file)
index 0000000..b2b0148
--- /dev/null
@@ -0,0 +1,206 @@
+From e015593573b3e3f74bd8a63c05fa92902194a354 Mon Sep 17 00:00:00 2001
+From: Aleksander Jan Bajkowski <olek2@wp.pl>
+Date: Tue, 4 Jan 2022 16:11:44 +0100
+Subject: [PATCH 715/715] net: lantiq_xrx200: convert to build_skb
+
+We can increase the efficiency of rx path by using buffers to receive
+packets then build SKBs around them just before passing into the network
+stack. In contrast, preallocating SKBs too early reduces CPU cache
+efficiency.
+
+NAT Performance results on BT Home Hub 5A (kernel 5.10.89, mtu 1500):
+
+       Down            Up
+Before 577 Mbps        648 Mbps
+After  624 Mbps        695 Mbps
+
+Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+---
+ drivers/net/ethernet/lantiq_xrx200.c | 56 ++++++++++++++++++----------
+ 1 file changed, 36 insertions(+), 20 deletions(-)
+
+--- a/drivers/net/ethernet/lantiq_xrx200.c
++++ b/drivers/net/ethernet/lantiq_xrx200.c
+@@ -63,7 +63,11 @@ struct xrx200_chan {
+       struct napi_struct napi;
+       struct ltq_dma_channel dma;
+-      struct sk_buff *skb[LTQ_DESC_NUM];
++
++      union {
++              struct sk_buff *skb[LTQ_DESC_NUM];
++              void *rx_buff[LTQ_DESC_NUM];
++      };
+       struct sk_buff *skb_head;
+       struct sk_buff *skb_tail;
+@@ -78,6 +82,7 @@ struct xrx200_priv {
+       struct xrx200_chan chan_rx;
+       u16 rx_buf_size;
++      u16 rx_skb_size;
+       struct net_device *net_dev;
+       struct device *dev;
+@@ -115,6 +120,12 @@ static int xrx200_buffer_size(int mtu)
+       return round_up(xrx200_max_frame_len(mtu), 4 * XRX200_DMA_BURST_LEN);
+ }
++static int xrx200_skb_size(u16 buf_size)
++{
++      return SKB_DATA_ALIGN(buf_size + NET_SKB_PAD + NET_IP_ALIGN) +
++              SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
++}
++
+ /* drop all the packets from the DMA ring */
+ static void xrx200_flush_dma(struct xrx200_chan *ch)
+ {
+@@ -173,30 +184,29 @@ static int xrx200_close(struct net_devic
+       return 0;
+ }
+-static int xrx200_alloc_skb(struct xrx200_chan *ch)
++static int xrx200_alloc_buf(struct xrx200_chan *ch, void *(*alloc)(unsigned int size))
+ {
+-      struct sk_buff *skb = ch->skb[ch->dma.desc];
++      void *buf = ch->rx_buff[ch->dma.desc];
+       struct xrx200_priv *priv = ch->priv;
+       dma_addr_t mapping;
+       int ret = 0;
+-      ch->skb[ch->dma.desc] = netdev_alloc_skb_ip_align(priv->net_dev,
+-                                                        priv->rx_buf_size);
+-      if (!ch->skb[ch->dma.desc]) {
++      ch->rx_buff[ch->dma.desc] = alloc(priv->rx_skb_size);
++      if (!ch->rx_buff[ch->dma.desc]) {
+               ret = -ENOMEM;
+               goto skip;
+       }
+-      mapping = dma_map_single(priv->dev, ch->skb[ch->dma.desc]->data,
++      mapping = dma_map_single(priv->dev, ch->rx_buff[ch->dma.desc],
+                                priv->rx_buf_size, DMA_FROM_DEVICE);
+       if (unlikely(dma_mapping_error(priv->dev, mapping))) {
+-              dev_kfree_skb_any(ch->skb[ch->dma.desc]);
+-              ch->skb[ch->dma.desc] = skb;
++              skb_free_frag(ch->rx_buff[ch->dma.desc]);
++              ch->rx_buff[ch->dma.desc] = buf;
+               ret = -ENOMEM;
+               goto skip;
+       }
+-      ch->dma.desc_base[ch->dma.desc].addr = mapping;
++      ch->dma.desc_base[ch->dma.desc].addr = mapping + NET_SKB_PAD + NET_IP_ALIGN;
+       /* Make sure the address is written before we give it to HW */
+       wmb();
+ skip:
+@@ -210,13 +220,14 @@ static int xrx200_hw_receive(struct xrx2
+ {
+       struct xrx200_priv *priv = ch->priv;
+       struct ltq_dma_desc *desc = &ch->dma.desc_base[ch->dma.desc];
+-      struct sk_buff *skb = ch->skb[ch->dma.desc];
++      void *buf = ch->rx_buff[ch->dma.desc];
+       u32 ctl = desc->ctl;
+       int len = (ctl & LTQ_DMA_SIZE_MASK);
+       struct net_device *net_dev = priv->net_dev;
++      struct sk_buff *skb;
+       int ret;
+-      ret = xrx200_alloc_skb(ch);
++      ret = xrx200_alloc_buf(ch, napi_alloc_frag);
+       ch->dma.desc++;
+       ch->dma.desc %= LTQ_DESC_NUM;
+@@ -227,19 +238,21 @@ static int xrx200_hw_receive(struct xrx2
+               return ret;
+       }
++      skb = build_skb(buf, priv->rx_skb_size);
++      skb_reserve(skb, NET_SKB_PAD);
+       skb_put(skb, len);
+       /* add buffers to skb via skb->frag_list */
+       if (ctl & LTQ_DMA_SOP) {
+               ch->skb_head = skb;
+               ch->skb_tail = skb;
++              skb_reserve(skb, NET_IP_ALIGN);
+       } else if (ch->skb_head) {
+               if (ch->skb_head == ch->skb_tail)
+                       skb_shinfo(ch->skb_tail)->frag_list = skb;
+               else
+                       ch->skb_tail->next = skb;
+               ch->skb_tail = skb;
+-              skb_reserve(ch->skb_tail, -NET_IP_ALIGN);
+               ch->skb_head->len += skb->len;
+               ch->skb_head->data_len += skb->len;
+               ch->skb_head->truesize += skb->truesize;
+@@ -395,12 +408,13 @@ xrx200_change_mtu(struct net_device *net
+       struct xrx200_chan *ch_rx = &priv->chan_rx;
+       int old_mtu = net_dev->mtu;
+       bool running = false;
+-      struct sk_buff *skb;
++      void *buff;
+       int curr_desc;
+       int ret = 0;
+       net_dev->mtu = new_mtu;
+       priv->rx_buf_size = xrx200_buffer_size(new_mtu);
++      priv->rx_skb_size = xrx200_skb_size(priv->rx_buf_size);
+       if (new_mtu <= old_mtu)
+               return ret;
+@@ -416,14 +430,15 @@ xrx200_change_mtu(struct net_device *net
+       for (ch_rx->dma.desc = 0; ch_rx->dma.desc < LTQ_DESC_NUM;
+            ch_rx->dma.desc++) {
+-              skb = ch_rx->skb[ch_rx->dma.desc];
+-              ret = xrx200_alloc_skb(ch_rx);
++              buff = ch_rx->rx_buff[ch_rx->dma.desc];
++              ret = xrx200_alloc_buf(ch_rx, netdev_alloc_frag);
+               if (ret) {
+                       net_dev->mtu = old_mtu;
+                       priv->rx_buf_size = xrx200_buffer_size(old_mtu);
++                      priv->rx_skb_size = xrx200_skb_size(priv->rx_buf_size);
+                       break;
+               }
+-              dev_kfree_skb_any(skb);
++              skb_free_frag(buff);
+       }
+       ch_rx->dma.desc = curr_desc;
+@@ -476,7 +491,7 @@ static int xrx200_dma_init(struct xrx200
+       ltq_dma_alloc_rx(&ch_rx->dma);
+       for (ch_rx->dma.desc = 0; ch_rx->dma.desc < LTQ_DESC_NUM;
+            ch_rx->dma.desc++) {
+-              ret = xrx200_alloc_skb(ch_rx);
++              ret = xrx200_alloc_buf(ch_rx, netdev_alloc_frag);
+               if (ret)
+                       goto rx_free;
+       }
+@@ -511,7 +526,7 @@ rx_ring_free:
+       /* free the allocated RX ring */
+       for (i = 0; i < LTQ_DESC_NUM; i++) {
+               if (priv->chan_rx.skb[i])
+-                      dev_kfree_skb_any(priv->chan_rx.skb[i]);
++                      skb_free_frag(priv->chan_rx.rx_buff[i]);
+       }
+ rx_free:
+@@ -528,7 +543,7 @@ static void xrx200_hw_cleanup(struct xrx
+       /* free the allocated RX ring */
+       for (i = 0; i < LTQ_DESC_NUM; i++)
+-              dev_kfree_skb_any(priv->chan_rx.skb[i]);
++              skb_free_frag(priv->chan_rx.rx_buff[i]);
+ }
+ static int xrx200_probe(struct platform_device *pdev)
+@@ -554,6 +569,7 @@ static int xrx200_probe(struct platform_
+       net_dev->min_mtu = ETH_ZLEN;
+       net_dev->max_mtu = XRX200_DMA_DATA_LEN - xrx200_max_frame_len(0);
+       priv->rx_buf_size = xrx200_buffer_size(ETH_DATA_LEN);
++      priv->rx_skb_size = xrx200_skb_size(priv->rx_buf_size);
+       /* load the memory ranges */
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);