ramips: improve ethernet driver performance with GRO/TSO
author     Felix Fietkau <nbd@nbd.name>    Wed, 6 Jun 2018 07:56:13 +0000 (09:56 +0200)
committer  Felix Fietkau <nbd@nbd.name>    Tue, 19 Jun 2018 07:45:28 +0000 (09:45 +0200)
GRO stores merged packets as a fraglist. If they are routed back out
through the ethernet device, they need to be re-segmented in software
if the driver does not support transmitting fraglists.

Add the missing support for that, along with a missing feature flag
that allows full routed GRO->TSO offload.

This considerably reduces CPU utilization when routing.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
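
For context on why NETIF_F_FRAGLIST matters here: before an skb reaches the
driver's xmit routine, the core transmit path decides whether it has to
flatten a fraglist. A minimal sketch of that check, paraphrased from the
4.14-era net/core/dev.c (illustrative, not part of this patch):

    /* A nonlinear skb must be linearized (or, for GSO skbs,
     * software-segmented) when the device lacks the matching feature
     * bits. Advertising NETIF_F_FRAGLIST lets routed GRO superframes
     * skip this copy entirely. */
    static bool skb_needs_linearize(struct sk_buff *skb,
                                    netdev_features_t features)
    {
            return skb_is_nonlinear(skb) &&
                   ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
                    (skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG)));
    }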
target/linux/ramips/files-4.14/drivers/net/ethernet/mediatek/mtk_eth_soc.c
target/linux/ramips/files-4.14/drivers/net/ethernet/mediatek/mtk_eth_soc.h
target/linux/ramips/files-4.14/drivers/net/ethernet/mediatek/soc_mt7621.c

diff --git a/target/linux/ramips/files-4.14/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/target/linux/ramips/files-4.14/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index d298fa9a97071717093478519e15cdeda7dcab80..e68ca204ed9e7d583b5850e2ccaf9fd22477509a 100644
@@ -307,24 +307,20 @@ no_rx_mem:
 
 static void fe_txd_unmap(struct device *dev, struct fe_tx_buf *tx_buf)
 {
-       if (tx_buf->flags & FE_TX_FLAGS_SINGLE0) {
-               dma_unmap_single(dev,
-                                dma_unmap_addr(tx_buf, dma_addr0),
-                                dma_unmap_len(tx_buf, dma_len0),
-                                DMA_TO_DEVICE);
-       } else if (tx_buf->flags & FE_TX_FLAGS_PAGE0) {
+       if (dma_unmap_len(tx_buf, dma_len0))
                dma_unmap_page(dev,
                               dma_unmap_addr(tx_buf, dma_addr0),
                               dma_unmap_len(tx_buf, dma_len0),
                               DMA_TO_DEVICE);
-       }
-       if (tx_buf->flags & FE_TX_FLAGS_PAGE1)
+
+       if (dma_unmap_len(tx_buf, dma_len1))
                dma_unmap_page(dev,
                               dma_unmap_addr(tx_buf, dma_addr1),
                               dma_unmap_len(tx_buf, dma_len1),
                               DMA_TO_DEVICE);
 
-       tx_buf->flags = 0;
+       dma_unmap_len_set(tx_buf, dma_len0, 0);
+       dma_unmap_len_set(tx_buf, dma_len1, 0);
        if (tx_buf->skb && (tx_buf->skb != (struct sk_buff *)DMA_DUMMY_DESC))
                dev_kfree_skb_any(tx_buf->skb);
        tx_buf->skb = NULL;
@@ -559,6 +555,54 @@ static inline u32 fe_empty_txd(struct fe_tx_ring *ring)
                         (ring->tx_ring_size - 1)));
 }
 
+static int fe_tx_dma_map_page(struct device *dev, struct fe_tx_buf *tx_buf,
+                             struct fe_tx_dma *txd, int idx,
+                             struct page *page, size_t offset, size_t size)
+{
+       dma_addr_t mapped_addr;
+
+       mapped_addr = dma_map_page(dev, page, offset, size, DMA_TO_DEVICE);
+       if (unlikely(dma_mapping_error(dev, mapped_addr)))
+               return -EIO;
+
+       if (idx & 1) {
+               txd->txd3 = mapped_addr;
+               txd->txd2 |= TX_DMA_PLEN1(size);
+               dma_unmap_addr_set(tx_buf, dma_addr1, mapped_addr);
+               dma_unmap_len_set(tx_buf, dma_len1, size);
+       } else {
+               tx_buf->skb = (struct sk_buff *)DMA_DUMMY_DESC;
+               txd->txd1 = mapped_addr;
+               txd->txd2 = TX_DMA_PLEN0(size);
+               dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr);
+               dma_unmap_len_set(tx_buf, dma_len0, size);
+       }
+       return 0;
+}
+
+static int fe_tx_dma_map_skb(struct device *dev, struct fe_tx_buf *tx_buf,
+                            struct fe_tx_dma *txd, int idx,
+                            struct sk_buff *skb)
+{
+       struct page *page = virt_to_page(skb->data);
+       size_t offset = offset_in_page(skb->data);
+       size_t size = skb_headlen(skb);
+
+       return fe_tx_dma_map_page(dev, tx_buf, txd, idx, page, offset, size);
+}
+
+static inline struct sk_buff *
+fe_next_frag(struct sk_buff *head, struct sk_buff *skb)
+{
+       if (skb != head)
+               return skb->next;
+
+       if (skb_has_frag_list(skb))
+               return skb_shinfo(skb)->frag_list;
+
+       return NULL;
+}
+
 static int fe_tx_map_dma(struct sk_buff *skb, struct net_device *dev,
                         int tx_num, struct fe_tx_ring *ring)
 {
@@ -566,7 +610,7 @@ static int fe_tx_map_dma(struct sk_buff *skb, struct net_device *dev,
        struct skb_frag_struct *frag;
        struct fe_tx_dma txd, *ptxd;
        struct fe_tx_buf *tx_buf;
-       dma_addr_t mapped_addr;
+       struct sk_buff *head = skb;
        unsigned int nr_frags;
        u32 def_txd4;
        int i, j, k, frag_size, frag_map_size, offset;
@@ -574,7 +618,6 @@ static int fe_tx_map_dma(struct sk_buff *skb, struct net_device *dev,
        tx_buf = &ring->tx_buf[ring->tx_next_idx];
        memset(tx_buf, 0, sizeof(*tx_buf));
        memset(&txd, 0, sizeof(txd));
-       nr_frags = skb_shinfo(skb)->nr_frags;
 
        /* init tx descriptor */
        if (priv->soc->tx_dma)
@@ -613,82 +656,68 @@ static int fe_tx_map_dma(struct sk_buff *skb, struct net_device *dev,
                }
        }
 
-       mapped_addr = dma_map_single(&dev->dev, skb->data,
-                                    skb_headlen(skb), DMA_TO_DEVICE);
-       if (unlikely(dma_mapping_error(&dev->dev, mapped_addr)))
-               goto err_out;
-       txd.txd1 = mapped_addr;
-       txd.txd2 = TX_DMA_PLEN0(skb_headlen(skb));
+       k = 0;
+       j = ring->tx_next_idx;
 
-       tx_buf->flags |= FE_TX_FLAGS_SINGLE0;
-       dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr);
-       dma_unmap_len_set(tx_buf, dma_len0, skb_headlen(skb));
+next_frag:
+       if (skb_headlen(skb)) {
+               if (fe_tx_dma_map_skb(&dev->dev, tx_buf, &txd, k++, skb))
+                       goto err_dma;
+       }
 
        /* TX SG offload */
-       j = ring->tx_next_idx;
-       k = 0;
+       nr_frags = skb_shinfo(skb)->nr_frags;
        for (i = 0; i < nr_frags; i++) {
-               offset = 0;
+               struct page *page;
+
                frag = &skb_shinfo(skb)->frags[i];
                frag_size = skb_frag_size(frag);
+               offset = frag->page_offset;
+               page = skb_frag_page(frag);
 
                while (frag_size > 0) {
                        frag_map_size = min(frag_size, TX_DMA_BUF_LEN);
-                       mapped_addr = skb_frag_dma_map(&dev->dev, frag, offset,
-                                                      frag_map_size,
-                                                      DMA_TO_DEVICE);
-                       if (unlikely(dma_mapping_error(&dev->dev, mapped_addr)))
-                               goto err_dma;
-
-                       if (k & 0x1) {
-                               j = NEXT_TX_DESP_IDX(j);
-                               txd.txd1 = mapped_addr;
-                               txd.txd2 = TX_DMA_PLEN0(frag_map_size);
+                       if (!(k & 0x1)) {
+                               fe_set_txd(&txd, &ring->tx_dma[j]);
+                               memset(&txd, 0, sizeof(txd));
                                txd.txd4 = def_txd4;
-
+                               j = NEXT_TX_DESP_IDX(j);
                                tx_buf = &ring->tx_buf[j];
-                               memset(tx_buf, 0, sizeof(*tx_buf));
-
-                               tx_buf->flags |= FE_TX_FLAGS_PAGE0;
-                               dma_unmap_addr_set(tx_buf, dma_addr0,
-                                                  mapped_addr);
-                               dma_unmap_len_set(tx_buf, dma_len0,
-                                                 frag_map_size);
-                       } else {
-                               txd.txd3 = mapped_addr;
-                               txd.txd2 |= TX_DMA_PLEN1(frag_map_size);
-
-                               tx_buf->skb = (struct sk_buff *)DMA_DUMMY_DESC;
-                               tx_buf->flags |= FE_TX_FLAGS_PAGE1;
-                               dma_unmap_addr_set(tx_buf, dma_addr1,
-                                                  mapped_addr);
-                               dma_unmap_len_set(tx_buf, dma_len1,
-                                                 frag_map_size);
-
-                               if (!((i == (nr_frags - 1)) &&
-                                     (frag_map_size == frag_size))) {
-                                       fe_set_txd(&txd, &ring->tx_dma[j]);
-                                       memset(&txd, 0, sizeof(txd));
-                               }
                        }
+
+                       if (fe_tx_dma_map_page(&dev->dev, tx_buf, &txd, k++,
+                                              page, offset, frag_map_size))
+                               goto err_dma;
+
                        frag_size -= frag_map_size;
                        offset += frag_map_size;
-                       k++;
                }
        }
 
+       skb = fe_next_frag(head, skb);
+       if (skb) {
+               if (!(k & 0x1)) {
+                       fe_set_txd(&txd, &ring->tx_dma[j]);
+                       memset(&txd, 0, sizeof(txd));
+                       txd.txd4 = def_txd4;
+                       j = NEXT_TX_DESP_IDX(j);
+                       tx_buf = &ring->tx_buf[j];
+               }
+               goto next_frag;
+       }
+
        /* set last segment */
        if (k & 0x1)
-               txd.txd2 |= TX_DMA_LS1;
-       else
                txd.txd2 |= TX_DMA_LS0;
+       else
+               txd.txd2 |= TX_DMA_LS1;
        fe_set_txd(&txd, &ring->tx_dma[j]);
 
        /* store skb to cleanup */
-       tx_buf->skb = skb;
+       tx_buf->skb = head;
 
-       netdev_sent_queue(dev, skb->len);
-       skb_tx_timestamp(skb);
+       netdev_sent_queue(dev, head->len);
+       skb_tx_timestamp(head);
 
        ring->tx_next_idx = NEXT_TX_DESP_IDX(j);
        /* make sure that all changes to the dma ring are flushed before we
@@ -702,7 +731,7 @@ static int fe_tx_map_dma(struct sk_buff *skb, struct net_device *dev,
                        netif_wake_queue(dev);
        }
 
-       if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)) || !skb->xmit_more)
+       if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)) || !head->xmit_more)
                fe_reg_w32(ring->tx_next_idx, FE_REG_TX_CTX_IDX0);
 
        return 0;
@@ -762,10 +791,12 @@ static inline int fe_skb_padto(struct sk_buff *skb, struct fe_priv *priv)
 
 static inline int fe_cal_txd_req(struct sk_buff *skb)
 {
-       int i, nfrags;
+       struct sk_buff *head = skb;
+       int i, nfrags = 0;
        struct skb_frag_struct *frag;
 
-       nfrags = 1;
+next_frag:
+       nfrags++;
        if (skb_is_gso(skb)) {
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                        frag = &skb_shinfo(skb)->frags[i];
@@ -775,6 +806,10 @@ static inline int fe_cal_txd_req(struct sk_buff *skb)
                nfrags += skb_shinfo(skb)->nr_frags;
        }
 
+       skb = fe_next_frag(head, skb);
+       if (skb)
+               goto next_frag;
+
        return DIV_ROUND_UP(nfrags, 2);
 }
 
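A worked example of the descriptor accounting in fe_cal_txd_req() above; the
numbers are hypothetical and only illustrate the arithmetic:

    /* Each PDMA TX descriptor carries two buffers: PLEN0 (address in
     * txd1, length in txd2) and PLEN1 (address in txd3, length in
     * txd2). A GRO superframe with a linear head plus two linear
     * frag_list segments therefore maps three buffers:
     *
     *   nfrags      = 1 (head) + 1 (segment 1) + 1 (segment 2) = 3
     *   descriptors = DIV_ROUND_UP(3, 2)                       = 2
     *
     * Buffers 0 and 1 share the first descriptor; buffer 2 lands in
     * PLEN0 of the second, so k ends up odd and fe_tx_map_dma() sets
     * TX_DMA_LS0 as the last-segment mark. */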
diff --git a/target/linux/ramips/files-4.14/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/target/linux/ramips/files-4.14/drivers/net/ethernet/mediatek/mtk_eth_soc.h
index dfaa5fd9ea10d64646ac35cc898d9f4959aa8ed4..517d8ba4dc4bb3323188dce7ff7828d26be5b315 100644
@@ -435,19 +435,12 @@ struct fe_hw_stats {
 #undef _FE
 };
 
-enum fe_tx_flags {
-       FE_TX_FLAGS_SINGLE0     = 0x01,
-       FE_TX_FLAGS_PAGE0       = 0x02,
-       FE_TX_FLAGS_PAGE1       = 0x04,
-};
-
 struct fe_tx_buf {
        struct sk_buff *skb;
-       u32 flags;
        DEFINE_DMA_UNMAP_ADDR(dma_addr0);
-       DEFINE_DMA_UNMAP_LEN(dma_len0);
        DEFINE_DMA_UNMAP_ADDR(dma_addr1);
-       DEFINE_DMA_UNMAP_LEN(dma_len1);
+       u16 dma_len0;
+       u16 dma_len1;
 };
 
 struct fe_tx_ring {
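
In the header change above, the mapped length itself now doubles as the
per-slot state: a slot is mapped iff its recorded length is non-zero, which
makes the old FE_TX_FLAGS_* bits redundant and lets fe_txd_unmap() reset a
slot just by zeroing dma_len0/dma_len1. A minimal sketch of the pattern, with
hypothetical names (not from this patch):

    /* Hypothetical slot type illustrating "length as mapped flag". */
    struct tx_slot {
            DEFINE_DMA_UNMAP_ADDR(addr);    /* dma_addr_t, config-dependent */
            u16 len;                        /* non-zero => mapped */
    };

    static void tx_slot_unmap(struct device *dev, struct tx_slot *s)
    {
            if (!s->len)
                    return;         /* never mapped, or already reset */
            dma_unmap_page(dev, dma_unmap_addr(s, addr), s->len,
                           DMA_TO_DEVICE);
            s->len = 0;             /* makes the unmap idempotent */
    }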
diff --git a/target/linux/ramips/files-4.14/drivers/net/ethernet/mediatek/soc_mt7621.c b/target/linux/ramips/files-4.14/drivers/net/ethernet/mediatek/soc_mt7621.c
index 00dd45e01f6659047b6f56a73383791ad34b95bd..96d3909a48a3fbb5fadf83a2aac60d7ab1f15d0d 100644
@@ -143,7 +143,8 @@ static void mt7621_init_data(struct fe_soc_data *data,
 
        netdev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
                NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_SG | NETIF_F_TSO |
-               NETIF_F_TSO6 | NETIF_F_IPV6_CSUM;
+               NETIF_F_TSO6 | NETIF_F_IPV6_CSUM | NETIF_F_FRAGLIST |
+               NETIF_F_TSO_MANGLEID;
 }
 
 static void mt7621_set_mac(struct fe_priv *priv, unsigned char *mac)
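
On the NETIF_F_TSO_MANGLEID side: GRO marks merged TCP flows whose segments
carried a constant IP ID with SKB_GSO_TCP_FIXEDID, and the stack only keeps
such a superframe on the hardware TSO path if the device advertises the
matching feature bit. Roughly how the mapping works, paraphrased from the
4.14 include/linux/netdevice.h (illustrative only, build-time assertions
omitted):

    /* Each SKB_GSO_* type bit lines up with a NETIF_F_GSO_* feature
     * bit; SKB_GSO_TCP_FIXEDID pairs with NETIF_F_TSO_MANGLEID. If the
     * bit is missing, routed GRO->TSO traffic with fixed IP IDs falls
     * back to software segmentation. */
    static inline bool net_gso_ok(netdev_features_t features, int gso_type)
    {
            netdev_features_t feature =
                    (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT;

            return (features & feature) == feature;
    }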