--- /dev/null
+From: Alexander Lobakin <bloodyreaper@yandex.ru>
+Date: Tue, 21 Apr 2020 16:41:08 +0300
+Subject: [PATCH] net: dsa: add GRO support via gro_cells
+
+gro_cells lib is used by different encapsulating netdevices, such as
+geneve, macsec, vxlan etc. to speed up decapsulated traffic processing.
+CPU tag is a sort of "encapsulation", and we can use the same mechs to
+greatly improve overall DSA performance.
+skbs are passed to the GRO layer after removing CPU tags, so we don't
+need any new packet offload types as it was firstly proposed by me in
+the first GRO-over-DSA variant [1].
+
+The size of struct gro_cells is sizeof(void *), so hot struct
+dsa_slave_priv becomes only 4/8 bytes bigger, and all critical fields
+remain in one 32-byte cacheline.
+The other positive side effect is that drivers for network devices
+that can be shipped as CPU ports of DSA-driven switches can now use
+napi_gro_frags() to pass skbs to kernel. Packets built that way are
+completely non-linear and are likely being dropped without GRO.
+
+This was tested on to-be-mainlined-soon Ethernet driver that uses
+napi_gro_frags(), and the overall performance was on par with the
+variant from [1], sometimes even better due to minimal overhead.
+net.core.gro_normal_batch tuning may help to push it to the limit
+on particular setups and platforms.
+
+iperf3 IPoE VLAN NAT TCP forwarding (port1.218 -> port0) setup
+on 1.2 GHz MIPS board:
+
+5.7-rc2 baseline:
+
+[ID] Interval Transfer Bitrate Retr
+[ 5] 0.00-120.01 sec 9.00 GBytes 644 Mbits/sec 413 sender
+[ 5] 0.00-120.00 sec 8.99 GBytes 644 Mbits/sec receiver
+
+Iface RX packets TX packets
+eth0 7097731 7097702
+port0 426050 6671829
+port1 6671681 425862
+port1.218 6671677 425851
+
+With this patch:
+
+[ID] Interval Transfer Bitrate Retr
+[ 5] 0.00-120.01 sec 12.2 GBytes 870 Mbits/sec 122 sender
+[ 5] 0.00-120.00 sec 12.2 GBytes 870 Mbits/sec receiver
+
+Iface RX packets TX packets
+eth0 9474792 9474777
+port0 455200 353288
+port1 9019592 455035
+port1.218 353144 455024
+
+v2:
+ - Add some performance examples in the commit message;
+ - No functional changes.
+
+[1] https://lore.kernel.org/netdev/20191230143028.27313-1-alobakin@dlink.ru/
+
+Signed-off-by: Alexander Lobakin <bloodyreaper@yandex.ru>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+
+--- a/net/dsa/Kconfig
++++ b/net/dsa/Kconfig
+@@ -9,6 +9,7 @@ menuconfig NET_DSA
+ tristate "Distributed Switch Architecture"
+ depends on HAVE_NET_DSA
+ depends on BRIDGE || BRIDGE=n
++ select GRO_CELLS
+ select NET_SWITCHDEV
+ select PHYLINK
+ select NET_DEVLINK
+--- a/net/dsa/dsa.c
++++ b/net/dsa/dsa.c
+@@ -238,7 +238,7 @@ static int dsa_switch_rcv(struct sk_buff
+ if (dsa_skb_defer_rx_timestamp(p, skb))
+ return 0;
+
+- netif_receive_skb(skb);
++ gro_cells_receive(&p->gcells, skb);
+
+ return 0;
+ }
+--- a/net/dsa/dsa_priv.h
++++ b/net/dsa/dsa_priv.h
+@@ -11,6 +11,7 @@
+ #include <linux/netdevice.h>
+ #include <linux/netpoll.h>
+ #include <net/dsa.h>
++#include <net/gro_cells.h>
+
+ enum {
+ DSA_NOTIFIER_AGEING_TIME,
+@@ -68,6 +69,8 @@ struct dsa_slave_priv {
+
+ struct pcpu_sw_netstats *stats64;
+
++ struct gro_cells gcells;
++
+ /* DSA port data, such as switch, port index, etc. */
+ struct dsa_port *dp;
+
+--- a/net/dsa/slave.c
++++ b/net/dsa/slave.c
+@@ -1431,6 +1431,11 @@ int dsa_slave_create(struct dsa_port *po
+ free_netdev(slave_dev);
+ return -ENOMEM;
+ }
++
++ ret = gro_cells_init(&p->gcells, slave_dev);
++ if (ret)
++ goto out_free;
++
+ p->dp = port;
+ INIT_LIST_HEAD(&p->mall_tc_list);
+ INIT_WORK(&port->xmit_work, dsa_port_xmit_work);
+@@ -1443,7 +1448,7 @@ int dsa_slave_create(struct dsa_port *po
+ ret = dsa_slave_phy_setup(slave_dev);
+ if (ret) {
+ netdev_err(master, "error %d setting up slave phy\n", ret);
+- goto out_free;
++ goto out_gcells;
+ }
+
+ dsa_slave_notify(slave_dev, DSA_PORT_REGISTER);
+@@ -1462,6 +1467,8 @@ out_phy:
+ phylink_disconnect_phy(p->dp->pl);
+ rtnl_unlock();
+ phylink_destroy(p->dp->pl);
++out_gcells:
++ gro_cells_destroy(&p->gcells);
+ out_free:
+ free_percpu(p->stats64);
+ free_netdev(slave_dev);
+@@ -1482,6 +1489,7 @@ void dsa_slave_destroy(struct net_device
+ dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
+ unregister_netdev(slave_dev);
+ phylink_destroy(dp->pl);
++ gro_cells_destroy(&p->gcells);
+ free_percpu(p->stats64);
+ free_netdev(slave_dev);
+ }