From: Felix Fietkau Date: Sun, 26 Jul 2020 13:24:42 +0000 (+0200) Subject: kernel: add patch that adds support for running threaded NAPI poll functions X-Git-Url: http://git.lede-project.org./?a=commitdiff_plain;h=f51f18e0998568e6a53c834b50f8fa1b20f634f3;p=openwrt%2Fstaging%2Faparcar.git kernel: add patch that adds support for running threaded NAPI poll functions This is helps on workloads with CPU intensive poll functions (e.g. 802.11) on multicore systems Signed-off-by: Felix Fietkau --- diff --git a/target/linux/generic/hack-4.14/721-phy_packets.patch b/target/linux/generic/hack-4.14/721-phy_packets.patch index e6e06554de..d67171054d 100644 --- a/target/linux/generic/hack-4.14/721-phy_packets.patch +++ b/target/linux/generic/hack-4.14/721-phy_packets.patch @@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h -@@ -1412,6 +1412,7 @@ enum netdev_priv_flags { +@@ -1415,6 +1415,7 @@ enum netdev_priv_flags { IFF_PHONY_HEADROOM = 1<<26, IFF_MACSEC = 1<<27, IFF_L3MDEV_RX_HANDLER = 1<<28, @@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN -@@ -1442,6 +1443,7 @@ enum netdev_priv_flags { +@@ -1445,6 +1446,7 @@ enum netdev_priv_flags { #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED #define IFF_MACSEC IFF_MACSEC #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER @@ -31,7 +31,7 @@ Signed-off-by: Felix Fietkau /** * struct net_device - The DEVICE structure. -@@ -1728,6 +1730,11 @@ struct net_device { +@@ -1731,6 +1733,11 @@ struct net_device { const struct xfrmdev_ops *xfrmdev_ops; #endif @@ -43,7 +43,7 @@ Signed-off-by: Felix Fietkau const struct header_ops *header_ops; unsigned int flags; -@@ -1802,6 +1809,10 @@ struct net_device { +@@ -1805,6 +1812,10 @@ struct net_device { struct mpls_dev __rcu *mpls_ptr; #endif @@ -101,7 +101,7 @@ Signed-off-by: Felix Fietkau help --- a/net/core/dev.c +++ b/net/core/dev.c -@@ -3000,10 +3000,20 @@ static int xmit_one(struct sk_buff *skb, +@@ -3001,10 +3001,20 @@ static int xmit_one(struct sk_buff *skb, if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) dev_queue_xmit_nit(skb, dev); diff --git a/target/linux/generic/hack-4.19/721-phy_packets.patch b/target/linux/generic/hack-4.19/721-phy_packets.patch index 2bb01718ad..9463ed22c3 100644 --- a/target/linux/generic/hack-4.19/721-phy_packets.patch +++ b/target/linux/generic/hack-4.19/721-phy_packets.patch @@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h -@@ -1514,6 +1514,7 @@ enum netdev_priv_flags { +@@ -1517,6 +1517,7 @@ enum netdev_priv_flags { IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, @@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN -@@ -1546,6 +1547,7 @@ enum netdev_priv_flags { +@@ -1549,6 +1550,7 @@ enum netdev_priv_flags { #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK @@ -31,7 +31,7 @@ Signed-off-by: Felix Fietkau /** * struct net_device - The DEVICE structure. -@@ -1846,6 +1848,11 @@ struct net_device { +@@ -1849,6 +1851,11 @@ struct net_device { const struct tlsdev_ops *tlsdev_ops; #endif @@ -43,7 +43,7 @@ Signed-off-by: Felix Fietkau const struct header_ops *header_ops; unsigned int flags; -@@ -1928,6 +1935,10 @@ struct net_device { +@@ -1931,6 +1938,10 @@ struct net_device { struct mpls_dev __rcu *mpls_ptr; #endif @@ -101,7 +101,7 @@ Signed-off-by: Felix Fietkau help --- a/net/core/dev.c +++ b/net/core/dev.c -@@ -3251,10 +3251,20 @@ static int xmit_one(struct sk_buff *skb, +@@ -3252,10 +3252,20 @@ static int xmit_one(struct sk_buff *skb, if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) dev_queue_xmit_nit(skb, dev); diff --git a/target/linux/generic/hack-5.4/721-phy_packets.patch b/target/linux/generic/hack-5.4/721-phy_packets.patch index b3008cbbcf..ed18da9e0c 100644 --- a/target/linux/generic/hack-5.4/721-phy_packets.patch +++ b/target/linux/generic/hack-5.4/721-phy_packets.patch @@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h -@@ -1546,6 +1546,7 @@ enum netdev_priv_flags { +@@ -1549,6 +1549,7 @@ enum netdev_priv_flags { IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, @@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN -@@ -1578,6 +1579,7 @@ enum netdev_priv_flags { +@@ -1581,6 +1582,7 @@ enum netdev_priv_flags { #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK @@ -31,7 +31,7 @@ Signed-off-by: Felix Fietkau /** * struct net_device - The DEVICE structure. -@@ -1879,6 +1881,11 @@ struct net_device { +@@ -1882,6 +1884,11 @@ struct net_device { const struct tlsdev_ops *tlsdev_ops; #endif @@ -43,7 +43,7 @@ Signed-off-by: Felix Fietkau const struct header_ops *header_ops; unsigned int flags; -@@ -1961,6 +1968,10 @@ struct net_device { +@@ -1964,6 +1971,10 @@ struct net_device { struct mpls_dev __rcu *mpls_ptr; #endif @@ -101,7 +101,7 @@ Signed-off-by: Felix Fietkau help --- a/net/core/dev.c +++ b/net/core/dev.c -@@ -3191,10 +3191,20 @@ static int xmit_one(struct sk_buff *skb, +@@ -3192,10 +3192,20 @@ static int xmit_one(struct sk_buff *skb, if (dev_nit_active(dev)) dev_queue_xmit_nit(skb, dev); diff --git a/target/linux/generic/pending-4.14/690-net-add-support-for-threaded-NAPI-polling.patch b/target/linux/generic/pending-4.14/690-net-add-support-for-threaded-NAPI-polling.patch new file mode 100644 index 0000000000..1a531cb8fc --- /dev/null +++ b/target/linux/generic/pending-4.14/690-net-add-support-for-threaded-NAPI-polling.patch @@ -0,0 +1,339 @@ +From: Felix Fietkau +Date: Sun, 26 Jul 2020 14:03:21 +0200 +Subject: [PATCH] net: add support for threaded NAPI polling + +For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI +poll function does not perform well. Since NAPI poll is bound to the CPU it +was scheduled from, we can easily end up with a few very busy CPUs spending +most of their time in softirq/ksoftirqd and some idle ones. + +Introduce threaded NAPI for such drivers based on a workqueue. The API is the +same except for using netif_threaded_napi_add instead of netif_napi_add. + +In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling +improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded +NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling +thread. + +With threaded NAPI it seems stable and consistent (and higher than the best +results I got without it). + +Based on a patch by Hillf Danton + +Cc: Hillf Danton +Signed-off-by: Felix Fietkau +--- + +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -326,6 +326,7 @@ struct napi_struct { + struct list_head dev_list; + struct hlist_node napi_hash_node; + unsigned int napi_id; ++ struct work_struct work; + }; + + enum { +@@ -336,6 +337,7 @@ enum { + NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ + NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ ++ NAPI_STATE_THREADED, /* Use threaded NAPI */ + }; + + enum { +@@ -346,6 +348,7 @@ enum { + NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), ++ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), + }; + + enum gro_result { +@@ -2093,6 +2096,26 @@ void netif_napi_add(struct net_device *d + int (*poll)(struct napi_struct *, int), int weight); + + /** ++ * netif_threaded_napi_add - initialize a NAPI context ++ * @dev: network device ++ * @napi: NAPI context ++ * @poll: polling function ++ * @weight: default weight ++ * ++ * This variant of netif_napi_add() should be used from drivers using NAPI ++ * with CPU intensive poll functions. ++ * This will schedule polling from a high priority workqueue ++ */ ++static inline void netif_threaded_napi_add(struct net_device *dev, ++ struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), ++ int weight) ++{ ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ netif_napi_add(dev, napi, poll, weight); ++} ++ ++/** + * netif_tx_napi_add - initialize a NAPI context + * @dev: network device + * @napi: NAPI context +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -160,6 +160,7 @@ static DEFINE_SPINLOCK(offload_lock); + struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; + struct list_head ptype_all __read_mostly; /* Taps */ + static struct list_head offload_base __read_mostly; ++static struct workqueue_struct *napi_workq __read_mostly; + + static int netif_rx_internal(struct sk_buff *skb); + static int call_netdevice_notifiers_info(unsigned long val, +@@ -5237,6 +5238,11 @@ void __napi_schedule(struct napi_struct + { + unsigned long flags; + ++ if (test_bit(NAPI_STATE_THREADED, &n->state)) { ++ queue_work(napi_workq, &n->work); ++ return; ++ } ++ + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); +@@ -5284,6 +5290,11 @@ EXPORT_SYMBOL(napi_schedule_prep); + */ + void __napi_schedule_irqoff(struct napi_struct *n) + { ++ if (test_bit(NAPI_STATE_THREADED, &n->state)) { ++ queue_work(napi_workq, &n->work); ++ return; ++ } ++ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + } + EXPORT_SYMBOL(__napi_schedule_irqoff); +@@ -5521,6 +5532,82 @@ static enum hrtimer_restart napi_watchdo + return HRTIMER_NORESTART; + } + ++static int __napi_poll(struct napi_struct *n, bool *repoll) ++{ ++ int work, weight; ++ ++ weight = n->weight; ++ ++ /* This NAPI_STATE_SCHED test is for avoiding a race ++ * with netpoll's poll_napi(). Only the entity which ++ * obtains the lock and sees NAPI_STATE_SCHED set will ++ * actually make the ->poll() call. Therefore we avoid ++ * accidentally calling ->poll() when NAPI is not scheduled. ++ */ ++ work = 0; ++ if (test_bit(NAPI_STATE_SCHED, &n->state)) { ++ work = n->poll(n, weight); ++ trace_napi_poll(n, work, weight); ++ } ++ ++ WARN_ON_ONCE(work > weight); ++ ++ if (likely(work < weight)) ++ return work; ++ ++ /* Drivers must not modify the NAPI state if they ++ * consume the entire weight. In such cases this code ++ * still "owns" the NAPI instance and therefore can ++ * move the instance around on the list at-will. ++ */ ++ if (unlikely(napi_disable_pending(n))) { ++ napi_complete(n); ++ return work; ++ } ++ ++ if (n->gro_list) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(n, HZ >= 1000); ++ } ++ ++ *repoll = true; ++ ++ return work; ++} ++ ++static void napi_workfn(struct work_struct *work) ++{ ++ struct napi_struct *n = container_of(work, struct napi_struct, work); ++ void *have; ++ ++ for (;;) { ++ bool repoll = false; ++ ++ local_bh_disable(); ++ ++ have = netpoll_poll_lock(n); ++ __napi_poll(n, &repoll); ++ netpoll_poll_unlock(have); ++ ++ local_bh_enable(); ++ ++ if (!repoll) ++ return; ++ ++ if (!need_resched()) ++ continue; ++ ++ /* ++ * have to pay for the latency of task switch even if ++ * napi is scheduled ++ */ ++ queue_work(napi_workq, work); ++ return; ++ } ++} ++ + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) + { +@@ -5540,6 +5627,7 @@ void netif_napi_add(struct net_device *d + #ifdef CONFIG_NETPOLL + napi->poll_owner = -1; + #endif ++ INIT_WORK(&napi->work, napi_workfn); + set_bit(NAPI_STATE_SCHED, &napi->state); + napi_hash_add(napi); + } +@@ -5565,6 +5653,7 @@ EXPORT_SYMBOL(napi_disable); + void netif_napi_del(struct napi_struct *napi) + { + might_sleep(); ++ cancel_work_sync(&napi->work); + if (napi_hash_del(napi)) + synchronize_net(); + list_del_init(&napi->dev_list); +@@ -5578,48 +5667,18 @@ EXPORT_SYMBOL(netif_napi_del); + + static int napi_poll(struct napi_struct *n, struct list_head *repoll) + { ++ bool do_repoll = false; + void *have; +- int work, weight; ++ int work; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + +- weight = n->weight; +- +- /* This NAPI_STATE_SCHED test is for avoiding a race +- * with netpoll's poll_napi(). Only the entity which +- * obtains the lock and sees NAPI_STATE_SCHED set will +- * actually make the ->poll() call. Therefore we avoid +- * accidentally calling ->poll() when NAPI is not scheduled. +- */ +- work = 0; +- if (test_bit(NAPI_STATE_SCHED, &n->state)) { +- work = n->poll(n, weight); +- trace_napi_poll(n, work, weight); +- } +- +- WARN_ON_ONCE(work > weight); +- +- if (likely(work < weight)) +- goto out_unlock; ++ work = __napi_poll(n, &do_repoll); + +- /* Drivers must not modify the NAPI state if they +- * consume the entire weight. In such cases this code +- * still "owns" the NAPI instance and therefore can +- * move the instance around on the list at-will. +- */ +- if (unlikely(napi_disable_pending(n))) { +- napi_complete(n); ++ if (!do_repoll) + goto out_unlock; +- } +- +- if (n->gro_list) { +- /* flush too old packets +- * If HZ < 1000, flush all packets. +- */ +- napi_gro_flush(n, HZ >= 1000); +- } + + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. +@@ -8855,6 +8914,10 @@ static int __init net_dev_init(void) + sd->backlog.weight = weight_p; + } + ++ napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI, ++ WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS); ++ BUG_ON(!napi_workq); ++ + dev_boot_phase = 0; + + /* The loopback device is special if any other network devices +--- a/net/core/net-sysfs.c ++++ b/net/core/net-sysfs.c +@@ -441,6 +441,52 @@ static ssize_t proto_down_store(struct d + } + NETDEVICE_SHOW_RW(proto_down, fmt_dec); + ++static int change_napi_threaded(struct net_device *dev, unsigned long val) ++{ ++ struct napi_struct *napi; ++ ++ if (list_empty(&dev->napi_list)) ++ return -EOPNOTSUPP; ++ ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (val) ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ else ++ clear_bit(NAPI_STATE_THREADED, &napi->state); ++ } ++ ++ return 0; ++} ++ ++static ssize_t napi_threaded_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t len) ++{ ++ return netdev_store(dev, attr, buf, len, change_napi_threaded); ++} ++ ++static ssize_t napi_threaded_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ struct net_device *netdev = to_net_dev(dev); ++ struct napi_struct *napi; ++ bool enabled = false; ++ ++ if (!rtnl_trylock()) ++ return restart_syscall(); ++ ++ list_for_each_entry(napi, &netdev->napi_list, dev_list) { ++ if (test_bit(NAPI_STATE_THREADED, &napi->state)) ++ enabled = true; ++ } ++ ++ rtnl_unlock(); ++ ++ return sprintf(buf, fmt_dec, enabled); ++} ++static DEVICE_ATTR_RW(napi_threaded); ++ + static ssize_t phys_port_id_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -536,6 +582,7 @@ static struct attribute *net_class_attrs + &dev_attr_flags.attr, + &dev_attr_tx_queue_len.attr, + &dev_attr_gro_flush_timeout.attr, ++ &dev_attr_napi_threaded.attr, + &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, + &dev_attr_phys_switch_id.attr, diff --git a/target/linux/generic/pending-4.19/690-net-add-support-for-threaded-NAPI-polling.patch b/target/linux/generic/pending-4.19/690-net-add-support-for-threaded-NAPI-polling.patch new file mode 100644 index 0000000000..b5c701c80b --- /dev/null +++ b/target/linux/generic/pending-4.19/690-net-add-support-for-threaded-NAPI-polling.patch @@ -0,0 +1,339 @@ +From: Felix Fietkau +Date: Sun, 26 Jul 2020 14:03:21 +0200 +Subject: [PATCH] net: add support for threaded NAPI polling + +For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI +poll function does not perform well. Since NAPI poll is bound to the CPU it +was scheduled from, we can easily end up with a few very busy CPUs spending +most of their time in softirq/ksoftirqd and some idle ones. + +Introduce threaded NAPI for such drivers based on a workqueue. The API is the +same except for using netif_threaded_napi_add instead of netif_napi_add. + +In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling +improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded +NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling +thread. + +With threaded NAPI it seems stable and consistent (and higher than the best +results I got without it). + +Based on a patch by Hillf Danton + +Cc: Hillf Danton +Signed-off-by: Felix Fietkau +--- + +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -339,6 +339,7 @@ struct napi_struct { + struct list_head dev_list; + struct hlist_node napi_hash_node; + unsigned int napi_id; ++ struct work_struct work; + }; + + enum { +@@ -349,6 +350,7 @@ enum { + NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ + NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ ++ NAPI_STATE_THREADED, /* Use threaded NAPI */ + }; + + enum { +@@ -359,6 +361,7 @@ enum { + NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), ++ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), + }; + + enum gro_result { +@@ -2230,6 +2233,26 @@ void netif_napi_add(struct net_device *d + int (*poll)(struct napi_struct *, int), int weight); + + /** ++ * netif_threaded_napi_add - initialize a NAPI context ++ * @dev: network device ++ * @napi: NAPI context ++ * @poll: polling function ++ * @weight: default weight ++ * ++ * This variant of netif_napi_add() should be used from drivers using NAPI ++ * with CPU intensive poll functions. ++ * This will schedule polling from a high priority workqueue ++ */ ++static inline void netif_threaded_napi_add(struct net_device *dev, ++ struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), ++ int weight) ++{ ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ netif_napi_add(dev, napi, poll, weight); ++} ++ ++/** + * netif_tx_napi_add - initialize a NAPI context + * @dev: network device + * @napi: NAPI context +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -160,6 +160,7 @@ static DEFINE_SPINLOCK(offload_lock); + struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; + struct list_head ptype_all __read_mostly; /* Taps */ + static struct list_head offload_base __read_mostly; ++static struct workqueue_struct *napi_workq __read_mostly; + + static int netif_rx_internal(struct sk_buff *skb); + static int call_netdevice_notifiers_info(unsigned long val, +@@ -5891,6 +5892,11 @@ void __napi_schedule(struct napi_struct + { + unsigned long flags; + ++ if (test_bit(NAPI_STATE_THREADED, &n->state)) { ++ queue_work(napi_workq, &n->work); ++ return; ++ } ++ + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); +@@ -5938,6 +5944,11 @@ EXPORT_SYMBOL(napi_schedule_prep); + */ + void __napi_schedule_irqoff(struct napi_struct *n) + { ++ if (test_bit(NAPI_STATE_THREADED, &n->state)) { ++ queue_work(napi_workq, &n->work); ++ return; ++ } ++ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + } + EXPORT_SYMBOL(__napi_schedule_irqoff); +@@ -6186,6 +6197,82 @@ static void init_gro_hash(struct napi_st + napi->gro_bitmask = 0; + } + ++static int __napi_poll(struct napi_struct *n, bool *repoll) ++{ ++ int work, weight; ++ ++ weight = n->weight; ++ ++ /* This NAPI_STATE_SCHED test is for avoiding a race ++ * with netpoll's poll_napi(). Only the entity which ++ * obtains the lock and sees NAPI_STATE_SCHED set will ++ * actually make the ->poll() call. Therefore we avoid ++ * accidentally calling ->poll() when NAPI is not scheduled. ++ */ ++ work = 0; ++ if (test_bit(NAPI_STATE_SCHED, &n->state)) { ++ work = n->poll(n, weight); ++ trace_napi_poll(n, work, weight); ++ } ++ ++ WARN_ON_ONCE(work > weight); ++ ++ if (likely(work < weight)) ++ return work; ++ ++ /* Drivers must not modify the NAPI state if they ++ * consume the entire weight. In such cases this code ++ * still "owns" the NAPI instance and therefore can ++ * move the instance around on the list at-will. ++ */ ++ if (unlikely(napi_disable_pending(n))) { ++ napi_complete(n); ++ return work; ++ } ++ ++ if (n->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(n, HZ >= 1000); ++ } ++ ++ *repoll = true; ++ ++ return work; ++} ++ ++static void napi_workfn(struct work_struct *work) ++{ ++ struct napi_struct *n = container_of(work, struct napi_struct, work); ++ void *have; ++ ++ for (;;) { ++ bool repoll = false; ++ ++ local_bh_disable(); ++ ++ have = netpoll_poll_lock(n); ++ __napi_poll(n, &repoll); ++ netpoll_poll_unlock(have); ++ ++ local_bh_enable(); ++ ++ if (!repoll) ++ return; ++ ++ if (!need_resched()) ++ continue; ++ ++ /* ++ * have to pay for the latency of task switch even if ++ * napi is scheduled ++ */ ++ queue_work(napi_workq, work); ++ return; ++ } ++} ++ + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) + { +@@ -6204,6 +6291,7 @@ void netif_napi_add(struct net_device *d + #ifdef CONFIG_NETPOLL + napi->poll_owner = -1; + #endif ++ INIT_WORK(&napi->work, napi_workfn); + set_bit(NAPI_STATE_SCHED, &napi->state); + napi_hash_add(napi); + } +@@ -6242,6 +6330,7 @@ static void flush_gro_hash(struct napi_s + void netif_napi_del(struct napi_struct *napi) + { + might_sleep(); ++ cancel_work_sync(&napi->work); + if (napi_hash_del(napi)) + synchronize_net(); + list_del_init(&napi->dev_list); +@@ -6254,48 +6343,18 @@ EXPORT_SYMBOL(netif_napi_del); + + static int napi_poll(struct napi_struct *n, struct list_head *repoll) + { ++ bool do_repoll = false; + void *have; +- int work, weight; ++ int work; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + +- weight = n->weight; +- +- /* This NAPI_STATE_SCHED test is for avoiding a race +- * with netpoll's poll_napi(). Only the entity which +- * obtains the lock and sees NAPI_STATE_SCHED set will +- * actually make the ->poll() call. Therefore we avoid +- * accidentally calling ->poll() when NAPI is not scheduled. +- */ +- work = 0; +- if (test_bit(NAPI_STATE_SCHED, &n->state)) { +- work = n->poll(n, weight); +- trace_napi_poll(n, work, weight); +- } +- +- WARN_ON_ONCE(work > weight); +- +- if (likely(work < weight)) +- goto out_unlock; ++ work = __napi_poll(n, &do_repoll); + +- /* Drivers must not modify the NAPI state if they +- * consume the entire weight. In such cases this code +- * still "owns" the NAPI instance and therefore can +- * move the instance around on the list at-will. +- */ +- if (unlikely(napi_disable_pending(n))) { +- napi_complete(n); ++ if (!do_repoll) + goto out_unlock; +- } +- +- if (n->gro_bitmask) { +- /* flush too old packets +- * If HZ < 1000, flush all packets. +- */ +- napi_gro_flush(n, HZ >= 1000); +- } + + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. +@@ -9895,6 +9954,10 @@ static int __init net_dev_init(void) + sd->backlog.weight = weight_p; + } + ++ napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI, ++ WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS); ++ BUG_ON(!napi_workq); ++ + dev_boot_phase = 0; + + /* The loopback device is special if any other network devices +--- a/net/core/net-sysfs.c ++++ b/net/core/net-sysfs.c +@@ -447,6 +447,52 @@ static ssize_t proto_down_store(struct d + } + NETDEVICE_SHOW_RW(proto_down, fmt_dec); + ++static int change_napi_threaded(struct net_device *dev, unsigned long val) ++{ ++ struct napi_struct *napi; ++ ++ if (list_empty(&dev->napi_list)) ++ return -EOPNOTSUPP; ++ ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (val) ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ else ++ clear_bit(NAPI_STATE_THREADED, &napi->state); ++ } ++ ++ return 0; ++} ++ ++static ssize_t napi_threaded_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t len) ++{ ++ return netdev_store(dev, attr, buf, len, change_napi_threaded); ++} ++ ++static ssize_t napi_threaded_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ struct net_device *netdev = to_net_dev(dev); ++ struct napi_struct *napi; ++ bool enabled = false; ++ ++ if (!rtnl_trylock()) ++ return restart_syscall(); ++ ++ list_for_each_entry(napi, &netdev->napi_list, dev_list) { ++ if (test_bit(NAPI_STATE_THREADED, &napi->state)) ++ enabled = true; ++ } ++ ++ rtnl_unlock(); ++ ++ return sprintf(buf, fmt_dec, enabled); ++} ++static DEVICE_ATTR_RW(napi_threaded); ++ + static ssize_t phys_port_id_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -542,6 +588,7 @@ static struct attribute *net_class_attrs + &dev_attr_flags.attr, + &dev_attr_tx_queue_len.attr, + &dev_attr_gro_flush_timeout.attr, ++ &dev_attr_napi_threaded.attr, + &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, + &dev_attr_phys_switch_id.attr, diff --git a/target/linux/generic/pending-5.4/690-net-add-support-for-threaded-NAPI-polling.patch b/target/linux/generic/pending-5.4/690-net-add-support-for-threaded-NAPI-polling.patch new file mode 100644 index 0000000000..0fd837d45e --- /dev/null +++ b/target/linux/generic/pending-5.4/690-net-add-support-for-threaded-NAPI-polling.patch @@ -0,0 +1,343 @@ +From: Felix Fietkau +Date: Sun, 26 Jul 2020 14:03:21 +0200 +Subject: [PATCH] net: add support for threaded NAPI polling + +For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI +poll function does not perform well. Since NAPI poll is bound to the CPU it +was scheduled from, we can easily end up with a few very busy CPUs spending +most of their time in softirq/ksoftirqd and some idle ones. + +Introduce threaded NAPI for such drivers based on a workqueue. The API is the +same except for using netif_threaded_napi_add instead of netif_napi_add. + +In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling +improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded +NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling +thread. + +With threaded NAPI it seems stable and consistent (and higher than the best +results I got without it). + +Based on a patch by Hillf Danton + +Cc: Hillf Danton +Signed-off-by: Felix Fietkau +--- + +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -340,6 +340,7 @@ struct napi_struct { + struct list_head dev_list; + struct hlist_node napi_hash_node; + unsigned int napi_id; ++ struct work_struct work; + }; + + enum { +@@ -350,6 +351,7 @@ enum { + NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ + NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ ++ NAPI_STATE_THREADED, /* Use threaded NAPI */ + }; + + enum { +@@ -360,6 +362,7 @@ enum { + NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), ++ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), + }; + + enum gro_result { +@@ -2249,6 +2252,26 @@ void netif_napi_add(struct net_device *d + int (*poll)(struct napi_struct *, int), int weight); + + /** ++ * netif_threaded_napi_add - initialize a NAPI context ++ * @dev: network device ++ * @napi: NAPI context ++ * @poll: polling function ++ * @weight: default weight ++ * ++ * This variant of netif_napi_add() should be used from drivers using NAPI ++ * with CPU intensive poll functions. ++ * This will schedule polling from a high priority workqueue ++ */ ++static inline void netif_threaded_napi_add(struct net_device *dev, ++ struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), ++ int weight) ++{ ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ netif_napi_add(dev, napi, poll, weight); ++} ++ ++/** + * netif_tx_napi_add - initialize a NAPI context + * @dev: network device + * @napi: NAPI context +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -156,6 +156,7 @@ static DEFINE_SPINLOCK(offload_lock); + struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; + struct list_head ptype_all __read_mostly; /* Taps */ + static struct list_head offload_base __read_mostly; ++static struct workqueue_struct *napi_workq __read_mostly; + + static int netif_rx_internal(struct sk_buff *skb); + static int call_netdevice_notifiers_info(unsigned long val, +@@ -5910,6 +5911,11 @@ void __napi_schedule(struct napi_struct + { + unsigned long flags; + ++ if (test_bit(NAPI_STATE_THREADED, &n->state)) { ++ queue_work(napi_workq, &n->work); ++ return; ++ } ++ + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); +@@ -5957,6 +5963,11 @@ EXPORT_SYMBOL(napi_schedule_prep); + */ + void __napi_schedule_irqoff(struct napi_struct *n) + { ++ if (test_bit(NAPI_STATE_THREADED, &n->state)) { ++ queue_work(napi_workq, &n->work); ++ return; ++ } ++ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + } + EXPORT_SYMBOL(__napi_schedule_irqoff); +@@ -6218,6 +6229,84 @@ static void init_gro_hash(struct napi_st + napi->gro_bitmask = 0; + } + ++static int __napi_poll(struct napi_struct *n, bool *repoll) ++{ ++ int work, weight; ++ ++ weight = n->weight; ++ ++ /* This NAPI_STATE_SCHED test is for avoiding a race ++ * with netpoll's poll_napi(). Only the entity which ++ * obtains the lock and sees NAPI_STATE_SCHED set will ++ * actually make the ->poll() call. Therefore we avoid ++ * accidentally calling ->poll() when NAPI is not scheduled. ++ */ ++ work = 0; ++ if (test_bit(NAPI_STATE_SCHED, &n->state)) { ++ work = n->poll(n, weight); ++ trace_napi_poll(n, work, weight); ++ } ++ ++ WARN_ON_ONCE(work > weight); ++ ++ if (likely(work < weight)) ++ return work; ++ ++ /* Drivers must not modify the NAPI state if they ++ * consume the entire weight. In such cases this code ++ * still "owns" the NAPI instance and therefore can ++ * move the instance around on the list at-will. ++ */ ++ if (unlikely(napi_disable_pending(n))) { ++ napi_complete(n); ++ return work; ++ } ++ ++ if (n->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(n, HZ >= 1000); ++ } ++ ++ gro_normal_list(n); ++ ++ *repoll = true; ++ ++ return work; ++} ++ ++static void napi_workfn(struct work_struct *work) ++{ ++ struct napi_struct *n = container_of(work, struct napi_struct, work); ++ void *have; ++ ++ for (;;) { ++ bool repoll = false; ++ ++ local_bh_disable(); ++ ++ have = netpoll_poll_lock(n); ++ __napi_poll(n, &repoll); ++ netpoll_poll_unlock(have); ++ ++ local_bh_enable(); ++ ++ if (!repoll) ++ return; ++ ++ if (!need_resched()) ++ continue; ++ ++ /* ++ * have to pay for the latency of task switch even if ++ * napi is scheduled ++ */ ++ queue_work(napi_workq, work); ++ return; ++ } ++} ++ + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) + { +@@ -6238,6 +6327,7 @@ void netif_napi_add(struct net_device *d + #ifdef CONFIG_NETPOLL + napi->poll_owner = -1; + #endif ++ INIT_WORK(&napi->work, napi_workfn); + set_bit(NAPI_STATE_SCHED, &napi->state); + napi_hash_add(napi); + } +@@ -6276,6 +6366,7 @@ static void flush_gro_hash(struct napi_s + void netif_napi_del(struct napi_struct *napi) + { + might_sleep(); ++ cancel_work_sync(&napi->work); + if (napi_hash_del(napi)) + synchronize_net(); + list_del_init(&napi->dev_list); +@@ -6288,50 +6379,18 @@ EXPORT_SYMBOL(netif_napi_del); + + static int napi_poll(struct napi_struct *n, struct list_head *repoll) + { ++ bool do_repoll = false; + void *have; +- int work, weight; ++ int work; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + +- weight = n->weight; +- +- /* This NAPI_STATE_SCHED test is for avoiding a race +- * with netpoll's poll_napi(). Only the entity which +- * obtains the lock and sees NAPI_STATE_SCHED set will +- * actually make the ->poll() call. Therefore we avoid +- * accidentally calling ->poll() when NAPI is not scheduled. +- */ +- work = 0; +- if (test_bit(NAPI_STATE_SCHED, &n->state)) { +- work = n->poll(n, weight); +- trace_napi_poll(n, work, weight); +- } +- +- WARN_ON_ONCE(work > weight); +- +- if (likely(work < weight)) +- goto out_unlock; ++ work = __napi_poll(n, &do_repoll); + +- /* Drivers must not modify the NAPI state if they +- * consume the entire weight. In such cases this code +- * still "owns" the NAPI instance and therefore can +- * move the instance around on the list at-will. +- */ +- if (unlikely(napi_disable_pending(n))) { +- napi_complete(n); ++ if (!do_repoll) + goto out_unlock; +- } +- +- if (n->gro_bitmask) { +- /* flush too old packets +- * If HZ < 1000, flush all packets. +- */ +- napi_gro_flush(n, HZ >= 1000); +- } +- +- gro_normal_list(n); + + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. +@@ -10264,6 +10323,10 @@ static int __init net_dev_init(void) + sd->backlog.weight = weight_p; + } + ++ napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI, ++ WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS); ++ BUG_ON(!napi_workq); ++ + dev_boot_phase = 0; + + /* The loopback device is special if any other network devices +--- a/net/core/net-sysfs.c ++++ b/net/core/net-sysfs.c +@@ -442,6 +442,52 @@ static ssize_t proto_down_store(struct d + } + NETDEVICE_SHOW_RW(proto_down, fmt_dec); + ++static int change_napi_threaded(struct net_device *dev, unsigned long val) ++{ ++ struct napi_struct *napi; ++ ++ if (list_empty(&dev->napi_list)) ++ return -EOPNOTSUPP; ++ ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (val) ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ else ++ clear_bit(NAPI_STATE_THREADED, &napi->state); ++ } ++ ++ return 0; ++} ++ ++static ssize_t napi_threaded_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t len) ++{ ++ return netdev_store(dev, attr, buf, len, change_napi_threaded); ++} ++ ++static ssize_t napi_threaded_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ struct net_device *netdev = to_net_dev(dev); ++ struct napi_struct *napi; ++ bool enabled = false; ++ ++ if (!rtnl_trylock()) ++ return restart_syscall(); ++ ++ list_for_each_entry(napi, &netdev->napi_list, dev_list) { ++ if (test_bit(NAPI_STATE_THREADED, &napi->state)) ++ enabled = true; ++ } ++ ++ rtnl_unlock(); ++ ++ return sprintf(buf, fmt_dec, enabled); ++} ++static DEVICE_ATTR_RW(napi_threaded); ++ + static ssize_t phys_port_id_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -532,6 +578,7 @@ static struct attribute *net_class_attrs + &dev_attr_flags.attr, + &dev_attr_tx_queue_len.attr, + &dev_attr_gro_flush_timeout.attr, ++ &dev_attr_napi_threaded.attr, + &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, + &dev_attr_phys_switch_id.attr,