From: Wei Wang <weiwan@google.com>
Date: Mon, 8 Feb 2021 11:34:09 -0800
Subject: [PATCH] net: implement threaded-able napi poll loop support
This patch allows running each napi poll loop inside its own
kernel thread.
The kthread is created during netif_napi_add() if dev->threaded
is set. Threaded mode is then enabled in napi_enable(). We will
provide a way to set dev->threaded and enable threaded mode
without a device up/down in the following patch.

Once threaded mode is enabled and the kthread is started,
napi_schedule() will wake up that thread instead of scheduling
the softirq.

The threaded poll loop behaves much like net_rx_action(), but it
does not have to manipulate local irqs and uses an explicit
scheduling point based on netdev_budget.

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
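---
Note (not part of the commit message): below is a minimal sketch of how
a driver could opt in to threaded napi once this patch is applied. The
driver-side names (struct my_priv, my_poll, my_setup_napi) are
hypothetical; only dev->threaded, netif_napi_add() and napi_enable()
come from this patch and the existing napi API.

    #include <linux/netdevice.h>

    struct my_priv {
            struct napi_struct napi;
    };

    /* Standard napi poll callback; budget semantics are unchanged
     * by this patch.
     */
    static int my_poll(struct napi_struct *napi, int budget)
    {
            int work_done = 0;

            /* ... process up to budget rx packets ... */

            if (work_done < budget)
                    napi_complete_done(napi, work_done);
            return work_done;
    }

    static void my_setup_napi(struct net_device *dev,
                              struct my_priv *priv)
    {
            /* Set dev->threaded before netif_napi_add(): the napi
             * kthread is created there, and napi_enable() then sets
             * NAPI_STATE_THREADED because the kthread exists.
             */
            dev->threaded = 1;
            netif_napi_add(dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
            napi_enable(&priv->napi);       /* typically from ndo_open */
    }

With this, the poll loop runs in a kthread named "napi/<dev>-<id>" (see
the kthread_run() format string below) instead of in NET_RX_SOFTIRQ.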
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
+	struct task_struct	*thread;
 };
 
 enum {
@@ -357,6 +358,7 @@ enum {
 	NAPI_STATE_LISTED,	/* NAPI added to system lists */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_THREADED,	/* The poll is performed inside its own thread */
 };
 
 enum {
@@ -367,6 +369,7 @@ enum {
 	NAPIF_STATE_LISTED	 = BIT(NAPI_STATE_LISTED),
 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
 };
@@ -497,20 +500,7 @@ static inline bool napi_complete(struct napi_struct *n)
 
 void napi_disable(struct napi_struct *n);
 
-/**
- *	napi_enable - enable NAPI scheduling
- *	@n: NAPI context
- *
- *	Resume NAPI from being scheduled on this context.
- *	Must be paired with napi_disable.
- */
-static inline void napi_enable(struct napi_struct *n)
-{
-	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-	smp_mb__before_atomic();
-	clear_bit(NAPI_STATE_SCHED, &n->state);
-	clear_bit(NAPI_STATE_NPSVC, &n->state);
-}
+void napi_enable(struct napi_struct *n);
 
 /**
  *	napi_synchronize - wait until NAPI is not running
@@ -1842,6 +1832,8 @@ enum netdev_ml_priv_type {
  *	@wol_enabled:	Wake-on-LAN is enabled
  *
+ *	@threaded:	napi threaded mode is enabled
+ *
  *	@net_notifier_list:	List of per-net netdev notifier block
  *				that follow this device when it is moved
  *				to another network namespace.
@@ -2161,6 +2153,7 @@ struct net_device {
 	struct lock_class_key	*qdisc_running_key;
 	bool			proto_down;
 	unsigned		wol_enabled:1;
+	unsigned		threaded:1;
 
 	struct list_head	net_notifier_list;
--- a/net/core/dev.c
+++ b/net/core/dev.c
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/skbuff.h>
+#include <linux/kthread.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
 #include <net/net_namespace.h>
@@ -1500,6 +1501,27 @@ void netdev_notify_peers(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_notify_peers);
 
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+	int err = 0;
+
+	/* Create and wake up the kthread once to put it in
+	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
+	 * warning and work with loadavg.
+	 */
+	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+				n->dev->name, n->napi_id);
+	if (IS_ERR(n->thread)) {
+		err = PTR_ERR(n->thread);
+		pr_err("kthread_run failed with err %d\n", err);
+		n->thread = NULL;
+	}
+
+	return err;
+}
+
 static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
@@ -4255,6 +4277,21 @@ int gro_normal_batch __read_mostly = 8;
 static inline void ____napi_schedule(struct softnet_data *sd,
 				     struct napi_struct *napi)
 {
+	struct task_struct *thread;
+
+	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+		/* Paired with smp_mb__before_atomic() in
+		 * napi_enable(). Use READ_ONCE() to guarantee
+		 * a complete read on napi->thread. Only call
+		 * wake_up_process() when it's not NULL.
+		 */
+		thread = READ_ONCE(napi->thread);
+		if (thread) {
+			wake_up_process(thread);
+			return;
+		}
+	}
+
 	list_add_tail(&napi->poll_list, &sd->poll_list);
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
@@ -6746,6 +6783,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
 	napi_hash_add(napi);
+	/* Create kthread for this napi if dev->threaded is set.
+	 * Clear dev->threaded if kthread creation failed so that
+	 * threaded mode will not be enabled in napi_enable().
+	 */
+	if (dev->threaded && napi_kthread_create(napi))
+		dev->threaded = 0;
 }
 EXPORT_SYMBOL(netif_napi_add);
@@ -6762,9 +6805,28 @@ void napi_disable(struct napi_struct *n)
 	hrtimer_cancel(&n->timer);
 
 	clear_bit(NAPI_STATE_DISABLE, &n->state);
+	clear_bit(NAPI_STATE_THREADED, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
 
+/**
+ *	napi_enable - enable NAPI scheduling
+ *	@n: NAPI context
+ *
+ *	Resume NAPI from being scheduled on this context.
+ *	Must be paired with napi_disable.
+ */
+void napi_enable(struct napi_struct *n)
+{
+	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+	smp_mb__before_atomic();
+	clear_bit(NAPI_STATE_SCHED, &n->state);
+	clear_bit(NAPI_STATE_NPSVC, &n->state);
+	if (n->dev->threaded && n->thread)
+		set_bit(NAPI_STATE_THREADED, &n->state);
+}
+EXPORT_SYMBOL(napi_enable);
+
 static void flush_gro_hash(struct napi_struct *napi)
@@ -6790,6 +6852,11 @@ void __netif_napi_del(struct napi_struct *napi)
 	flush_gro_hash(napi);
 	napi->gro_bitmask = 0;
+
+	if (napi->thread) {
+		kthread_stop(napi->thread);
+		napi->thread = NULL;
+	}
 }
 EXPORT_SYMBOL(__netif_napi_del);
@@ -6871,6 +6938,51 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 	return work;
 }
 
+static int napi_thread_wait(struct napi_struct *napi)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+		if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+			WARN_ON(!list_empty(&napi->poll_list));
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
+
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return -1;
+}
+
+static int napi_threaded_poll(void *data)
+{
+	struct napi_struct *napi = data;
+	void *have;
+
+	while (!napi_thread_wait(napi)) {
+		for (;;) {
+			bool repoll = false;
+
+			local_bh_disable();
+
+			have = netpoll_poll_lock(napi);
+			__napi_poll(napi, &repoll);
+			netpoll_poll_unlock(have);
+
+			__kfree_skb_flush();
+			local_bh_enable();
+
+			if (!repoll)
+				break;
+
+			cond_resched();
+		}
+	}
+	return 0;
+}
+
 static __latent_entropy void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
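Note (editor): the ordering contract between napi_enable() and
____napi_schedule() in the hunks above, condensed into one sketch; both
sides are taken from this patch, nothing new is introduced:

    /* Writer side (netif_napi_add() + napi_enable()):
     *
     *	n->thread = kthread_run(...);
     *	...
     *	smp_mb__before_atomic();
     *	set_bit(NAPI_STATE_THREADED, &n->state);
     *
     * Reader side (____napi_schedule()):
     *
     *	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
     *		thread = READ_ONCE(napi->thread);
     *		if (thread) {
     *			wake_up_process(thread);
     *			return;
     *		}
     *	}
     *
     * If the reader observes NAPI_STATE_THREADED, the store to
     * napi->thread is already visible; READ_ONCE() prevents a torn
     * read of the pointer, and the NULL check makes the reader fall
     * back to the softirq path in any remaining race window.
     */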