1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Mon, 22 Feb 2021 17:25:48 +0100
4 Subject: [PATCH] wireguard: queueing: get rid of per-peer ring buffers
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
9 commit 8b5553ace83cced775eefd0f3f18b5c6214ccf7a upstream.
11 Having two ring buffers per-peer means that every peer results in two
12 massive ring allocations. On an 8-core x86_64 machine, this commit
13 reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which
14 is an 90% reduction. Ninety percent! With some single-machine
15 deployments approaching 500,000 peers, we're talking about a reduction
16 from 7 gigs of memory down to 700 megs of memory.
18 In order to get rid of these per-peer allocations, this commit switches
19 to using a list-based queueing approach. Currently GSO fragments are
20 chained together using the skb->next pointer (the skb_list_* singly
21 linked list approach), so we form the per-peer queue around the unused
22 skb->prev pointer (which sort of makes sense because the links are
23 pointing backwards). Use of skb_queue_* is not possible here, because
24 that is based on doubly linked lists and spinlocks. Multiple cores can
25 write into the queue at any given time, because its writes occur in the
26 start_xmit path or in the udp_recv path. But reads happen in a single
27 workqueue item per-peer, amounting to a multi-producer, single-consumer
30 The MPSC queue is implemented locklessly and never blocks. However, it
31 is not linearizable (though it is serializable), with a very tight and
32 unlikely race on writes, which, when hit (some tiny fraction of the
33 0.15% of partial adds on a fully loaded 16-core x86_64 system), causes
34 the queue reader to terminate early. However, because every packet sent
35 queues up the same workqueue item after it is fully added, the worker
36 resumes again, and stopping early isn't actually a problem, since at
37 that point the packet wouldn't have yet been added to the encryption
38 queue. These properties allow us to avoid disabling interrupts or
39 spinning. The design is based on Dmitry Vyukov's algorithm [1].
41 Performance-wise, ordinarily list-based queues aren't preferable to
42 ringbuffers, because of cache misses when following pointers around.
43 However, we *already* have to follow the adjacent pointers when working
44 through fragments, so there shouldn't actually be any change there. A
45 potential downside is that dequeueing is a bit more complicated, but the
46 ptr_ring structure used prior had a spinlock when dequeueing, so all and
47 all the difference appears to be a wash.
49 Actually, from profiling, the biggest performance hit, by far, of this
50 commit winds up being atomic_add_unless(count, 1, max) and atomic_
51 dec(count), which account for the majority of CPU time, according to
52 perf. In that sense, the previous ring buffer was superior in that it
53 could check if it was full by head==tail, which the list-based approach
56 But all and all, this enables us to get massive memory savings, allowing
57 WireGuard to scale for real world deployments, without taking much of a
60 [1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue
62 Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
63 Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
64 Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
65 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
66 Signed-off-by: Jakub Kicinski <kuba@kernel.org>
67 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
69 drivers/net/wireguard/device.c | 12 ++---
70 drivers/net/wireguard/device.h | 15 +++---
71 drivers/net/wireguard/peer.c | 28 ++++-------
72 drivers/net/wireguard/peer.h | 4 +-
73 drivers/net/wireguard/queueing.c | 86 +++++++++++++++++++++++++-------
74 drivers/net/wireguard/queueing.h | 45 ++++++++++++-----
75 drivers/net/wireguard/receive.c | 16 +++---
76 drivers/net/wireguard/send.c | 31 ++++--------
77 8 files changed, 144 insertions(+), 93 deletions(-)
79 --- a/drivers/net/wireguard/device.c
80 +++ b/drivers/net/wireguard/device.c
81 @@ -235,8 +235,8 @@ static void wg_destruct(struct net_devic
82 destroy_workqueue(wg->handshake_receive_wq);
83 destroy_workqueue(wg->handshake_send_wq);
84 destroy_workqueue(wg->packet_crypt_wq);
85 - wg_packet_queue_free(&wg->decrypt_queue, true);
86 - wg_packet_queue_free(&wg->encrypt_queue, true);
87 + wg_packet_queue_free(&wg->decrypt_queue);
88 + wg_packet_queue_free(&wg->encrypt_queue);
89 rcu_barrier(); /* Wait for all the peers to be actually freed. */
90 wg_ratelimiter_uninit();
91 memzero_explicit(&wg->static_identity, sizeof(wg->static_identity));
92 @@ -338,12 +338,12 @@ static int wg_newlink(struct net *src_ne
93 goto err_destroy_handshake_send;
95 ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker,
96 - true, MAX_QUEUED_PACKETS);
97 + MAX_QUEUED_PACKETS);
99 goto err_destroy_packet_crypt;
101 ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker,
102 - true, MAX_QUEUED_PACKETS);
103 + MAX_QUEUED_PACKETS);
105 goto err_free_encrypt_queue;
107 @@ -368,9 +368,9 @@ static int wg_newlink(struct net *src_ne
108 err_uninit_ratelimiter:
109 wg_ratelimiter_uninit();
110 err_free_decrypt_queue:
111 - wg_packet_queue_free(&wg->decrypt_queue, true);
112 + wg_packet_queue_free(&wg->decrypt_queue);
113 err_free_encrypt_queue:
114 - wg_packet_queue_free(&wg->encrypt_queue, true);
115 + wg_packet_queue_free(&wg->encrypt_queue);
116 err_destroy_packet_crypt:
117 destroy_workqueue(wg->packet_crypt_wq);
118 err_destroy_handshake_send:
119 --- a/drivers/net/wireguard/device.h
120 +++ b/drivers/net/wireguard/device.h
121 @@ -27,13 +27,14 @@ struct multicore_worker {
124 struct ptr_ring ring;
127 - struct multicore_worker __percpu *worker;
130 - struct work_struct work;
132 + struct multicore_worker __percpu *worker;
137 + struct sk_buff *head, *tail, *peeked;
138 + struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff.
143 --- a/drivers/net/wireguard/peer.c
144 +++ b/drivers/net/wireguard/peer.c
145 @@ -32,27 +32,22 @@ struct wg_peer *wg_peer_create(struct wg
146 peer = kzalloc(sizeof(*peer), GFP_KERNEL);
150 + if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
154 wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
155 public_key, preshared_key, peer);
156 - if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
158 - if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false,
159 - MAX_QUEUED_PACKETS))
161 - if (wg_packet_queue_init(&peer->rx_queue, NULL, false,
162 - MAX_QUEUED_PACKETS))
165 peer->internal_id = atomic64_inc_return(&peer_counter);
166 peer->serial_work_cpu = nr_cpumask_bits;
167 wg_cookie_init(&peer->latest_cookie);
168 wg_timers_init(peer);
169 wg_cookie_checker_precompute_peer_keys(peer);
170 spin_lock_init(&peer->keypairs.keypair_update_lock);
171 - INIT_WORK(&peer->transmit_handshake_work,
172 - wg_packet_handshake_send_worker);
173 + INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker);
174 + INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker);
175 + wg_prev_queue_init(&peer->tx_queue);
176 + wg_prev_queue_init(&peer->rx_queue);
177 rwlock_init(&peer->endpoint_lock);
178 kref_init(&peer->refcount);
179 skb_queue_head_init(&peer->staged_packet_queue);
180 @@ -68,11 +63,7 @@ struct wg_peer *wg_peer_create(struct wg
181 pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id);
185 - wg_packet_queue_free(&peer->tx_queue, false);
187 - dst_cache_destroy(&peer->endpoint_cache);
193 @@ -197,8 +188,7 @@ static void rcu_release(struct rcu_head
194 struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu);
196 dst_cache_destroy(&peer->endpoint_cache);
197 - wg_packet_queue_free(&peer->rx_queue, false);
198 - wg_packet_queue_free(&peer->tx_queue, false);
199 + WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue));
201 /* The final zeroing takes care of clearing any remaining handshake key
202 * material and other potentially sensitive information.
203 --- a/drivers/net/wireguard/peer.h
204 +++ b/drivers/net/wireguard/peer.h
205 @@ -36,7 +36,7 @@ struct endpoint {
208 struct wg_device *device;
209 - struct crypt_queue tx_queue, rx_queue;
210 + struct prev_queue tx_queue, rx_queue;
211 struct sk_buff_head staged_packet_queue;
214 @@ -46,7 +46,7 @@ struct wg_peer {
215 rwlock_t endpoint_lock;
216 struct noise_handshake handshake;
217 atomic64_t last_sent_handshake;
218 - struct work_struct transmit_handshake_work, clear_peer_work;
219 + struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work;
220 struct cookie latest_cookie;
221 struct hlist_node pubkey_hash;
222 u64 rx_bytes, tx_bytes;
223 --- a/drivers/net/wireguard/queueing.c
224 +++ b/drivers/net/wireguard/queueing.c
225 @@ -9,8 +9,7 @@ struct multicore_worker __percpu *
226 wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr)
229 - struct multicore_worker __percpu *worker =
230 - alloc_percpu(struct multicore_worker);
231 + struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker);
235 @@ -23,7 +22,7 @@ wg_packet_percpu_multicore_worker_alloc(
238 int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
239 - bool multicore, unsigned int len)
244 @@ -31,25 +30,78 @@ int wg_packet_queue_init(struct crypt_qu
245 ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL);
250 - queue->worker = wg_packet_percpu_multicore_worker_alloc(
252 - if (!queue->worker) {
253 - ptr_ring_cleanup(&queue->ring, NULL);
257 - INIT_WORK(&queue->work, function);
259 + queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue);
260 + if (!queue->worker) {
261 + ptr_ring_cleanup(&queue->ring, NULL);
267 -void wg_packet_queue_free(struct crypt_queue *queue, bool multicore)
268 +void wg_packet_queue_free(struct crypt_queue *queue)
271 - free_percpu(queue->worker);
272 + free_percpu(queue->worker);
273 WARN_ON(!__ptr_ring_empty(&queue->ring));
274 ptr_ring_cleanup(&queue->ring, NULL);
277 +#define NEXT(skb) ((skb)->prev)
278 +#define STUB(queue) ((struct sk_buff *)&queue->empty)
280 +void wg_prev_queue_init(struct prev_queue *queue)
282 + NEXT(STUB(queue)) = NULL;
283 + queue->head = queue->tail = STUB(queue);
284 + queue->peeked = NULL;
285 + atomic_set(&queue->count, 0);
287 + offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) -
288 + offsetof(struct prev_queue, empty) ||
289 + offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) -
290 + offsetof(struct prev_queue, empty));
293 +static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
295 + WRITE_ONCE(NEXT(skb), NULL);
296 + WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb);
299 +bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
301 + if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS))
303 + __wg_prev_queue_enqueue(queue, skb);
307 +struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue)
309 + struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail));
311 + if (tail == STUB(queue)) {
314 + queue->tail = next;
316 + next = smp_load_acquire(&NEXT(next));
319 + queue->tail = next;
320 + atomic_dec(&queue->count);
323 + if (tail != READ_ONCE(queue->head))
325 + __wg_prev_queue_enqueue(queue, STUB(queue));
326 + next = smp_load_acquire(&NEXT(tail));
328 + queue->tail = next;
329 + atomic_dec(&queue->count);
337 --- a/drivers/net/wireguard/queueing.h
338 +++ b/drivers/net/wireguard/queueing.h
339 @@ -17,12 +17,13 @@ struct wg_device;
341 struct multicore_worker;
346 /* queueing.c APIs: */
347 int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
348 - bool multicore, unsigned int len);
349 -void wg_packet_queue_free(struct crypt_queue *queue, bool multicore);
351 +void wg_packet_queue_free(struct crypt_queue *queue);
352 struct multicore_worker __percpu *
353 wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);
355 @@ -135,8 +136,31 @@ static inline int wg_cpumask_next_online
359 +void wg_prev_queue_init(struct prev_queue *queue);
361 +/* Multi producer */
362 +bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb);
364 +/* Single consumer */
365 +struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue);
367 +/* Single consumer */
368 +static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue)
371 + return queue->peeked;
372 + queue->peeked = wg_prev_queue_dequeue(queue);
373 + return queue->peeked;
376 +/* Single consumer */
377 +static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
379 + queue->peeked = NULL;
382 static inline int wg_queue_enqueue_per_device_and_peer(
383 - struct crypt_queue *device_queue, struct crypt_queue *peer_queue,
384 + struct crypt_queue *device_queue, struct prev_queue *peer_queue,
385 struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
388 @@ -145,8 +169,9 @@ static inline int wg_queue_enqueue_per_d
389 /* We first queue this up for the peer ingestion, but the consumer
390 * will wait for the state to change to CRYPTED or DEAD before.
392 - if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb)))
393 + if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb)))
396 /* Then we queue it up in the device queue, which consumes the
397 * packet as soon as it can.
399 @@ -157,9 +182,7 @@ static inline int wg_queue_enqueue_per_d
403 -static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue,
404 - struct sk_buff *skb,
405 - enum packet_state state)
406 +static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state)
408 /* We take a reference, because as soon as we call atomic_set, the
409 * peer can be freed from below us.
410 @@ -167,14 +190,12 @@ static inline void wg_queue_enqueue_per_
411 struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
413 atomic_set_release(&PACKET_CB(skb)->state, state);
414 - queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu,
415 - peer->internal_id),
416 - peer->device->packet_crypt_wq, &queue->work);
417 + queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
418 + peer->device->packet_crypt_wq, &peer->transmit_packet_work);
422 -static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb,
423 - enum packet_state state)
424 +static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state)
426 /* We take a reference, because as soon as we call atomic_set, the
427 * peer can be freed from below us.
428 --- a/drivers/net/wireguard/receive.c
429 +++ b/drivers/net/wireguard/receive.c
430 @@ -444,7 +444,6 @@ packet_processed:
431 int wg_packet_rx_poll(struct napi_struct *napi, int budget)
433 struct wg_peer *peer = container_of(napi, struct wg_peer, napi);
434 - struct crypt_queue *queue = &peer->rx_queue;
435 struct noise_keypair *keypair;
436 struct endpoint endpoint;
437 enum packet_state state;
438 @@ -455,11 +454,10 @@ int wg_packet_rx_poll(struct napi_struct
439 if (unlikely(budget <= 0))
442 - while ((skb = __ptr_ring_peek(&queue->ring)) != NULL &&
443 + while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL &&
444 (state = atomic_read_acquire(&PACKET_CB(skb)->state)) !=
445 PACKET_STATE_UNCRYPTED) {
446 - __ptr_ring_discard_one(&queue->ring);
447 - peer = PACKET_PEER(skb);
448 + wg_prev_queue_drop_peeked(&peer->rx_queue);
449 keypair = PACKET_CB(skb)->keypair;
452 @@ -508,7 +506,7 @@ void wg_packet_decrypt_worker(struct wor
453 enum packet_state state =
454 likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ?
455 PACKET_STATE_CRYPTED : PACKET_STATE_DEAD;
456 - wg_queue_enqueue_per_peer_napi(skb, state);
457 + wg_queue_enqueue_per_peer_rx(skb, state);
461 @@ -531,12 +529,10 @@ static void wg_packet_consume_data(struc
462 if (unlikely(READ_ONCE(peer->is_dead)))
465 - ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue,
466 - &peer->rx_queue, skb,
467 - wg->packet_crypt_wq,
468 - &wg->decrypt_queue.last_cpu);
469 + ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb,
470 + wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu);
471 if (unlikely(ret == -EPIPE))
472 - wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD);
473 + wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD);
474 if (likely(!ret || ret == -EPIPE)) {
475 rcu_read_unlock_bh();
477 --- a/drivers/net/wireguard/send.c
478 +++ b/drivers/net/wireguard/send.c
479 @@ -239,8 +239,7 @@ void wg_packet_send_keepalive(struct wg_
480 wg_packet_send_staged_packets(peer);
483 -static void wg_packet_create_data_done(struct sk_buff *first,
484 - struct wg_peer *peer)
485 +static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first)
487 struct sk_buff *skb, *next;
488 bool is_keepalive, data_sent = false;
489 @@ -262,22 +261,19 @@ static void wg_packet_create_data_done(s
491 void wg_packet_tx_worker(struct work_struct *work)
493 - struct crypt_queue *queue = container_of(work, struct crypt_queue,
495 + struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work);
496 struct noise_keypair *keypair;
497 enum packet_state state;
498 struct sk_buff *first;
499 - struct wg_peer *peer;
501 - while ((first = __ptr_ring_peek(&queue->ring)) != NULL &&
502 + while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL &&
503 (state = atomic_read_acquire(&PACKET_CB(first)->state)) !=
504 PACKET_STATE_UNCRYPTED) {
505 - __ptr_ring_discard_one(&queue->ring);
506 - peer = PACKET_PEER(first);
507 + wg_prev_queue_drop_peeked(&peer->tx_queue);
508 keypair = PACKET_CB(first)->keypair;
510 if (likely(state == PACKET_STATE_CRYPTED))
511 - wg_packet_create_data_done(first, peer);
512 + wg_packet_create_data_done(peer, first);
514 kfree_skb_list(first);
516 @@ -306,16 +302,14 @@ void wg_packet_encrypt_worker(struct wor
520 - wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first,
522 + wg_queue_enqueue_per_peer_tx(first, state);
528 -static void wg_packet_create_data(struct sk_buff *first)
529 +static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first)
531 - struct wg_peer *peer = PACKET_PEER(first);
532 struct wg_device *wg = peer->device;
535 @@ -323,13 +317,10 @@ static void wg_packet_create_data(struct
536 if (unlikely(READ_ONCE(peer->is_dead)))
539 - ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue,
540 - &peer->tx_queue, first,
541 - wg->packet_crypt_wq,
542 - &wg->encrypt_queue.last_cpu);
543 + ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first,
544 + wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu);
545 if (unlikely(ret == -EPIPE))
546 - wg_queue_enqueue_per_peer(&peer->tx_queue, first,
547 - PACKET_STATE_DEAD);
548 + wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD);
550 rcu_read_unlock_bh();
551 if (likely(!ret || ret == -EPIPE))
552 @@ -393,7 +384,7 @@ void wg_packet_send_staged_packets(struc
553 packets.prev->next = NULL;
554 wg_peer_get(keypair->entry.peer);
555 PACKET_CB(packets.next)->keypair = keypair;
556 - wg_packet_create_data(packets.next);
557 + wg_packet_create_data(peer, packets.next);