1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Fri, 4 Jun 2021 17:17:38 +0200
4 Subject: [PATCH] wireguard: allowedips: free empty intermediate nodes when
7 commit bf7b042dc62a31f66d3a41dd4dfc7806f267b307 upstream.
9 When removing single nodes, it's possible that that node's parent is an
10 empty intermediate node, in which case, it too should be removed.
11 Otherwise the trie fills up and never is fully emptied, leading to
12 gradual memory leaks over time for tries that are modified often. There
13 was originally code to do this, but was removed during refactoring in
14 2016 and never reworked. Now that we have proper parent pointers from
15 the previous commits, we can implement this properly.
17 In order to reduce branching and expensive comparisons, we want to keep
18 the double pointer for parent assignment (which lets us easily chain up
19 to the root), but we still need to actually get the parent's base
20 address. So encode the bit number into the last two bits of the pointer,
21 and pack and unpack it as needed. This is a little bit clumsy but is the
22 fastest and less memory wasteful of the compromises. Note that we align
23 the root struct here to a minimum of 4, because it's embedded into a
24 larger struct, and we're relying on having the bottom two bits for our
25 flag, which would only be 16-bit aligned on m68k.
27 The existing macro-based helpers were a bit unwieldy for adding the bit
28 packing to, so this commit replaces them with safer and clearer ordinary
31 We add a test to the randomized/fuzzer part of the selftests, to free
32 the randomized tries by-peer, refuzz it, and repeat, until it's supposed
33 to be empty, and then then see if that actually resulted in the whole
34 thing being emptied. That combined with kmemcheck should hopefully make
35 sure this commit is doing what it should. Along the way this resulted in
36 various other cleanups of the tests and fixes for recent graphviz.
38 Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
39 Cc: stable@vger.kernel.org
40 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
41 Signed-off-by: David S. Miller <davem@davemloft.net>
42 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
44 drivers/net/wireguard/allowedips.c | 102 ++++++------
45 drivers/net/wireguard/allowedips.h | 4 +-
46 drivers/net/wireguard/selftest/allowedips.c | 162 ++++++++++----------
47 3 files changed, 137 insertions(+), 131 deletions(-)
49 --- a/drivers/net/wireguard/allowedips.c
50 +++ b/drivers/net/wireguard/allowedips.c
51 @@ -30,8 +30,11 @@ static void copy_and_assign_cidr(struct
53 memcpy(node->bits, src, bits / 8U);
55 -#define CHOOSE_NODE(parent, key) \
56 - parent->bit[(key[parent->bit_at_a] >> parent->bit_at_b) & 1]
58 +static inline u8 choose(struct allowedips_node *node, const u8 *key)
60 + return (key[node->bit_at_a] >> node->bit_at_b) & 1;
63 static void push_rcu(struct allowedips_node **stack,
64 struct allowedips_node __rcu *p, unsigned int *len)
65 @@ -112,7 +115,7 @@ static struct allowedips_node *find_node
67 if (node->cidr == bits)
69 - node = rcu_dereference_bh(CHOOSE_NODE(node, key));
70 + node = rcu_dereference_bh(node->bit[choose(node, key)]);
74 @@ -144,8 +147,7 @@ static bool node_placement(struct allowe
75 u8 cidr, u8 bits, struct allowedips_node **rnode,
78 - struct allowedips_node *node = rcu_dereference_protected(trie,
79 - lockdep_is_held(lock));
80 + struct allowedips_node *node = rcu_dereference_protected(trie, lockdep_is_held(lock));
81 struct allowedips_node *parent = NULL;
84 @@ -155,13 +157,24 @@ static bool node_placement(struct allowe
88 - node = rcu_dereference_protected(CHOOSE_NODE(parent, key),
89 - lockdep_is_held(lock));
90 + node = rcu_dereference_protected(parent->bit[choose(parent, key)], lockdep_is_held(lock));
96 +static inline void connect_node(struct allowedips_node **parent, u8 bit, struct allowedips_node *node)
98 + node->parent_bit_packed = (unsigned long)parent | bit;
99 + rcu_assign_pointer(*parent, node);
102 +static inline void choose_and_connect_node(struct allowedips_node *parent, struct allowedips_node *node)
104 + u8 bit = choose(parent, node->bits);
105 + connect_node(&parent->bit[bit], bit, node);
108 static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
109 u8 cidr, struct wg_peer *peer, struct mutex *lock)
111 @@ -177,8 +190,7 @@ static int add(struct allowedips_node __
112 RCU_INIT_POINTER(node->peer, peer);
113 list_add_tail(&node->peer_list, &peer->allowedips_list);
114 copy_and_assign_cidr(node, key, cidr, bits);
115 - rcu_assign_pointer(node->parent_bit, trie);
116 - rcu_assign_pointer(*trie, node);
117 + connect_node(trie, 2, node);
120 if (node_placement(*trie, key, cidr, bits, &node, lock)) {
121 @@ -197,10 +209,10 @@ static int add(struct allowedips_node __
123 down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
125 - down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
126 + const u8 bit = choose(node, key);
127 + down = rcu_dereference_protected(node->bit[bit], lockdep_is_held(lock));
129 - rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
130 - rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
131 + connect_node(&node->bit[bit], bit, newnode);
135 @@ -208,15 +220,11 @@ static int add(struct allowedips_node __
138 if (newnode->cidr == cidr) {
139 - rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
140 - rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
142 - rcu_assign_pointer(newnode->parent_bit, trie);
143 - rcu_assign_pointer(*trie, newnode);
145 - rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
146 - rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
148 + choose_and_connect_node(newnode, down);
150 + connect_node(trie, 2, newnode);
152 + choose_and_connect_node(parent, newnode);
156 @@ -229,17 +237,12 @@ static int add(struct allowedips_node __
157 INIT_LIST_HEAD(&node->peer_list);
158 copy_and_assign_cidr(node, newnode->bits, cidr, bits);
160 - rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
161 - rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
162 - rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
163 - rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
165 - rcu_assign_pointer(node->parent_bit, trie);
166 - rcu_assign_pointer(*trie, node);
168 - rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
169 - rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
171 + choose_and_connect_node(node, down);
172 + choose_and_connect_node(node, newnode);
174 + connect_node(trie, 2, node);
176 + choose_and_connect_node(parent, node);
180 @@ -297,7 +300,8 @@ int wg_allowedips_insert_v6(struct allow
181 void wg_allowedips_remove_by_peer(struct allowedips *table,
182 struct wg_peer *peer, struct mutex *lock)
184 - struct allowedips_node *node, *child, *tmp;
185 + struct allowedips_node *node, *child, **parent_bit, *parent, *tmp;
188 if (list_empty(&peer->allowedips_list))
190 @@ -307,19 +311,29 @@ void wg_allowedips_remove_by_peer(struct
191 RCU_INIT_POINTER(node->peer, NULL);
192 if (node->bit[0] && node->bit[1])
194 - child = rcu_dereference_protected(
195 - node->bit[!rcu_access_pointer(node->bit[0])],
196 - lockdep_is_held(lock));
197 + child = rcu_dereference_protected(node->bit[!rcu_access_pointer(node->bit[0])],
198 + lockdep_is_held(lock));
200 - child->parent_bit = node->parent_bit;
201 - *rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
202 + child->parent_bit_packed = node->parent_bit_packed;
203 + parent_bit = (struct allowedips_node **)(node->parent_bit_packed & ~3UL);
204 + *parent_bit = child;
205 + parent = (void *)parent_bit -
206 + offsetof(struct allowedips_node, bit[node->parent_bit_packed & 1]);
207 + free_parent = !rcu_access_pointer(node->bit[0]) &&
208 + !rcu_access_pointer(node->bit[1]) &&
209 + (node->parent_bit_packed & 3) <= 1 &&
210 + !rcu_access_pointer(parent->peer);
212 + child = rcu_dereference_protected(
213 + parent->bit[!(node->parent_bit_packed & 1)],
214 + lockdep_is_held(lock));
215 call_rcu(&node->rcu, node_free_rcu);
217 - /* TODO: Note that we currently don't walk up and down in order to
218 - * free any potential filler nodes. This means that this function
219 - * doesn't free up as much as it could, which could be revisited
225 + child->parent_bit_packed = parent->parent_bit_packed;
226 + *(struct allowedips_node **)(parent->parent_bit_packed & ~3UL) = child;
227 + call_rcu(&parent->rcu, node_free_rcu);
231 --- a/drivers/net/wireguard/allowedips.h
232 +++ b/drivers/net/wireguard/allowedips.h
233 @@ -19,7 +19,7 @@ struct allowedips_node {
234 u8 bits[16] __aligned(__alignof(u64));
236 /* Keep rarely used members at bottom to be beyond cache line. */
237 - struct allowedips_node *__rcu *parent_bit;
238 + unsigned long parent_bit_packed;
240 struct list_head peer_list;
242 @@ -30,7 +30,7 @@ struct allowedips {
243 struct allowedips_node __rcu *root4;
244 struct allowedips_node __rcu *root6;
247 +} __aligned(4); /* We pack the lower 2 bits of &root, but m68k only gives 16-bit alignment. */
249 void wg_allowedips_init(struct allowedips *table);
250 void wg_allowedips_free(struct allowedips *table, struct mutex *mutex);
251 --- a/drivers/net/wireguard/selftest/allowedips.c
252 +++ b/drivers/net/wireguard/selftest/allowedips.c
255 #include <linux/siphash.h>
257 -static __init void swap_endian_and_apply_cidr(u8 *dst, const u8 *src, u8 bits,
260 - swap_endian(dst, src, bits);
261 - memset(dst + (cidr + 7) / 8, 0, bits / 8 - (cidr + 7) / 8);
263 - dst[(cidr + 7) / 8 - 1] &= ~0U << ((8 - (cidr % 8)) % 8);
266 static __init void print_node(struct allowedips_node *node, u8 bits)
268 char *fmt_connection = KERN_DEBUG "\t\"%p/%d\" -> \"%p/%d\";\n";
269 - char *fmt_declaration = KERN_DEBUG
270 - "\t\"%p/%d\"[style=%s, color=\"#%06x\"];\n";
271 + char *fmt_declaration = KERN_DEBUG "\t\"%p/%d\"[style=%s, color=\"#%06x\"];\n";
272 + u8 ip1[16], ip2[16], cidr1, cidr2;
273 char *style = "dotted";
274 - u8 ip1[16], ip2[16];
280 fmt_connection = KERN_DEBUG "\t\"%pI4/%d\" -> \"%pI4/%d\";\n";
281 - fmt_declaration = KERN_DEBUG
282 - "\t\"%pI4/%d\"[style=%s, color=\"#%06x\"];\n";
283 + fmt_declaration = KERN_DEBUG "\t\"%pI4/%d\"[style=%s, color=\"#%06x\"];\n";
284 } else if (bits == 128) {
285 fmt_connection = KERN_DEBUG "\t\"%pI6/%d\" -> \"%pI6/%d\";\n";
286 - fmt_declaration = KERN_DEBUG
287 - "\t\"%pI6/%d\"[style=%s, color=\"#%06x\"];\n";
288 + fmt_declaration = KERN_DEBUG "\t\"%pI6/%d\"[style=%s, color=\"#%06x\"];\n";
291 hsiphash_key_t key = { { 0 } };
292 @@ -55,24 +45,20 @@ static __init void print_node(struct all
293 hsiphash_1u32(0xabad1dea, &key) % 200;
296 - swap_endian_and_apply_cidr(ip1, node->bits, bits, node->cidr);
297 - printk(fmt_declaration, ip1, node->cidr, style, color);
298 + wg_allowedips_read_node(node, ip1, &cidr1);
299 + printk(fmt_declaration, ip1, cidr1, style, color);
301 - swap_endian_and_apply_cidr(ip2,
302 - rcu_dereference_raw(node->bit[0])->bits, bits,
304 - printk(fmt_connection, ip1, node->cidr, ip2,
305 - rcu_dereference_raw(node->bit[0])->cidr);
306 - print_node(rcu_dereference_raw(node->bit[0]), bits);
307 + wg_allowedips_read_node(rcu_dereference_raw(node->bit[0]), ip2, &cidr2);
308 + printk(fmt_connection, ip1, cidr1, ip2, cidr2);
311 - swap_endian_and_apply_cidr(ip2,
312 - rcu_dereference_raw(node->bit[1])->bits,
314 - printk(fmt_connection, ip1, node->cidr, ip2,
315 - rcu_dereference_raw(node->bit[1])->cidr);
316 - print_node(rcu_dereference_raw(node->bit[1]), bits);
317 + wg_allowedips_read_node(rcu_dereference_raw(node->bit[1]), ip2, &cidr2);
318 + printk(fmt_connection, ip1, cidr1, ip2, cidr2);
321 + print_node(rcu_dereference_raw(node->bit[0]), bits);
323 + print_node(rcu_dereference_raw(node->bit[1]), bits);
326 static __init void print_tree(struct allowedips_node __rcu *top, u8 bits)
327 @@ -121,8 +107,8 @@ static __init inline union nf_inet_addr
329 union nf_inet_addr mask;
331 - memset(&mask, 0x00, 128 / 8);
332 - memset(&mask, 0xff, cidr / 8);
333 + memset(&mask, 0, sizeof(mask));
334 + memset(&mask.all, 0xff, cidr / 8);
336 mask.all[cidr / 32] = (__force u32)htonl(
337 (0xFFFFFFFFUL << (32 - (cidr % 32))) & 0xFFFFFFFFUL);
338 @@ -149,42 +135,36 @@ horrible_mask_self(struct horrible_allow
341 static __init inline bool
342 -horrible_match_v4(const struct horrible_allowedips_node *node,
343 - struct in_addr *ip)
344 +horrible_match_v4(const struct horrible_allowedips_node *node, struct in_addr *ip)
346 return (ip->s_addr & node->mask.ip) == node->ip.ip;
349 static __init inline bool
350 -horrible_match_v6(const struct horrible_allowedips_node *node,
351 - struct in6_addr *ip)
352 +horrible_match_v6(const struct horrible_allowedips_node *node, struct in6_addr *ip)
354 - return (ip->in6_u.u6_addr32[0] & node->mask.ip6[0]) ==
356 - (ip->in6_u.u6_addr32[1] & node->mask.ip6[1]) ==
358 - (ip->in6_u.u6_addr32[2] & node->mask.ip6[2]) ==
360 + return (ip->in6_u.u6_addr32[0] & node->mask.ip6[0]) == node->ip.ip6[0] &&
361 + (ip->in6_u.u6_addr32[1] & node->mask.ip6[1]) == node->ip.ip6[1] &&
362 + (ip->in6_u.u6_addr32[2] & node->mask.ip6[2]) == node->ip.ip6[2] &&
363 (ip->in6_u.u6_addr32[3] & node->mask.ip6[3]) == node->ip.ip6[3];
367 -horrible_insert_ordered(struct horrible_allowedips *table,
368 - struct horrible_allowedips_node *node)
369 +horrible_insert_ordered(struct horrible_allowedips *table, struct horrible_allowedips_node *node)
371 struct horrible_allowedips_node *other = NULL, *where = NULL;
372 u8 my_cidr = horrible_mask_to_cidr(node->mask);
374 hlist_for_each_entry(other, &table->head, table) {
375 - if (!memcmp(&other->mask, &node->mask,
376 - sizeof(union nf_inet_addr)) &&
377 - !memcmp(&other->ip, &node->ip,
378 - sizeof(union nf_inet_addr)) &&
379 - other->ip_version == node->ip_version) {
380 + if (other->ip_version == node->ip_version &&
381 + !memcmp(&other->mask, &node->mask, sizeof(union nf_inet_addr)) &&
382 + !memcmp(&other->ip, &node->ip, sizeof(union nf_inet_addr))) {
383 other->value = node->value;
388 + hlist_for_each_entry(other, &table->head, table) {
390 if (horrible_mask_to_cidr(other->mask) <= my_cidr)
392 @@ -201,8 +181,7 @@ static __init int
393 horrible_allowedips_insert_v4(struct horrible_allowedips *table,
394 struct in_addr *ip, u8 cidr, void *value)
396 - struct horrible_allowedips_node *node = kzalloc(sizeof(*node),
398 + struct horrible_allowedips_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
402 @@ -219,8 +198,7 @@ static __init int
403 horrible_allowedips_insert_v6(struct horrible_allowedips *table,
404 struct in6_addr *ip, u8 cidr, void *value)
406 - struct horrible_allowedips_node *node = kzalloc(sizeof(*node),
408 + struct horrible_allowedips_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
412 @@ -234,39 +212,43 @@ horrible_allowedips_insert_v6(struct hor
416 -horrible_allowedips_lookup_v4(struct horrible_allowedips *table,
417 - struct in_addr *ip)
418 +horrible_allowedips_lookup_v4(struct horrible_allowedips *table, struct in_addr *ip)
420 struct horrible_allowedips_node *node;
423 hlist_for_each_entry(node, &table->head, table) {
424 - if (node->ip_version != 4)
426 - if (horrible_match_v4(node, ip)) {
430 + if (node->ip_version == 4 && horrible_match_v4(node, ip))
431 + return node->value;
438 -horrible_allowedips_lookup_v6(struct horrible_allowedips *table,
439 - struct in6_addr *ip)
440 +horrible_allowedips_lookup_v6(struct horrible_allowedips *table, struct in6_addr *ip)
442 struct horrible_allowedips_node *node;
445 hlist_for_each_entry(node, &table->head, table) {
446 - if (node->ip_version != 6)
447 + if (node->ip_version == 6 && horrible_match_v6(node, ip))
448 + return node->value;
455 +horrible_allowedips_remove_by_value(struct horrible_allowedips *table, void *value)
457 + struct horrible_allowedips_node *node;
458 + struct hlist_node *h;
460 + hlist_for_each_entry_safe(node, h, &table->head, table) {
461 + if (node->value != value)
463 - if (horrible_match_v6(node, ip)) {
467 + hlist_del(&node->table);
474 static __init bool randomized_test(void)
475 @@ -397,23 +379,33 @@ static __init bool randomized_test(void)
476 print_tree(t.root6, 128);
479 - for (i = 0; i < NUM_QUERIES; ++i) {
480 - prandom_bytes(ip, 4);
481 - if (lookup(t.root4, 32, ip) !=
482 - horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip)) {
483 - pr_err("allowedips random self-test: FAIL\n");
485 + for (j = 0;; ++j) {
486 + for (i = 0; i < NUM_QUERIES; ++i) {
487 + prandom_bytes(ip, 4);
488 + if (lookup(t.root4, 32, ip) != horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip)) {
489 + horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip);
490 + pr_err("allowedips random v4 self-test: FAIL\n");
493 + prandom_bytes(ip, 16);
494 + if (lookup(t.root6, 128, ip) != horrible_allowedips_lookup_v6(&h, (struct in6_addr *)ip)) {
495 + pr_err("allowedips random v6 self-test: FAIL\n");
499 + if (j >= NUM_PEERS)
501 + mutex_lock(&mutex);
502 + wg_allowedips_remove_by_peer(&t, peers[j], &mutex);
503 + mutex_unlock(&mutex);
504 + horrible_allowedips_remove_by_value(&h, peers[j]);
507 - for (i = 0; i < NUM_QUERIES; ++i) {
508 - prandom_bytes(ip, 16);
509 - if (lookup(t.root6, 128, ip) !=
510 - horrible_allowedips_lookup_v6(&h, (struct in6_addr *)ip)) {
511 - pr_err("allowedips random self-test: FAIL\n");
514 + if (t.root4 || t.root6) {
515 + pr_err("allowedips random self-test removal: FAIL\n");