1 From 397624e12244ec038f51cb1f178ccb7a2ec562e5 Mon Sep 17 00:00:00 2001
2 From: "T.J. Alumbaugh" <talumbau@google.com>
3 Date: Wed, 18 Jan 2023 00:18:23 +0000
4 Subject: [PATCH 14/19] UPSTREAM: mm: multi-gen LRU: section for Bloom filters
6 Move Bloom filters code into a dedicated section. Improve the design doc
7 to explain Bloom filter usage and connection between aging and eviction in
10 Link: https://lkml.kernel.org/r/20230118001827.1040870-4-talumbau@google.com
11 Change-Id: I73e866f687c1ed9f5c8538086aa39408b79897db
12 Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
13 Cc: Yu Zhao <yuzhao@google.com>
14 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
15 (cherry picked from commit ccbbbb85945d8f0255aa9dbc1b617017e2294f2c)
17 Signed-off-by: T.J. Mercier <tjmercier@google.com>
19 Documentation/mm/multigen_lru.rst | 16 +++
20 mm/vmscan.c | 180 +++++++++++++++---------------
21 2 files changed, 108 insertions(+), 88 deletions(-)
23 diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst
24 index bd988a142bc2f..770b5d539856c 100644
25 --- a/Documentation/mm/multigen_lru.rst
26 +++ b/Documentation/mm/multigen_lru.rst
27 @@ -170,6 +170,22 @@ promotes hot pages. If the scan was done cacheline efficiently, it
28 adds the PMD entry pointing to the PTE table to the Bloom filter. This
29 forms a feedback loop between the eviction and the aging.
33 +Bloom filters are a space and memory efficient data structure for set
34 +membership test, i.e., test if an element is not in the set or may be
37 +In the eviction path, specifically, in ``lru_gen_look_around()``, if a
38 +PMD has a sufficient number of hot pages, its address is placed in the
39 +filter. In the aging path, set membership means that the PTE range
40 +will be scanned for young pages.
42 +Note that Bloom filters are probabilistic on set membership. If a test
43 +is false positive, the cost is an additional scan of a range of PTEs,
44 +which may yield hot pages anyway. Parameters of the filter itself can
45 +control the false positive rate in the limit.
49 The multi-gen LRU can be disassembled into the following parts:
50 diff --git a/mm/vmscan.c b/mm/vmscan.c
51 index 8fa82630240d6..74b4f9d660b56 100644
54 @@ -3208,6 +3208,98 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
55 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
58 +/******************************************************************************
60 + ******************************************************************************/
63 + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
64 + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
65 + * bits in a bitmap, k is the number of hash functions and n is the number of
68 + * Page table walkers use one of the two filters to reduce their search space.
69 + * To get rid of non-leaf entries that no longer have enough leaf entries, the
70 + * aging uses the double-buffering technique to flip to the other filter each
71 + * time it produces a new generation. For non-leaf entries that have enough
72 + * leaf entries, the aging carries them over to the next generation in
73 + * walk_pmd_range(); the eviction also report them when walking the rmap
74 + * in lru_gen_look_around().
76 + * For future optimizations:
77 + * 1. It's not necessary to keep both filters all the time. The spare one can be
78 + * freed after the RCU grace period and reallocated if needed again.
79 + * 2. And when reallocating, it's worth scaling its size according to the number
80 + * of inserted entries in the other filter, to reduce the memory overhead on
81 + * small systems and false positives on large systems.
82 + * 3. Jenkins' hash function is an alternative to Knuth's.
84 +#define BLOOM_FILTER_SHIFT 15
86 +static inline int filter_gen_from_seq(unsigned long seq)
88 + return seq % NR_BLOOM_FILTERS;
91 +static void get_item_key(void *item, int *key)
93 + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
95 + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
97 + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
98 + key[1] = hash >> BLOOM_FILTER_SHIFT;
101 +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
104 + unsigned long *filter;
105 + int gen = filter_gen_from_seq(seq);
107 + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
111 + get_item_key(item, key);
113 + return test_bit(key[0], filter) && test_bit(key[1], filter);
116 +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
119 + unsigned long *filter;
120 + int gen = filter_gen_from_seq(seq);
122 + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
126 + get_item_key(item, key);
128 + if (!test_bit(key[0], filter))
129 + set_bit(key[0], filter);
130 + if (!test_bit(key[1], filter))
131 + set_bit(key[1], filter);
134 +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
136 + unsigned long *filter;
137 + int gen = filter_gen_from_seq(seq);
139 + filter = lruvec->mm_state.filters[gen];
141 + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
145 + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
146 + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
147 + WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
150 /******************************************************************************
152 ******************************************************************************/
153 @@ -3333,94 +3425,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
158 - * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
159 - * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
160 - * bits in a bitmap, k is the number of hash functions and n is the number of
163 - * Page table walkers use one of the two filters to reduce their search space.
164 - * To get rid of non-leaf entries that no longer have enough leaf entries, the
165 - * aging uses the double-buffering technique to flip to the other filter each
166 - * time it produces a new generation. For non-leaf entries that have enough
167 - * leaf entries, the aging carries them over to the next generation in
168 - * walk_pmd_range(); the eviction also report them when walking the rmap
169 - * in lru_gen_look_around().
171 - * For future optimizations:
172 - * 1. It's not necessary to keep both filters all the time. The spare one can be
173 - * freed after the RCU grace period and reallocated if needed again.
174 - * 2. And when reallocating, it's worth scaling its size according to the number
175 - * of inserted entries in the other filter, to reduce the memory overhead on
176 - * small systems and false positives on large systems.
177 - * 3. Jenkins' hash function is an alternative to Knuth's.
179 -#define BLOOM_FILTER_SHIFT 15
181 -static inline int filter_gen_from_seq(unsigned long seq)
183 - return seq % NR_BLOOM_FILTERS;
186 -static void get_item_key(void *item, int *key)
188 - u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
190 - BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
192 - key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
193 - key[1] = hash >> BLOOM_FILTER_SHIFT;
196 -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
198 - unsigned long *filter;
199 - int gen = filter_gen_from_seq(seq);
201 - filter = lruvec->mm_state.filters[gen];
203 - bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
207 - filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
208 - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
209 - WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
212 -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
215 - unsigned long *filter;
216 - int gen = filter_gen_from_seq(seq);
218 - filter = READ_ONCE(lruvec->mm_state.filters[gen]);
222 - get_item_key(item, key);
224 - if (!test_bit(key[0], filter))
225 - set_bit(key[0], filter);
226 - if (!test_bit(key[1], filter))
227 - set_bit(key[1], filter);
230 -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
233 - unsigned long *filter;
234 - int gen = filter_gen_from_seq(seq);
236 - filter = READ_ONCE(lruvec->mm_state.filters[gen]);
240 - get_item_key(item, key);
242 - return test_bit(key[0], filter) && test_bit(key[1], filter);
245 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)