From 8c20e2eb5f2a0175b774134685e4d7bd93e85ff8 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:18:59 -0700
Subject: [PATCH 01/19] UPSTREAM: mm: multi-gen LRU: rename lru_gen_struct to
 lru_gen_folio

Patch series "mm: multi-gen LRU: memcg LRU", v3.

Overview
========

A memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
since each node and memcg combination has an LRU of folios (see
mem_cgroup_lruvec()).
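
As a minimal sketch of that mapping (types stubbed down to the fields
that matter here; the names are illustrative stand-ins, not the
kernel's definitions):

	/* illustrative stand-ins, not kernel code */
	struct lruvec { int dummy; };	/* one LRU of folios */

	struct mem_cgroup_per_node {
		struct lruvec lruvec;
	};

	struct mem_cgroup {
		/* one lruvec per node: a "node and memcg combination" */
		struct mem_cgroup_per_node *nodeinfo[64]; /* 64: stand-in */
	};

	/* roughly the mapping that mem_cgroup_lruvec() implements */
	static struct lruvec *sketch_lruvec(struct mem_cgroup *memcg, int nid)
	{
		return &memcg->nodeinfo[nid]->lruvec;
	}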

Its goal is to improve the scalability of global reclaim, which is
critical to system-wide memory overcommit in data centers. Note that
memcg reclaim is currently out of scope.

Its memory overhead is one pointer per lruvec and a negligible amount
per pglist_data. In terms of traversing memcgs during global reclaim,
it improves the best-case complexity from O(n) to O(1) and does not
affect the worst-case complexity O(n). Therefore, on average, it has
a sublinear complexity in contrast to the current linear complexity.

The basic structure of a memcg LRU can be understood by an analogy to
the active/inactive LRU (of folios); a schematic sketch follows the
list:
1. It has the young and the old (generations), i.e., the counterparts
   to the active and the inactive;
2. The increment of max_seq triggers promotion, i.e., the counterpart
   to activation;
3. Other events trigger similar operations, e.g., offlining a memcg
   triggers demotion, i.e., the counterpart to deactivation.
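
A schematic of that structure (illustrative names only; the real
lru_gen_memcg is introduced later, in patch 6 of this series):

	struct lruvec;				/* an LRU of folios */

	#define MEMCG_NR_GENS_SKETCH	2	/* the young and the old */

	struct memcg_lru_sketch {
		/* incrementing max_seq is promotion, the counterpart
		 * to activation */
		unsigned long max_seq;
		/* per generation, the head of a chain of lruvecs;
		 * offlining a memcg demotes its lruvec to the old
		 * generation, the counterpart to deactivation */
		struct lruvec *gens[MEMCG_NR_GENS_SKETCH];
	};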

In terms of global reclaim, it has two distinct features (a toy model
follows the list):
1. Sharding, which allows each thread to start at a random memcg (in
   the old generation) and improves parallelism;
2. Eventual fairness, which allows direct reclaim to bail out at will
   and reduces latency without affecting fairness over time.
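
A toy userspace model of these two features (all names hypothetical;
the real workflow is in the commit message linked below): each pass
starts at a random memcg and may bail out early, and whatever it
skips stays behind for later passes, which is why the fairness is
eventual rather than per-call.

	#include <stdio.h>
	#include <stdlib.h>

	#define NR_MEMCGS 8

	static unsigned long pages[NR_MEMCGS]; /* reclaimable pages per memcg */

	/* one reclaim pass: sharded start, bail out after `budget` pages */
	static unsigned long reclaim_pass(unsigned long budget)
	{
		unsigned long done = 0;
		int start = rand() % NR_MEMCGS;	/* sharding */
		int i;

		for (i = 0; i < NR_MEMCGS && done < budget; i++) {
			int m = (start + i) % NR_MEMCGS;
			unsigned long take = pages[m] < budget - done ?
					     pages[m] : budget - done;

			pages[m] -= take;
			done += take;
		}
		return done;	/* stopping before the budget is also fine */
	}

	int main(void)
	{
		int m, pass;

		srand(42);	/* deterministic for the example */
		for (m = 0; m < NR_MEMCGS; m++)
			pages[m] = 1000;

		for (pass = 0; pass < 32; pass++)
			reclaim_pass(200);

		/* random starting points spread the work across memcgs */
		for (m = 0; m < NR_MEMCGS; m++)
			printf("memcg%d: %lu pages left\n", m, pages[m]);
		return 0;
	}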

The commit message in patch 6 details the workflow:
https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/

The following is a simple test to quickly verify its effectiveness.

Test design:
1. Create multiple memcgs.
2. Each memcg contains a job (fio).
3. All jobs access the same amount of memory randomly.
4. The system does not experience global memory pressure.
5. Periodically write to the root memory.reclaim.

Desired outcome:
1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
   over mean(pgsteal) is close to 0%.
2. The total pgsteal is close to the total requested through
   memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
   to 100%.

Actual outcome [1]:
                                   MGLRU off    MGLRU on
  stddev(pgsteal) / mean(pgsteal)  75%          20%
  sum(pgsteal) / sum(requested)    425%         95%

####################################################################
MEMCGS=128

# one memcg per job
for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    mkdir /sys/fs/cgroup/memcg$memcg
done

start() {
    # move this job into its own memcg
    echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs

    # each job randomly accesses the same amount of memory
    fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
        --filename=/dev/zero --size=1920M --rw=randrw \
        --rate=64m,64m --random_distribution=random \
        --fadvise_hint=0 --time_based --runtime=10h \
        --group_reporting --minimal
}

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    start &
done

# warm up, then request 256m of global reclaim every 6s for an hour
sleep 600

for ((i = 0; i < 600; i++)); do
    echo 256m >/sys/fs/cgroup/memory.reclaim
    sleep 6
done

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
done
####################################################################
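
To check the desired outcome, a small helper (not part of the
original test) can compute both metrics from the pgsteal lines
gathered above; it assumes 4KB pages and the 600 x 256m requests
issued by the loop:

####################################################################
requested=$((600 * 256 * 1024 * 1024 / 4096))  # total pages asked for

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
done | awk -v requested=$requested '
    { sum += $2; sumsq += $2 * $2; n++ }
    END {
        mean = sum / n
        stddev = sqrt(sumsq / n - mean * mean)
        printf "stddev(pgsteal) / mean(pgsteal): %.0f%%\n", 100 * stddev / mean
        printf "sum(pgsteal) / sum(requested):   %.0f%%\n", 100 * sum / requested
    }'
####################################################################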

[1]: This was obtained from running the above script (which touches
     less than 256GB of memory) on an EPYC 7B13 with 512GB DRAM for
     over an hour.

This patch (of 8):

The new name lru_gen_folio will be more distinct from the coming
lru_gen_memcg.

Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Bug: 274865848
(cherry picked from commit 391655fe08d1f942359a11148aa9aaf3f99d6d6f)
Change-Id: I7df67e0e2435ba28f10eaa57d28d98b61a9210a6
Signed-off-by: T.J. Mercier <tjmercier@google.com>
---
 include/linux/mm_inline.h |  4 ++--
 include/linux/mmzone.h    |  6 +++---
 mm/vmscan.c               | 34 +++++++++++++++++-----------------
 mm/workingset.c           |  4 ++--
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index e8ed225d8f7ca..f63968bd7de59 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
 	int zone = folio_zonenum(folio);
 	int delta = folio_nr_pages(folio);
 	enum lru_list lru = type * LRU_INACTIVE_FILE;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
 	VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
@@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
 	int gen = folio_lru_gen(folio);
 	int type = folio_is_file_lru(folio);
 	int zone = folio_zonenum(folio);
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5f74891556f33..bd3e4689f72dc 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -404,7 +404,7 @@ enum {
  * The number of pages in each generation is eventually consistent and therefore
  * can be transiently negative when reset_batch_size() is pending.
  */
-struct lru_gen_struct {
+struct lru_gen_folio {
 	/* the aging increments the youngest generation number */
 	unsigned long max_seq;
 	/* the eviction increments the oldest generation numbers */
@@ -461,7 +461,7 @@ struct lru_gen_mm_state {
 struct lru_gen_mm_walk {
 	/* the lruvec under reclaim */
 	struct lruvec *lruvec;
-	/* unstable max_seq from lru_gen_struct */
+	/* unstable max_seq from lru_gen_folio */
 	unsigned long max_seq;
 	/* the next address within an mm to scan */
 	unsigned long next_addr;
@@ -524,7 +524,7 @@ struct lruvec {
 	unsigned long			flags;
 #ifdef CONFIG_LRU_GEN
 	/* evictable pages divided into generations */
-	struct lru_gen_struct		lrugen;
+	struct lru_gen_folio		lrugen;
 	/* to concurrently iterate lru_gen_mm_list */
 	struct lru_gen_mm_state		mm_state;
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d18296109aa7e..27142caf284c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3190,7 +3190,7 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
 
 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
 {
-	/* see the comment on lru_gen_struct */
+	/* see the comment on lru_gen_folio */
 	return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
 	       get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
 	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
@@ -3596,7 +3596,7 @@ struct ctrl_pos {
 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
 			  struct ctrl_pos *pos)
 {
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
 
 	pos->refaulted = lrugen->avg_refaulted[type][tier] +
@@ -3611,7 +3611,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
 {
 	int hist, tier;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
 	unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
 
@@ -3688,7 +3688,7 @@ static int folio_update_gen(struct folio *folio, int gen)
 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
 	int type = folio_is_file_lru(folio);
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
 	unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
 
@@ -3733,7 +3733,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
 static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
 {
 	int gen, type, zone;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	walk->batched = 0;
 
@@ -4250,7 +4250,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
 {
 	int zone;
 	int remaining = MAX_LRU_BATCH;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
 
 	if (type == LRU_GEN_ANON && !can_swap)
@@ -4286,7 +4286,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
 {
 	int gen, type, zone;
 	bool success = false;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	DEFINE_MIN_SEQ(lruvec);
 
 	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -4307,7 +4307,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
 		;
 	}
 
-	/* see the comment on lru_gen_struct */
+	/* see the comment on lru_gen_folio */
 	if (can_swap) {
 		min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
 		min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
@@ -4329,7 +4329,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
 {
 	int prev, next;
 	int type, zone;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	spin_lock_irq(&lruvec->lru_lock);
 
@@ -4387,7 +4387,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	bool success;
 	struct lru_gen_mm_walk *walk;
 	struct mm_struct *mm = NULL;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
 
@@ -4452,7 +4452,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig
 	unsigned long old = 0;
 	unsigned long young = 0;
 	unsigned long total = 0;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 
 	for (type = !can_swap; type < ANON_AND_FILE; type++) {
@@ -4737,7 +4737,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
 	int delta = folio_nr_pages(folio);
 	int refs = folio_lru_refs(folio);
 	int tier = lru_tier_from_refs(refs);
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
 
@@ -4837,7 +4837,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 	int scanned = 0;
 	int isolated = 0;
 	int remaining = MAX_LRU_BATCH;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 
 	VM_WARN_ON_ONCE(!list_empty(list));
@@ -5237,7 +5237,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 
 static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
 {
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	if (lrugen->enabled) {
 		enum lru_list lru;
@@ -5519,7 +5519,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 	int i;
 	int type, tier;
 	int hist = lru_hist_from_seq(seq);
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
 		seq_printf(m, " %10d", tier);
@@ -5569,7 +5569,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
 	unsigned long seq;
 	bool full = !debugfs_real_fops(m->file)->write;
 	struct lruvec *lruvec = v;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int nid = lruvec_pgdat(lruvec)->node_id;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
@@ -5823,7 +5823,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 	int i;
 	int gen, type, zone;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	lrugen->max_seq = MIN_NR_GENS + 1;
 	lrugen->enabled = lru_gen_enabled();
diff --git a/mm/workingset.c b/mm/workingset.c
index ae7e984b23c6b..688aaa73f64e8 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio)
 	unsigned long token;
 	unsigned long min_seq;
 	struct lruvec *lruvec;
-	struct lru_gen_struct *lrugen;
+	struct lru_gen_folio *lrugen;
 	int type = folio_is_file_lru(folio);
 	int delta = folio_nr_pages(folio);
 	int refs = folio_lru_refs(folio);
@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
 	unsigned long token;
 	unsigned long min_seq;
 	struct lruvec *lruvec;
-	struct lru_gen_struct *lrugen;
+	struct lru_gen_folio *lrugen;
 	struct mem_cgroup *memcg;
 	struct pglist_data *pgdat;
 	int type = folio_is_file_lru(folio);
-- 
2.40.1
