836a16b8c77a4f47667806718a86c2a0141f5fc1
[openwrt/staging/blogic.git] /
1 From 348fdbada9fb3f0bf1a53651be46319105af187f Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Wed, 21 Dec 2022 21:18:59 -0700
4 Subject: [PATCH 21/29] mm: multi-gen LRU: rename lru_gen_struct to
5 lru_gen_page
6
7 Patch series "mm: multi-gen LRU: memcg LRU", v3.
8
9 Overview
10 ========
11
12 An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
13 since each node and memcg combination has an LRU of pages (see
14 mem_cgroup_lruvec()).
15
16 Its goal is to improve the scalability of global reclaim, which is
17 critical to system-wide memory overcommit in data centers. Note that
18 memcg reclaim is currently out of scope.
19
20 Its memory overhead is one pointer per lruvec and a negligible amount
21 per pglist_data. In terms of traversing memcgs during global reclaim, it
22 improves the best-case complexity from O(n) to O(1) and does not affect
23 the worst-case complexity O(n). Therefore, on average, it has a sublinear
24 complexity in contrast to the current linear complexity.
25
26 The basic structure of an memcg LRU can be understood by an analogy to
27 the active/inactive LRU (of pages):
28 1. It has the young and the old (generations), i.e., the counterparts
29 to the active and the inactive;
30 2. The increment of max_seq triggers promotion, i.e., the counterpart
31 to activation;
32 3. Other events trigger similar operations, e.g., offlining an memcg
33 triggers demotion, i.e., the counterpart to deactivation.
34
35 In terms of global reclaim, it has two distinct features:
36 1. Sharding, which allows each thread to start at a random memcg (in
37 the old generation) and improves parallelism;
38 2. Eventual fairness, which allows direct reclaim to bail out at will
39 and reduces latency without affecting fairness over some time.
40
41 The commit message in patch 6 details the workflow:
42 https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/
43
44 The following is a simple test to quickly verify its effectiveness.
45
46 Test design:
47 1. Create multiple memcgs.
48 2. Each memcg contains a job (fio).
49 3. All jobs access the same amount of memory randomly.
50 4. The system does not experience global memory pressure.
51 5. Periodically write to the root memory.reclaim.
52
53 Desired outcome:
54 1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
55 over mean(pgsteal) is close to 0%.
56 2. The total pgsteal is close to the total requested through
57 memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
58 to 100%.
59
60 Actual outcome [1]:
61                                      MGLRU off    MGLRU on
62   stddev(pgsteal) / mean(pgsteal)    75%          20%
63   sum(pgsteal) / sum(requested)      425%         95%
64
65 ####################################################################
66 MEMCGS=128
67
68 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
69 mkdir /sys/fs/cgroup/memcg$memcg
70 done
71
72 start() {
73 echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs
74
75 fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
76 --filename=/dev/zero --size=1920M --rw=randrw \
77 --rate=64m,64m --random_distribution=random \
78 --fadvise_hint=0 --time_based --runtime=10h \
79 --group_reporting --minimal
80 }
81
82 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
83 start &
84 done
85
86 sleep 600
87
88 for ((i = 0; i < 600; i++)); do
89 echo 256m >/sys/fs/cgroup/memory.reclaim
90 sleep 6
91 done
92
93 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
94 grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
95 done
96 ####################################################################
97
98 [1]: This was obtained by running the above script (which touches less
99 than 256GB of memory) on an EPYC 7B13 with 512GB of DRAM for over an
100 hour.
101
102 This patch (of 8):
103
104 The new name lru_gen_page will be more distinct from the coming
105 lru_gen_memcg.
106
107 Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com
108 Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com
109 Signed-off-by: Yu Zhao <yuzhao@google.com>
110 Cc: Johannes Weiner <hannes@cmpxchg.org>
111 Cc: Jonathan Corbet <corbet@lwn.net>
112 Cc: Michael Larabel <Michael@MichaelLarabel.com>
113 Cc: Michal Hocko <mhocko@kernel.org>
114 Cc: Mike Rapoport <rppt@kernel.org>
115 Cc: Roman Gushchin <roman.gushchin@linux.dev>
116 Cc: Suren Baghdasaryan <surenb@google.com>
117 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
118 ---
119 include/linux/mm_inline.h | 4 ++--
120 include/linux/mmzone.h | 6 +++---
121 mm/vmscan.c | 34 +++++++++++++++++-----------------
122 mm/workingset.c | 4 ++--
123 4 files changed, 24 insertions(+), 24 deletions(-)
124
125 diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
126 index 8a6a2a23f9b6..27c4890503c5 100644
127 --- a/include/linux/mm_inline.h
128 +++ b/include/linux/mm_inline.h
129 @@ -168,7 +168,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
130 int zone = page_zonenum(page);
131 int delta = thp_nr_pages(page);
132 enum lru_list lru = type * LRU_INACTIVE_FILE;
133 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
134 + struct lru_gen_page *lrugen = &lruvec->lrugen;
135
136 VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
137 VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
138 @@ -214,7 +214,7 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo
139 int gen = page_lru_gen(page);
140 int type = page_is_file_lru(page);
141 int zone = page_zonenum(page);
142 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
143 + struct lru_gen_page *lrugen = &lruvec->lrugen;
144
145 VM_WARN_ON_ONCE_PAGE(gen != -1, page);
146
147 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
148 index 6b85ba1f4e18..5856b026c089 100644
149 --- a/include/linux/mmzone.h
150 +++ b/include/linux/mmzone.h
151 @@ -394,7 +394,7 @@ enum {
152 * The number of pages in each generation is eventually consistent and therefore
153 * can be transiently negative when reset_batch_size() is pending.
154 */
155 -struct lru_gen_struct {
156 +struct lru_gen_page {
157 /* the aging increments the youngest generation number */
158 unsigned long max_seq;
159 /* the eviction increments the oldest generation numbers */
160 @@ -451,7 +451,7 @@ struct lru_gen_mm_state {
161 struct lru_gen_mm_walk {
162 /* the lruvec under reclaim */
163 struct lruvec *lruvec;
164 - /* unstable max_seq from lru_gen_struct */
165 + /* unstable max_seq from lru_gen_page */
166 unsigned long max_seq;
167 /* the next address within an mm to scan */
168 unsigned long next_addr;
169 @@ -514,7 +514,7 @@ struct lruvec {
170 unsigned long flags;
171 #ifdef CONFIG_LRU_GEN
172 /* evictable pages divided into generations */
173 - struct lru_gen_struct lrugen;
174 + struct lru_gen_page lrugen;
175 /* to concurrently iterate lru_gen_mm_list */
176 struct lru_gen_mm_state mm_state;
177 #endif
178 diff --git a/mm/vmscan.c b/mm/vmscan.c
179 index 4ab376abeaae..3b1b5bd9736a 100644
180 --- a/mm/vmscan.c
181 +++ b/mm/vmscan.c
182 @@ -2910,7 +2910,7 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
183
184 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
185 {
186 - /* see the comment on lru_gen_struct */
187 + /* see the comment on lru_gen_page */
188 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
189 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
190 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
191 @@ -3316,7 +3316,7 @@ struct ctrl_pos {
192 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
193 struct ctrl_pos *pos)
194 {
195 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
196 + struct lru_gen_page *lrugen = &lruvec->lrugen;
197 int hist = lru_hist_from_seq(lrugen->min_seq[type]);
198
199 pos->refaulted = lrugen->avg_refaulted[type][tier] +
200 @@ -3331,7 +3331,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
201 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
202 {
203 int hist, tier;
204 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
205 + struct lru_gen_page *lrugen = &lruvec->lrugen;
206 bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
207 unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
208
209 @@ -3408,7 +3408,7 @@ static int page_update_gen(struct page *page, int gen)
210 static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
211 {
212 int type = page_is_file_lru(page);
213 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
214 + struct lru_gen_page *lrugen = &lruvec->lrugen;
215 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
216 unsigned long new_flags, old_flags = READ_ONCE(page->flags);
217
218 @@ -3453,7 +3453,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page,
219 static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
220 {
221 int gen, type, zone;
222 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
223 + struct lru_gen_page *lrugen = &lruvec->lrugen;
224
225 walk->batched = 0;
226
227 @@ -3979,7 +3979,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
228 {
229 int zone;
230 int remaining = MAX_LRU_BATCH;
231 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
232 + struct lru_gen_page *lrugen = &lruvec->lrugen;
233 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
234
235 if (type == LRU_GEN_ANON && !can_swap)
236 @@ -4015,7 +4015,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
237 {
238 int gen, type, zone;
239 bool success = false;
240 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
241 + struct lru_gen_page *lrugen = &lruvec->lrugen;
242 DEFINE_MIN_SEQ(lruvec);
243
244 VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
245 @@ -4036,7 +4036,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
246 ;
247 }
248
249 - /* see the comment on lru_gen_struct */
250 + /* see the comment on lru_gen_page */
251 if (can_swap) {
252 min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
253 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
254 @@ -4058,7 +4058,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
255 {
256 int prev, next;
257 int type, zone;
258 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
259 + struct lru_gen_page *lrugen = &lruvec->lrugen;
260
261 spin_lock_irq(&lruvec->lru_lock);
262
263 @@ -4116,7 +4116,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
264 bool success;
265 struct lru_gen_mm_walk *walk;
266 struct mm_struct *mm = NULL;
267 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
268 + struct lru_gen_page *lrugen = &lruvec->lrugen;
269
270 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
271
272 @@ -4181,7 +4181,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig
273 unsigned long old = 0;
274 unsigned long young = 0;
275 unsigned long total = 0;
276 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
277 + struct lru_gen_page *lrugen = &lruvec->lrugen;
278 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
279
280 for (type = !can_swap; type < ANON_AND_FILE; type++) {
281 @@ -4466,7 +4466,7 @@ static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx)
282 int delta = thp_nr_pages(page);
283 int refs = page_lru_refs(page);
284 int tier = lru_tier_from_refs(refs);
285 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
286 + struct lru_gen_page *lrugen = &lruvec->lrugen;
287
288 VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);
289
290 @@ -4566,7 +4566,7 @@ static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
291 int scanned = 0;
292 int isolated = 0;
293 int remaining = MAX_LRU_BATCH;
294 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
295 + struct lru_gen_page *lrugen = &lruvec->lrugen;
296 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
297
298 VM_WARN_ON_ONCE(!list_empty(list));
299 @@ -4967,7 +4967,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
300
301 static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
302 {
303 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
304 + struct lru_gen_page *lrugen = &lruvec->lrugen;
305
306 if (lrugen->enabled) {
307 enum lru_list lru;
308 @@ -5247,7 +5247,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
309 int i;
310 int type, tier;
311 int hist = lru_hist_from_seq(seq);
312 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
313 + struct lru_gen_page *lrugen = &lruvec->lrugen;
314
315 for (tier = 0; tier < MAX_NR_TIERS; tier++) {
316 seq_printf(m, " %10d", tier);
317 @@ -5296,7 +5296,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
318 unsigned long seq;
319 bool full = !debugfs_real_fops(m->file)->write;
320 struct lruvec *lruvec = v;
321 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
322 + struct lru_gen_page *lrugen = &lruvec->lrugen;
323 int nid = lruvec_pgdat(lruvec)->node_id;
324 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
325 DEFINE_MAX_SEQ(lruvec);
326 @@ -5549,7 +5549,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
327 {
328 int i;
329 int gen, type, zone;
330 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
331 + struct lru_gen_page *lrugen = &lruvec->lrugen;
332
333 lrugen->max_seq = MIN_NR_GENS + 1;
334 lrugen->enabled = lru_gen_enabled();
335 diff --git a/mm/workingset.c b/mm/workingset.c
336 index aeba62cebf8c..a5e1798c6d60 100644
337 --- a/mm/workingset.c
338 +++ b/mm/workingset.c
339 @@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct page *page)
340 unsigned long token;
341 unsigned long min_seq;
342 struct lruvec *lruvec;
343 - struct lru_gen_struct *lrugen;
344 + struct lru_gen_page *lrugen;
345 int type = page_is_file_lru(page);
346 int delta = thp_nr_pages(page);
347 int refs = page_lru_refs(page);
348 @@ -252,7 +252,7 @@ static void lru_gen_refault(struct page *page, void *shadow)
349 unsigned long token;
350 unsigned long min_seq;
351 struct lruvec *lruvec;
352 - struct lru_gen_struct *lrugen;
353 + struct lru_gen_page *lrugen;
354 struct mem_cgroup *memcg;
355 struct pglist_data *pgdat;
356 int type = page_is_file_lru(page);
357 --
358 2.40.0
359