From 8c20e2eb5f2a0175b774134685e4d7bd93e85ff8 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:18:59 -0700
Subject: [PATCH 01/19] UPSTREAM: mm: multi-gen LRU: rename lru_gen_struct to
 lru_gen_folio

Patch series "mm: multi-gen LRU: memcg LRU", v3.

Overview
========

A memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
since each node and memcg combination has an LRU of folios (see
mem_cgroup_lruvec()).

Its goal is to improve the scalability of global reclaim, which is
critical to system-wide memory overcommit in data centers. Note that
memcg reclaim is currently out of scope.
Its memory overhead is one pointer added to each lruvec and a
negligible amount added to each pglist_data. In terms of traversing
memcgs during global reclaim, it improves the best-case complexity
from O(n) to O(1) and does not affect the worst-case complexity O(n).
Therefore, on average, it has a sublinear complexity in contrast to
the current linear complexity.
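
As a rough illustration of this structure, here is a minimal C sketch.
The names lruvec_stub, memcg_lru_stub and pop_oldest are hypothetical,
not the kernel's actual layout; each lruvec pays one pointer, and the
per-node part is only a few list heads:

#define NR_MEMCG_GENS 2	/* the young and the old */

struct lruvec_stub {
	struct lruvec_stub *next;	/* linkage within its generation */
};

struct memcg_lru_stub {
	unsigned long seq;	/* incrementing it creates a new young gen */
	struct lruvec_stub *gen[NR_MEMCG_GENS];
};

/*
 * Best case O(1): global reclaim pops the head of the oldest
 * generation instead of iterating over every memcg.  The worst case,
 * when all memcgs sit in one generation, remains O(n).
 */
static struct lruvec_stub *pop_oldest(struct memcg_lru_stub *lru)
{
	unsigned long old = (lru->seq + 1) % NR_MEMCG_GENS;
	struct lruvec_stub *head = lru->gen[old];

	if (head)
		lru->gen[old] = head->next;
	return head;
}
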
The basic structure of a memcg LRU can be understood by an analogy to
the active/inactive LRU (of folios), sketched in code after this list:
1. It has the young and the old (generations), i.e., the counterparts
   to the active and the inactive;
2. The increment of max_seq triggers promotion, i.e., the counterpart
   to activation;
3. Other events trigger similar operations, e.g., offlining a memcg
   triggers demotion, i.e., the counterpart to deactivation.
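
Continuing the hypothetical sketch above, the two operations could look
as follows (the real code in mm/vmscan.c differs, and a full version
would first unlink the lruvec from its current generation):

/* counterpart to activation: link into the young generation */
static void memcg_lru_promote(struct memcg_lru_stub *lru,
			      struct lruvec_stub *lruvec)
{
	unsigned long young = lru->seq % NR_MEMCG_GENS;

	lruvec->next = lru->gen[young];
	lru->gen[young] = lruvec;
}

/* counterpart to deactivation: e.g. offlining links the memcg into the
 * old generation so that it is reclaimed from, and freed, sooner */
static void memcg_lru_demote(struct memcg_lru_stub *lru,
			     struct lruvec_stub *lruvec)
{
	unsigned long old = (lru->seq + 1) % NR_MEMCG_GENS;

	lruvec->next = lru->gen[old];
	lru->gen[old] = lruvec;
}
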
In terms of global reclaim, it has two distinct features, illustrated
by the sketch after this list:
1. Sharding, which allows each thread to start at a random memcg (in
   the old generation) and improves parallelism;
2. Eventual fairness, which allows direct reclaim to bail out at will
   and reduces latency without affecting fairness over some time.
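
A sketch of both features, again with hypothetical helpers (nr_memcgs,
shrink_one and enough_reclaimed are placeholders; patch 6 describes the
real workflow):

#include <stdbool.h>
#include <stdlib.h>

static int nr_memcgs(void) { return 128; }		/* placeholder */
static void shrink_one(int idx) { (void)idx; }		/* placeholder */
static bool enough_reclaimed(void) { return false; }	/* placeholder */

static void global_reclaim_round(void)
{
	int n = nr_memcgs();
	/* sharding: each thread starts at a random memcg */
	int start = rand() % n;

	for (int i = 0; i < n; i++) {
		shrink_one((start + i) % n);
		/*
		 * Eventual fairness: direct reclaim may bail out here at
		 * will; memcgs not visited in this round remain in the
		 * old generation and are visited in later rounds.
		 */
		if (enough_reclaimed())
			break;
	}
}
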
The commit message in patch 6 details the workflow:
https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/

The following is a simple test to quickly verify its effectiveness.

Test design:
1. Create multiple memcgs.
2. Each memcg contains a job (fio).
3. All jobs access the same amount of memory randomly.
4. The system does not experience global memory pressure.
5. Periodically write to the root memory.reclaim.

Expected outcome:
1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
   over mean(pgsteal) is close to 0%.
2. The total pgsteal is close to the total requested through
   memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
   to 100%.

Actual outcome [1]:
                                   MGLRU off    MGLRU on
stddev(pgsteal) / mean(pgsteal)    75%          20%
sum(pgsteal) / sum(requested)      425%         95%
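
For scale (derived from the script below, not stated in the original
data): the reclaim loop issues 600 writes of 256m, so sum(requested)
is 600 * 256 MiB = 150 GiB per run; 95% therefore corresponds to
roughly 142 GiB reclaimed as requested, while 425% means over four
times the requested amount was reclaimed.
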
####################################################################
MEMCGS=128

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    mkdir /sys/fs/cgroup/memcg$memcg
done

start() {
    echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs

    fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
        --filename=/dev/zero --size=1920M --rw=randrw \
        --rate=64m,64m --random_distribution=random \
        --fadvise_hint=0 --time_based --runtime=10h \
        --group_reporting --minimal
}

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    start &
done

sleep 600

for ((i = 0; i < 600; i++)); do
    echo 256m >/sys/fs/cgroup/memory.reclaim
    sleep 6
done

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
done
####################################################################
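
To turn the grep output above into the stddev(pgsteal)/mean(pgsteal)
metric, a small helper like the following can be used.  It is not part
of the original test; the pgsteal_cv name and the awk pre-filter are
illustrative only, e.g.:
    grep -h "pgsteal " /sys/fs/cgroup/memcg*/memory.stat | \
        awk '{print $2}' | ./pgsteal_cv

#include <math.h>
#include <stdio.h>

int main(void)
{
	double x, sum = 0, sumsq = 0;
	int n = 0;

	while (scanf("%lf", &x) == 1) {
		sum += x;
		sumsq += x * x;
		n++;
	}
	if (!n)
		return 1;

	double mean = sum / n;
	double var = sumsq / n - mean * mean;	/* population variance */

	printf("stddev/mean = %.0f%%\n", 100 * sqrt(var) / mean);
	return 0;
}
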
[1]: This was obtained from running the above script (touches less
     than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an
     hour.

This patch (of 8):

The new name lru_gen_folio will be more distinct from the coming
lru_gen_memcg.

Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

(cherry picked from commit 391655fe08d1f942359a11148aa9aaf3f99d6d6f)
Change-Id: I7df67e0e2435ba28f10eaa57d28d98b61a9210a6
Signed-off-by: T.J. Mercier <tjmercier@google.com>
---
 include/linux/mm_inline.h |  4 ++--
 include/linux/mmzone.h    |  6 +++---
 mm/vmscan.c               | 34 +++++++++++++++++-----------------
 mm/workingset.c           |  4 ++--
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index e8ed225d8f7ca..f63968bd7de59 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
 	int zone = folio_zonenum(folio);
 	int delta = folio_nr_pages(folio);
 	enum lru_list lru = type * LRU_INACTIVE_FILE;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;

 	VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
 	VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
@@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
 	int gen = folio_lru_gen(folio);
 	int type = folio_is_file_lru(folio);
 	int zone = folio_zonenum(folio);
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;

 	VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5f74891556f33..bd3e4689f72dc 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -404,7 +404,7 @@ enum {
  * The number of pages in each generation is eventually consistent and therefore
  * can be transiently negative when reset_batch_size() is pending.
  */
-struct lru_gen_struct {
+struct lru_gen_folio {
 	/* the aging increments the youngest generation number */
 	unsigned long max_seq;
 	/* the eviction increments the oldest generation numbers */
@@ -461,7 +461,7 @@ struct lru_gen_mm_state {
 struct lru_gen_mm_walk {
 	/* the lruvec under reclaim */
 	struct lruvec *lruvec;
-	/* unstable max_seq from lru_gen_struct */
+	/* unstable max_seq from lru_gen_folio */
 	unsigned long max_seq;
 	/* the next address within an mm to scan */
 	unsigned long next_addr;
@@ -524,7 +524,7 @@ struct lruvec {
 #ifdef CONFIG_LRU_GEN
 	/* evictable pages divided into generations */
-	struct lru_gen_struct		lrugen;
+	struct lru_gen_folio		lrugen;
 	/* to concurrently iterate lru_gen_mm_list */
 	struct lru_gen_mm_state		mm_state;
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d18296109aa7e..27142caf284c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3190,7 +3190,7 @@ static int get_nr_gens(struct lruvec *lruvec, int type)

 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
 {
-	/* see the comment on lru_gen_struct */
+	/* see the comment on lru_gen_folio */
 	return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
 	       get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
 	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
@@ -3596,7 +3596,7 @@ struct ctrl_pos {
 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
 			  struct ctrl_pos *pos)
 {
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int hist = lru_hist_from_seq(lrugen->min_seq[type]);

 	pos->refaulted = lrugen->avg_refaulted[type][tier] +
@@ -3611,7 +3611,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
 {
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
 	unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;

@@ -3688,7 +3688,7 @@ static int folio_update_gen(struct folio *folio, int gen)
 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
 	int type = folio_is_file_lru(folio);
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
 	unsigned long new_flags, old_flags = READ_ONCE(folio->flags);

@@ -3733,7 +3733,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
 static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
 {
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
@@ -4250,7 +4250,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
 	int remaining = MAX_LRU_BATCH;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);

 	if (type == LRU_GEN_ANON && !can_swap)
@@ -4286,7 +4286,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
 	bool success = false;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	DEFINE_MIN_SEQ(lruvec);

 	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -4307,7 +4307,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
-	/* see the comment on lru_gen_struct */
+	/* see the comment on lru_gen_folio */
 	if (can_swap) {
 		min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
 		min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
@@ -4329,7 +4329,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;

 	spin_lock_irq(&lruvec->lru_lock);

@@ -4387,7 +4387,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	struct lru_gen_mm_walk *walk;
 	struct mm_struct *mm = NULL;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;

 	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
@@ -4452,7 +4452,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig
 	unsigned long old = 0;
 	unsigned long young = 0;
 	unsigned long total = 0;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);

 	for (type = !can_swap; type < ANON_AND_FILE; type++) {
@@ -4737,7 +4737,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
 	int delta = folio_nr_pages(folio);
 	int refs = folio_lru_refs(folio);
 	int tier = lru_tier_from_refs(refs);
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;

 	VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4837,7 +4837,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 	int remaining = MAX_LRU_BATCH;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);

 	VM_WARN_ON_ONCE(!list_empty(list));
@@ -5237,7 +5237,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc

 static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
 {
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;

 	if (lrugen->enabled) {
@@ -5519,7 +5519,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 	int hist = lru_hist_from_seq(seq);
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;

 	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
 		seq_printf(m, "            %10d", tier);
@@ -5569,7 +5569,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
 	bool full = !debugfs_real_fops(m->file)->write;
 	struct lruvec *lruvec = v;
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int nid = lruvec_pgdat(lruvec)->node_id;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
@@ -5823,7 +5823,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;

 	lrugen->max_seq = MIN_NR_GENS + 1;
 	lrugen->enabled = lru_gen_enabled();
diff --git a/mm/workingset.c b/mm/workingset.c
index ae7e984b23c6b..688aaa73f64e8 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio)
 	unsigned long min_seq;
 	struct lruvec *lruvec;
-	struct lru_gen_struct *lrugen;
+	struct lru_gen_folio *lrugen;
 	int type = folio_is_file_lru(folio);
 	int delta = folio_nr_pages(folio);
 	int refs = folio_lru_refs(folio);
@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
 	unsigned long min_seq;
 	struct lruvec *lruvec;
-	struct lru_gen_struct *lrugen;
+	struct lru_gen_folio *lrugen;
 	struct mem_cgroup *memcg;
 	struct pglist_data *pgdat;
 	int type = folio_is_file_lru(folio);