1 From 348fdbada9fb3f0bf1a53651be46319105af187f Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Wed, 21 Dec 2022 21:18:59 -0700
4 Subject: [PATCH 21/29] mm: multi-gen LRU: rename lru_gen_struct to
7 Patch series "mm: multi-gen LRU: memcg LRU", v3.
12 A memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
13 since each node and memcg combination has an LRU of pages (see
16 Its goal is to improve the scalability of global reclaim, which is
17 critical to system-wide memory overcommit in data centers. Note that
18 memcg reclaim is currently out of scope.
20 Its memory bloat is a pointer to each lruvec and negligible to each
21 pglist_data. In terms of traversing memcgs during global reclaim, it
22 improves the best-case complexity from O(n) to O(1) and does not affect
23 the worst-case complexity O(n). Therefore, on average, it has a sublinear
24 complexity in contrast to the current linear complexity.
26 The basic structure of a memcg LRU can be understood by an analogy to
27 the active/inactive LRU (of pages):
28 1. It has the young and the old (generations), i.e., the counterparts
29 to the active and the inactive;
30 2. The increment of max_seq triggers promotion, i.e., the counterpart
32 3. Other events trigger similar operations, e.g., offlining a memcg
33 triggers demotion, i.e., the counterpart to deactivation.
35 In terms of global reclaim, it has two distinct features:
36 1. Sharding, which allows each thread to start at a random memcg (in
37 the old generation) and improves parallelism;
38 2. Eventual fairness, which allows direct reclaim to bail out at will
39 and reduces latency without affecting fairness over some time.
41 The commit message in patch 6 details the workflow:
42 https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/
44 The following is a simple test to quickly verify its effectiveness.
47 1. Create multiple memcgs.
48 2. Each memcg contains a job (fio).
49 3. All jobs access the same amount of memory randomly.
50 4. The system does not experience global memory pressure.
51 5. Periodically write to the root memory.reclaim.
54 1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
55 over mean(pgsteal) is close to 0%.
56 2. The total pgsteal is close to the total requested through
57 memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
62 stddev(pgsteal) / mean(pgsteal) 75% 20%
63 sum(pgsteal) / sum(requested) 425% 95%
65 ####################################################################
68 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
69 mkdir /sys/fs/cgroup/memcg$memcg
73 echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs
75 fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
76 --filename=/dev/zero --size=1920M --rw=randrw \
77 --rate=64m,64m --random_distribution=random \
78 --fadvise_hint=0 --time_based --runtime=10h \
79 --group_reporting --minimal
82 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
88 for ((i = 0; i < 600; i++)); do
89 echo 256m >/sys/fs/cgroup/memory.reclaim
93 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
94 grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
96 ####################################################################
98 [1]: This was obtained from running the above script (touches less
99 than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an
104 The new name lru_gen_page will be more distinct from the coming
107 Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com
108 Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com
109 Signed-off-by: Yu Zhao <yuzhao@google.com>
110 Cc: Johannes Weiner <hannes@cmpxchg.org>
111 Cc: Jonathan Corbet <corbet@lwn.net>
112 Cc: Michael Larabel <Michael@MichaelLarabel.com>
113 Cc: Michal Hocko <mhocko@kernel.org>
114 Cc: Mike Rapoport <rppt@kernel.org>
115 Cc: Roman Gushchin <roman.gushchin@linux.dev>
116 Cc: Suren Baghdasaryan <surenb@google.com>
117 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
119 include/linux/mm_inline.h | 4 ++--
120 include/linux/mmzone.h | 6 +++---
121 mm/vmscan.c | 34 +++++++++++++++++-----------------
122 mm/workingset.c | 4 ++--
123 4 files changed, 24 insertions(+), 24 deletions(-)
125 --- a/include/linux/mm_inline.h
126 +++ b/include/linux/mm_inline.h
127 @@ -168,7 +168,7 @@ static inline void lru_gen_update_size(s
128 int zone = page_zonenum(page);
129 int delta = thp_nr_pages(page);
130 enum lru_list lru = type * LRU_INACTIVE_FILE;
131 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
132 + struct lru_gen_page *lrugen = &lruvec->lrugen;
134 VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
135 VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
136 @@ -214,7 +214,7 @@ static inline bool lru_gen_add_page(stru
137 int gen = page_lru_gen(page);
138 int type = page_is_file_lru(page);
139 int zone = page_zonenum(page);
140 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
141 + struct lru_gen_page *lrugen = &lruvec->lrugen;
143 VM_WARN_ON_ONCE_PAGE(gen != -1, page);
145 --- a/include/linux/mmzone.h
146 +++ b/include/linux/mmzone.h
147 @@ -394,7 +394,7 @@ enum {
148 * The number of pages in each generation is eventually consistent and therefore
149 * can be transiently negative when reset_batch_size() is pending.
151 -struct lru_gen_struct {
152 +struct lru_gen_page {
153 /* the aging increments the youngest generation number */
154 unsigned long max_seq;
155 /* the eviction increments the oldest generation numbers */
156 @@ -451,7 +451,7 @@ struct lru_gen_mm_state {
157 struct lru_gen_mm_walk {
158 /* the lruvec under reclaim */
159 struct lruvec *lruvec;
160 - /* unstable max_seq from lru_gen_struct */
161 + /* unstable max_seq from lru_gen_page */
162 unsigned long max_seq;
163 /* the next address within an mm to scan */
164 unsigned long next_addr;
165 @@ -514,7 +514,7 @@ struct lruvec {
167 #ifdef CONFIG_LRU_GEN
168 /* evictable pages divided into generations */
169 - struct lru_gen_struct lrugen;
170 + struct lru_gen_page lrugen;
171 /* to concurrently iterate lru_gen_mm_list */
172 struct lru_gen_mm_state mm_state;
176 @@ -2910,7 +2910,7 @@ static int get_nr_gens(struct lruvec *lr
178 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
180 - /* see the comment on lru_gen_struct */
181 + /* see the comment on lru_gen_page */
182 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
183 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
184 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
185 @@ -3316,7 +3316,7 @@ struct ctrl_pos {
186 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
187 struct ctrl_pos *pos)
189 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
190 + struct lru_gen_page *lrugen = &lruvec->lrugen;
191 int hist = lru_hist_from_seq(lrugen->min_seq[type]);
193 pos->refaulted = lrugen->avg_refaulted[type][tier] +
194 @@ -3331,7 +3331,7 @@ static void read_ctrl_pos(struct lruvec
195 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
198 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
199 + struct lru_gen_page *lrugen = &lruvec->lrugen;
200 bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
201 unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
203 @@ -3408,7 +3408,7 @@ static int page_update_gen(struct page *
204 static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
206 int type = page_is_file_lru(page);
207 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
208 + struct lru_gen_page *lrugen = &lruvec->lrugen;
209 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
210 unsigned long new_flags, old_flags = READ_ONCE(page->flags);
212 @@ -3453,7 +3453,7 @@ static void update_batch_size(struct lru
213 static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
216 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
217 + struct lru_gen_page *lrugen = &lruvec->lrugen;
221 @@ -3979,7 +3979,7 @@ static bool inc_min_seq(struct lruvec *l
224 int remaining = MAX_LRU_BATCH;
225 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
226 + struct lru_gen_page *lrugen = &lruvec->lrugen;
227 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
229 if (type == LRU_GEN_ANON && !can_swap)
230 @@ -4015,7 +4015,7 @@ static bool try_to_inc_min_seq(struct lr
233 bool success = false;
234 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
235 + struct lru_gen_page *lrugen = &lruvec->lrugen;
236 DEFINE_MIN_SEQ(lruvec);
238 VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
239 @@ -4036,7 +4036,7 @@ next:
243 - /* see the comment on lru_gen_struct */
244 + /* see the comment on lru_gen_page */
246 min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
247 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
248 @@ -4058,7 +4058,7 @@ static void inc_max_seq(struct lruvec *l
252 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
253 + struct lru_gen_page *lrugen = &lruvec->lrugen;
255 spin_lock_irq(&lruvec->lru_lock);
257 @@ -4116,7 +4116,7 @@ static bool try_to_inc_max_seq(struct lr
259 struct lru_gen_mm_walk *walk;
260 struct mm_struct *mm = NULL;
261 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
262 + struct lru_gen_page *lrugen = &lruvec->lrugen;
264 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
266 @@ -4181,7 +4181,7 @@ static bool should_run_aging(struct lruv
267 unsigned long old = 0;
268 unsigned long young = 0;
269 unsigned long total = 0;
270 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
271 + struct lru_gen_page *lrugen = &lruvec->lrugen;
272 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
274 for (type = !can_swap; type < ANON_AND_FILE; type++) {
275 @@ -4466,7 +4466,7 @@ static bool sort_page(struct lruvec *lru
276 int delta = thp_nr_pages(page);
277 int refs = page_lru_refs(page);
278 int tier = lru_tier_from_refs(refs);
279 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
280 + struct lru_gen_page *lrugen = &lruvec->lrugen;
282 VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);
284 @@ -4566,7 +4566,7 @@ static int scan_pages(struct lruvec *lru
287 int remaining = MAX_LRU_BATCH;
288 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
289 + struct lru_gen_page *lrugen = &lruvec->lrugen;
290 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
292 VM_WARN_ON_ONCE(!list_empty(list));
293 @@ -4967,7 +4967,7 @@ done:
295 static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
297 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
298 + struct lru_gen_page *lrugen = &lruvec->lrugen;
300 if (lrugen->enabled) {
302 @@ -5247,7 +5247,7 @@ static void lru_gen_seq_show_full(struct
305 int hist = lru_hist_from_seq(seq);
306 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
307 + struct lru_gen_page *lrugen = &lruvec->lrugen;
309 for (tier = 0; tier < MAX_NR_TIERS; tier++) {
310 seq_printf(m, " %10d", tier);
311 @@ -5296,7 +5296,7 @@ static int lru_gen_seq_show(struct seq_f
313 bool full = !debugfs_real_fops(m->file)->write;
314 struct lruvec *lruvec = v;
315 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
316 + struct lru_gen_page *lrugen = &lruvec->lrugen;
317 int nid = lruvec_pgdat(lruvec)->node_id;
318 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
319 DEFINE_MAX_SEQ(lruvec);
320 @@ -5549,7 +5549,7 @@ void lru_gen_init_lruvec(struct lruvec *
324 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
325 + struct lru_gen_page *lrugen = &lruvec->lrugen;
327 lrugen->max_seq = MIN_NR_GENS + 1;
328 lrugen->enabled = lru_gen_enabled();
329 --- a/mm/workingset.c
330 +++ b/mm/workingset.c
331 @@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct pag
333 unsigned long min_seq;
334 struct lruvec *lruvec;
335 - struct lru_gen_struct *lrugen;
336 + struct lru_gen_page *lrugen;
337 int type = page_is_file_lru(page);
338 int delta = thp_nr_pages(page);
339 int refs = page_lru_refs(page);
340 @@ -252,7 +252,7 @@ static void lru_gen_refault(struct page
342 unsigned long min_seq;
343 struct lruvec *lruvec;
344 - struct lru_gen_struct *lrugen;
345 + struct lru_gen_page *lrugen;
346 struct mem_cgroup *memcg;
347 struct pglist_data *pgdat;
348 int type = page_is_file_lru(page);