From: Kazuki Hashimoto Date: Wed, 24 May 2023 14:58:03 +0000 (+0900) Subject: kernel: Backport MGLRU patch from 6.4 X-Git-Url: http://git.lede-project.org./?a=commitdiff_plain;h=ae8db3941ce90f4b00dfeb36a428425cdc53144d;p=openwrt%2Fstaging%2Fnbd.git kernel: Backport MGLRU patch from 6.4 This fixes a bug where the reclaim path could occasionally have long tail latency. Signed-off-by: Kazuki Hashimoto --- diff --git a/target/linux/generic/backport-5.15/021-v6.4-mm-Multi-gen-LRU-remove-wait_event_killable.patch b/target/linux/generic/backport-5.15/021-v6.4-mm-Multi-gen-LRU-remove-wait_event_killable.patch new file mode 100644 index 0000000000..e154494a51 --- /dev/null +++ b/target/linux/generic/backport-5.15/021-v6.4-mm-Multi-gen-LRU-remove-wait_event_killable.patch @@ -0,0 +1,266 @@ +From 087ed25eaf5a78a678508e893f80addab9b1c103 Mon Sep 17 00:00:00 2001 +From: Kalesh Singh +Date: Thu, 13 Apr 2023 14:43:26 -0700 +Subject: [PATCH] mm: Multi-gen LRU: remove wait_event_killable() + +Android 14 and later default to MGLRU [1] and field telemetry showed +occasional long tail latency (>100ms) in the reclaim path. + +Tracing revealed priority inversion in the reclaim path. In +try_to_inc_max_seq(), when high priority tasks were blocked on +wait_event_killable(), the preemption of the low priority task to call +wake_up_all() caused those high priority tasks to wait longer than +necessary. In general, this problem is not different from others of its +kind, e.g., one caused by mutex_lock(). However, it is specific to MGLRU +because it introduced the new wait queue lruvec->mm_state.wait. + +The purpose of this new wait queue is to avoid the thundering herd +problem. If many direct reclaimers rush into try_to_inc_max_seq(), only +one can succeed, i.e., the one to wake up the rest, and the rest who +failed might cause premature OOM kills if they do not wait. So far there +is no evidence supporting this scenario, based on how often the wait has +been hit. And this begs the question how useful the wait queue is in +practice. + +Based on Minchan's recommendation, which is in line with his commit +6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the +rest of the MGLRU code which also uses trylock when possible, remove the +wait queue. 
+ +[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf + +Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com +Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") +Change-Id: I911f3968fd1adb25171279cc5b6f48ccb7efc8de +Signed-off-by: Kalesh Singh +Suggested-by: Minchan Kim +Reported-by: Wei Wang +Acked-by: Yu Zhao +Cc: Minchan Kim +Cc: Jan Alexander Steffens (heftig) +Cc: Oleksandr Natalenko +Cc: Suleiman Souhlal +Cc: Suren Baghdasaryan +Signed-off-by: Andrew Morton +--- + include/linux/mmzone.h | 8 +-- + mm/vmscan.c | 111 +++++++++++++++-------------------------- + 2 files changed, 42 insertions(+), 77 deletions(-) + +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -452,18 +452,14 @@ enum { + struct lru_gen_mm_state { + /* set to max_seq after each iteration */ + unsigned long seq; +- /* where the current iteration continues (inclusive) */ ++ /* where the current iteration continues after */ + struct list_head *head; +- /* where the last iteration ended (exclusive) */ ++ /* where the last iteration ended before */ + struct list_head *tail; +- /* to wait for the last page table walker to finish */ +- struct wait_queue_head wait; + /* Bloom filters flip after each iteration */ + unsigned long *filters[NR_BLOOM_FILTERS]; + /* the mm stats for debugging */ + unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; +- /* the number of concurrent page table walkers */ +- int nr_walkers; + }; + + struct lru_gen_mm_walk { +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2999,18 +2999,13 @@ void lru_gen_del_mm(struct mm_struct *mm + if (!lruvec) + continue; + +- /* where the last iteration ended (exclusive) */ ++ /* where the current iteration continues after */ ++ if (lruvec->mm_state.head == &mm->lru_gen.list) ++ lruvec->mm_state.head = lruvec->mm_state.head->prev; ++ ++ /* where the last iteration ended before */ + if (lruvec->mm_state.tail == &mm->lru_gen.list) + lruvec->mm_state.tail = lruvec->mm_state.tail->next; +- +- /* where the current iteration continues (inclusive) */ +- if (lruvec->mm_state.head != &mm->lru_gen.list) +- continue; +- +- lruvec->mm_state.head = lruvec->mm_state.head->next; +- /* the deletion ends the current iteration */ +- if (lruvec->mm_state.head == &mm_list->fifo) +- WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); + } + + list_del_init(&mm->lru_gen.list); +@@ -3194,68 +3189,54 @@ static bool iterate_mm_list(struct lruve + struct mm_struct **iter) + { + bool first = false; +- bool last = true; ++ bool last = false; + struct mm_struct *mm = NULL; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + /* +- * There are four interesting cases for this page table walker: +- * 1. It tries to start a new iteration of mm_list with a stale max_seq; +- * there is nothing left to do. +- * 2. It's the first of the current generation, and it needs to reset +- * the Bloom filter for the next generation. +- * 3. It reaches the end of mm_list, and it needs to increment +- * mm_state->seq; the iteration is done. +- * 4. It's the last of the current generation, and it needs to reset the +- * mm stats counters for the next generation. ++ * mm_state->seq is incremented after each iteration of mm_list. There ++ * are three interesting cases for this page table walker: ++ * 1. It tries to start a new iteration with a stale max_seq: there is ++ * nothing left to do. ++ * 2. 
It started the next iteration: it needs to reset the Bloom filter ++ * so that a fresh set of PTE tables can be recorded. ++ * 3. It ended the current iteration: it needs to reset the mm stats ++ * counters and tell its caller to increment max_seq. + */ + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); +- VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); +- VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); + +- if (walk->max_seq <= mm_state->seq) { +- if (!*iter) +- last = false; ++ if (walk->max_seq <= mm_state->seq) + goto done; +- } + +- if (!mm_state->nr_walkers) { +- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); ++ if (!mm_state->head) ++ mm_state->head = &mm_list->fifo; + +- mm_state->head = mm_list->fifo.next; ++ if (mm_state->head == &mm_list->fifo) + first = true; +- } +- +- while (!mm && mm_state->head != &mm_list->fifo) { +- mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + ++ do { + mm_state->head = mm_state->head->next; ++ if (mm_state->head == &mm_list->fifo) { ++ WRITE_ONCE(mm_state->seq, mm_state->seq + 1); ++ last = true; ++ break; ++ } + + /* force scan for those added after the last iteration */ +- if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { +- mm_state->tail = mm_state->head; ++ if (!mm_state->tail || mm_state->tail == mm_state->head) { ++ mm_state->tail = mm_state->head->next; + walk->force_scan = true; + } + ++ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + if (should_skip_mm(mm, walk)) + mm = NULL; +- } +- +- if (mm_state->head == &mm_list->fifo) +- WRITE_ONCE(mm_state->seq, mm_state->seq + 1); ++ } while (!mm); + done: +- if (*iter && !mm) +- mm_state->nr_walkers--; +- if (!*iter && mm) +- mm_state->nr_walkers++; +- +- if (mm_state->nr_walkers) +- last = false; +- + if (*iter || last) + reset_mm_stats(lruvec, walk, last); + +@@ -3283,9 +3264,9 @@ static bool iterate_mm_list_nowalk(struc + + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + +- if (max_seq > mm_state->seq && !mm_state->nr_walkers) { +- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); +- ++ if (max_seq > mm_state->seq) { ++ mm_state->head = NULL; ++ mm_state->tail = NULL; + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + reset_mm_stats(lruvec, NULL, true); + success = true; +@@ -3894,10 +3875,6 @@ restart: + + walk_pmd_range(&val, addr, next, args); + +- /* a racy check to curtail the waiting time */ +- if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) +- return 1; +- + if (need_resched() || walk->batched >= MAX_LRU_BATCH) { + end = (addr | ~PUD_MASK) + 1; + goto done; +@@ -3930,8 +3907,14 @@ static void walk_mm(struct lruvec *lruve + walk->next_addr = FIRST_USER_ADDRESS; + + do { ++ DEFINE_MAX_SEQ(lruvec); ++ + err = -EBUSY; + ++ /* another thread might have called inc_max_seq() */ ++ if (walk->max_seq != max_seq) ++ break; ++ + /* page_update_gen() requires stable page_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + break; +@@ -4164,25 +4147,12 @@ static bool try_to_inc_max_seq(struct lr + success = iterate_mm_list(lruvec, walk, &mm); + if (mm) + walk_mm(lruvec, mm, walk); +- +- cond_resched(); + } while (mm); + done: +- if (!success) { +- if (sc->priority <= DEF_PRIORITY - 2) +- wait_event_killable(lruvec->mm_state.wait, +- max_seq < READ_ONCE(lrugen->max_seq)); +- return false; +- } ++ if (success) ++ inc_max_seq(lruvec, can_swap, force_scan); + +- VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); +- +- inc_max_seq(lruvec, can_swap, force_scan); +- /* 
either this sees any waiters or they will see updated max_seq */ +- if (wq_has_sleeper(&lruvec->mm_state.wait)) +- wake_up_all(&lruvec->mm_state.wait); +- +- return true; ++ return success; + } + + static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) +@@ -5746,7 +5716,6 @@ void lru_gen_init_lruvec(struct lruvec * + INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; +- init_waitqueue_head(&lruvec->mm_state.wait); + } + + #ifdef CONFIG_MEMCG diff --git a/target/linux/generic/backport-6.1/020-v6.4-19-mm-Multi-gen-LRU-remove-wait_event_killable.patch b/target/linux/generic/backport-6.1/020-v6.4-19-mm-Multi-gen-LRU-remove-wait_event_killable.patch new file mode 100644 index 0000000000..4bf796c93d --- /dev/null +++ b/target/linux/generic/backport-6.1/020-v6.4-19-mm-Multi-gen-LRU-remove-wait_event_killable.patch @@ -0,0 +1,280 @@ +From 418038c22452df38cde519cc8c662bb15139764a Mon Sep 17 00:00:00 2001 +From: Kalesh Singh +Date: Thu, 13 Apr 2023 14:43:26 -0700 +Subject: [PATCH 19/19] mm: Multi-gen LRU: remove wait_event_killable() + +Android 14 and later default to MGLRU [1] and field telemetry showed +occasional long tail latency (>100ms) in the reclaim path. + +Tracing revealed priority inversion in the reclaim path. In +try_to_inc_max_seq(), when high priority tasks were blocked on +wait_event_killable(), the preemption of the low priority task to call +wake_up_all() caused those high priority tasks to wait longer than +necessary. In general, this problem is not different from others of its +kind, e.g., one caused by mutex_lock(). However, it is specific to MGLRU +because it introduced the new wait queue lruvec->mm_state.wait. + +The purpose of this new wait queue is to avoid the thundering herd +problem. If many direct reclaimers rush into try_to_inc_max_seq(), only +one can succeed, i.e., the one to wake up the rest, and the rest who +failed might cause premature OOM kills if they do not wait. So far there +is no evidence supporting this scenario, based on how often the wait has +been hit. And this begs the question how useful the wait queue is in +practice. + +Based on Minchan's recommendation, which is in line with his commit +6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the +rest of the MGLRU code which also uses trylock when possible, remove the +wait queue. 
+ +[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf + +Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com +Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") +Signed-off-by: Kalesh Singh +Suggested-by: Minchan Kim +Reported-by: Wei Wang +Acked-by: Yu Zhao +Cc: Minchan Kim +Cc: Jan Alexander Steffens (heftig) +Cc: Oleksandr Natalenko +Cc: Suleiman Souhlal +Cc: Suren Baghdasaryan +Signed-off-by: Andrew Morton +--- + include/linux/mmzone.h | 8 +-- + mm/vmscan.c | 112 +++++++++++++++-------------------------- + 2 files changed, 42 insertions(+), 78 deletions(-) + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 403c7461e7a70..d62a5accf1be4 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -453,18 +453,14 @@ enum { + struct lru_gen_mm_state { + /* set to max_seq after each iteration */ + unsigned long seq; +- /* where the current iteration continues (inclusive) */ ++ /* where the current iteration continues after */ + struct list_head *head; +- /* where the last iteration ended (exclusive) */ ++ /* where the last iteration ended before */ + struct list_head *tail; +- /* to wait for the last page table walker to finish */ +- struct wait_queue_head wait; + /* Bloom filters flip after each iteration */ + unsigned long *filters[NR_BLOOM_FILTERS]; + /* the mm stats for debugging */ + unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; +- /* the number of concurrent page table walkers */ +- int nr_walkers; + }; + + struct lru_gen_mm_walk { +diff --git a/mm/vmscan.c b/mm/vmscan.c +index f6ce7a1fd78a3..851758303dbf4 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3371,18 +3371,13 @@ void lru_gen_del_mm(struct mm_struct *mm) + if (!lruvec) + continue; + +- /* where the last iteration ended (exclusive) */ ++ /* where the current iteration continues after */ ++ if (lruvec->mm_state.head == &mm->lru_gen.list) ++ lruvec->mm_state.head = lruvec->mm_state.head->prev; ++ ++ /* where the last iteration ended before */ + if (lruvec->mm_state.tail == &mm->lru_gen.list) + lruvec->mm_state.tail = lruvec->mm_state.tail->next; +- +- /* where the current iteration continues (inclusive) */ +- if (lruvec->mm_state.head != &mm->lru_gen.list) +- continue; +- +- lruvec->mm_state.head = lruvec->mm_state.head->next; +- /* the deletion ends the current iteration */ +- if (lruvec->mm_state.head == &mm_list->fifo) +- WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); + } + + list_del_init(&mm->lru_gen.list); +@@ -3478,68 +3473,54 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, + struct mm_struct **iter) + { + bool first = false; +- bool last = true; ++ bool last = false; + struct mm_struct *mm = NULL; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + /* +- * There are four interesting cases for this page table walker: +- * 1. It tries to start a new iteration of mm_list with a stale max_seq; +- * there is nothing left to do. +- * 2. It's the first of the current generation, and it needs to reset +- * the Bloom filter for the next generation. +- * 3. It reaches the end of mm_list, and it needs to increment +- * mm_state->seq; the iteration is done. +- * 4. It's the last of the current generation, and it needs to reset the +- * mm stats counters for the next generation. ++ * mm_state->seq is incremented after each iteration of mm_list. 
There ++ * are three interesting cases for this page table walker: ++ * 1. It tries to start a new iteration with a stale max_seq: there is ++ * nothing left to do. ++ * 2. It started the next iteration: it needs to reset the Bloom filter ++ * so that a fresh set of PTE tables can be recorded. ++ * 3. It ended the current iteration: it needs to reset the mm stats ++ * counters and tell its caller to increment max_seq. + */ + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); +- VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); +- VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); + +- if (walk->max_seq <= mm_state->seq) { +- if (!*iter) +- last = false; ++ if (walk->max_seq <= mm_state->seq) + goto done; +- } + +- if (!mm_state->nr_walkers) { +- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); ++ if (!mm_state->head) ++ mm_state->head = &mm_list->fifo; + +- mm_state->head = mm_list->fifo.next; ++ if (mm_state->head == &mm_list->fifo) + first = true; +- } +- +- while (!mm && mm_state->head != &mm_list->fifo) { +- mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + ++ do { + mm_state->head = mm_state->head->next; ++ if (mm_state->head == &mm_list->fifo) { ++ WRITE_ONCE(mm_state->seq, mm_state->seq + 1); ++ last = true; ++ break; ++ } + + /* force scan for those added after the last iteration */ +- if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { +- mm_state->tail = mm_state->head; ++ if (!mm_state->tail || mm_state->tail == mm_state->head) { ++ mm_state->tail = mm_state->head->next; + walk->force_scan = true; + } + ++ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + if (should_skip_mm(mm, walk)) + mm = NULL; +- } +- +- if (mm_state->head == &mm_list->fifo) +- WRITE_ONCE(mm_state->seq, mm_state->seq + 1); ++ } while (!mm); + done: +- if (*iter && !mm) +- mm_state->nr_walkers--; +- if (!*iter && mm) +- mm_state->nr_walkers++; +- +- if (mm_state->nr_walkers) +- last = false; +- + if (*iter || last) + reset_mm_stats(lruvec, walk, last); + +@@ -3567,9 +3548,9 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) + + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + +- if (max_seq > mm_state->seq && !mm_state->nr_walkers) { +- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); +- ++ if (max_seq > mm_state->seq) { ++ mm_state->head = NULL; ++ mm_state->tail = NULL; + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + reset_mm_stats(lruvec, NULL, true); + success = true; +@@ -4172,10 +4153,6 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, + + walk_pmd_range(&val, addr, next, args); + +- /* a racy check to curtail the waiting time */ +- if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) +- return 1; +- + if (need_resched() || walk->batched >= MAX_LRU_BATCH) { + end = (addr | ~PUD_MASK) + 1; + goto done; +@@ -4208,8 +4185,14 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ + walk->next_addr = FIRST_USER_ADDRESS; + + do { ++ DEFINE_MAX_SEQ(lruvec); ++ + err = -EBUSY; + ++ /* another thread might have called inc_max_seq() */ ++ if (walk->max_seq != max_seq) ++ break; ++ + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + break; +@@ -4442,25 +4425,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + success = iterate_mm_list(lruvec, walk, &mm); + if (mm) + walk_mm(lruvec, mm, walk); +- +- cond_resched(); + } while 
(mm); + done: +- if (!success) { +- if (sc->priority <= DEF_PRIORITY - 2) +- wait_event_killable(lruvec->mm_state.wait, +- max_seq < READ_ONCE(lrugen->max_seq)); +- return false; +- } ++ if (success) ++ inc_max_seq(lruvec, can_swap, force_scan); + +- VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); +- +- inc_max_seq(lruvec, can_swap, force_scan); +- /* either this sees any waiters or they will see updated max_seq */ +- if (wq_has_sleeper(&lruvec->mm_state.wait)) +- wake_up_all(&lruvec->mm_state.wait); +- +- return true; ++ return success; + } + + /****************************************************************************** +@@ -6105,7 +6075,6 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; +- init_waitqueue_head(&lruvec->mm_state.wait); + } + + #ifdef CONFIG_MEMCG +@@ -6138,7 +6107,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + +- VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); + +-- +2.40.1 +
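
The core of the backported change is easy to lose in the diff noise: try_to_inc_max_seq() used to make losing direct reclaimers sleep on lruvec->mm_state.wait until the winner called wake_up_all(), which let a preempted low-priority winner stall high-priority waiters; after the patch, a walker that loses the race simply returns and its caller carries on with the current generation. Below is a minimal userspace sketch of that "try, and if you lose, fall back without blocking" pattern. It is an analogy only, not kernel code: the names (try_to_advance_seq, reclaimer, walk_lock) are invented for illustration, and pthreads stand in for the MGLRU walker machinery.

```c
/*
 * Userspace analogy for the post-patch behaviour of try_to_inc_max_seq():
 * several "reclaimer" threads race to advance a shared sequence; exactly
 * one does the work, and the losers return to their caller instead of
 * sleeping on a wait queue. All identifiers here are illustrative.
 *
 * Build: cc -O2 -pthread seq_race.c -o seq_race
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong max_seq;                 /* stands in for lrugen->max_seq */
static pthread_mutex_t walk_lock = PTHREAD_MUTEX_INITIALIZER;

/* Try to advance the sequence past the value this thread observed.
 * Return false if another thread holds the lock or already advanced it. */
static bool try_to_advance_seq(unsigned long seen_seq)
{
	bool success = false;

	/* trylock, in the spirit of the rest of MGLRU: never sleep behind
	 * a lower-priority walker (this is what removing the wait queue buys) */
	if (pthread_mutex_trylock(&walk_lock))
		return false;

	if (atomic_load(&max_seq) == seen_seq) {
		/* ...the expensive mm_list / page table walk would go here... */
		atomic_fetch_add(&max_seq, 1);
		success = true;
	}

	pthread_mutex_unlock(&walk_lock);
	return success;
}

static void *reclaimer(void *arg)
{
	unsigned long id = (unsigned long)(size_t)arg;
	unsigned long seen = atomic_load(&max_seq);

	if (try_to_advance_seq(seen))
		printf("reclaimer %lu advanced the sequence\n", id);
	else
		/* pre-patch code would wait_event_killable() here; post-patch
		 * code just returns false and the caller falls back */
		printf("reclaimer %lu fell back without blocking\n", id);

	return NULL;
}

int main(void)
{
	pthread_t tids[4];

	for (size_t i = 0; i < 4; i++)
		pthread_create(&tids[i], NULL, reclaimer, (void *)i);
	for (size_t i = 0; i < 4; i++)
		pthread_join(tids[i], NULL);

	printf("final seq: %lu\n", atomic_load(&max_seq));
	return 0;
}
```

Running it shows one thread advancing the sequence while the others report falling back immediately, which mirrors why the patch removes the long tail latency: a blocked high-priority reclaimer no longer depends on a preemptible low-priority one to wake it.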