mm/vmscan: don't change pgdat state on base of a single LRU list state

author Andrey Ryabinin <aryabinin@virtuozzo.com>

Tue, 10 Apr 2018 23:27:59 +0000 (16:27 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 11 Apr 2018 17:28:30 +0000 (10:28 -0700)
author Andrey Ryabinin <aryabinin@virtuozzo.com>
Tue, 10 Apr 2018 23:27:59 +0000 (16:27 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 11 Apr 2018 17:28:30 +0000 (10:28 -0700)
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 403f59edd53e9d58a59783be96d6c42627ca46a2..1ecc648b619160578951f62f9a36315f8d8783b2 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -116,6 +116,16 @@ struct scan_control {
  
         /* Number of pages freed so far during a call to shrink_zones() */
         unsigned long nr_reclaimed;
+
+       struct {
+               unsigned int dirty;
+               unsigned int unqueued_dirty;
+               unsigned int congested;
+               unsigned int writeback;
+               unsigned int immediate;
+               unsigned int file_taken;
+               unsigned int taken;
+       } nr;
  };
  
  #ifdef ARCH_HAS_PREFETCH
@@ -1754,23 +1764,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         mem_cgroup_uncharge_list(&page_list);
         free_unref_page_list(&page_list);
  
-       /*
-        * If reclaim is isolating dirty pages under writeback, it implies
-        * that the long-lived page allocation rate is exceeding the page
-        * laundering rate. Either the global limits are not being effective
-        * at throttling processes due to the page distribution throughout
-        * zones or there is heavy usage of a slow backing device. The
-        * only option is to throttle from reclaim context which is not ideal
-        * as there is no guarantee the dirtying process is throttled in the
-        * same way balance_dirty_pages() manages.
-        *
-        * Once a node is flagged PGDAT_WRITEBACK, kswapd will count the number
-        * of pages under pages flagged for immediate reclaim and stall if any
-        * are encountered in the nr_immediate check below.
-        */
-       if (stat.nr_writeback && stat.nr_writeback == nr_taken)
-               set_bit(PGDAT_WRITEBACK, &pgdat->flags);
-
         /*
          * If dirty pages are scanned that are not queued for IO, it
          * implies that flushers are not doing their job. This can
@@ -1785,40 +1778,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         if (stat.nr_unqueued_dirty == nr_taken)
                 wakeup_flusher_threads(WB_REASON_VMSCAN);
  
-       /*
-        * Legacy memcg will stall in page writeback so avoid forcibly
-        * stalling here.
-        */
-       if (sane_reclaim(sc)) {
-               /*
-                * Tag a node as congested if all the dirty pages scanned were
-                * backed by a congested BDI and wait_iff_congested will stall.
-                */
-               if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
-                       set_bit(PGDAT_CONGESTED, &pgdat->flags);
-
-               /* Allow kswapd to start writing pages during reclaim. */
-               if (stat.nr_unqueued_dirty == nr_taken)
-                       set_bit(PGDAT_DIRTY, &pgdat->flags);
-
-               /*
-                * If kswapd scans pages marked marked for immediate
-                * reclaim and under writeback (nr_immediate), it implies
-                * that pages are cycling through the LRU faster than
-                * they are written so also forcibly stall.
-                */
-               if (stat.nr_immediate)
-                       congestion_wait(BLK_RW_ASYNC, HZ/10);
-       }
-
-       /*
-        * Stall direct reclaim for IO completions if underlying BDIs and node
-        * is congested. Allow kswapd to continue until it starts encountering
-        * unqueued dirty pages or cycling through the LRU too quickly.
-        */
-       if (!sc->hibernation_mode && !current_is_kswapd() &&
-           current_may_throttle())
-               wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
+       sc->nr.dirty += stat.nr_dirty;
+       sc->nr.congested += stat.nr_congested;
+       sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+       sc->nr.writeback += stat.nr_writeback;
+       sc->nr.immediate += stat.nr_immediate;
+       sc->nr.taken += nr_taken;
+       if (file)
+               sc->nr.file_taken += nr_taken;
  
         trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
                         nr_scanned, nr_reclaimed,
@@ -2522,6 +2489,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                 unsigned long node_lru_pages = 0;
                 struct mem_cgroup *memcg;
  
+               memset(&sc->nr, 0, sizeof(sc->nr));
+
                 nr_reclaimed = sc->nr_reclaimed;
                 nr_scanned = sc->nr_scanned;
  
@@ -2587,6 +2556,61 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                 if (sc->nr_reclaimed - nr_reclaimed)
                         reclaimable = true;
  
+               /*
+                * If reclaim is isolating dirty pages under writeback, it
+                * implies that the long-lived page allocation rate is exceeding
+                * the page laundering rate. Either the global limits are not
+                * being effective at throttling processes due to the page
+                * distribution throughout zones or there is heavy usage of a
+                * slow backing device. The only option is to throttle from
+                * reclaim context which is not ideal as there is no guarantee
+                * the dirtying process is throttled in the same way
+                * balance_dirty_pages() manages.
+                *
+                * Once a node is flagged PGDAT_WRITEBACK, kswapd will count the
+                * number of writeback pages flagged for immediate reclaim and
+                * stall if any are encountered in the nr_immediate check below.
+                */
+               if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+                       set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+
+               /*
+                * Legacy memcg will stall in page writeback so avoid forcibly
+                * stalling here.
+                */
+               if (sane_reclaim(sc)) {
+                       /*
+                        * Tag a node as congested if all the dirty pages
+                        * scanned were backed by a congested BDI and
+                        * wait_iff_congested will stall.
+                        */
+                       if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+                               set_bit(PGDAT_CONGESTED, &pgdat->flags);
+
+                       /* Allow kswapd to start writing pages during reclaim.*/
+                       if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+                               set_bit(PGDAT_DIRTY, &pgdat->flags);
+
+                       /*
+                        * If kswapd scans pages marked for immediate
+                        * reclaim and under writeback (nr_immediate), it
+                        * implies that pages are cycling through the LRU
+                        * faster than they are written so also forcibly stall.
+                        */
+                       if (sc->nr.immediate)
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
+               }
+
+               /*
+                * Stall direct reclaim for IO completions if underlying BDIs
+                * and node is congested. Allow kswapd to continue until it
+                * starts encountering unqueued dirty pages or cycling through
+                * the LRU too quickly.
+                */
+               if (!sc->hibernation_mode && !current_is_kswapd() &&
+                   current_may_throttle())
+                       wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
+
         } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                          sc->nr_scanned - nr_scanned, sc));
author	Andrey Ryabinin <aryabinin@virtuozzo.com>
	Tue, 10 Apr 2018 23:27:59 +0000 (16:27 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 11 Apr 2018 17:28:30 +0000 (10:28 -0700)