mm: reclaim small amounts of memory when an external fragmentation event occurs

author Mel Gorman <mgorman@techsingularity.net>

Fri, 28 Dec 2018 08:35:52 +0000 (00:35 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 28 Dec 2018 20:11:48 +0000 (12:11 -0800)
author Mel Gorman <mgorman@techsingularity.net>
Fri, 28 Dec 2018 08:35:52 +0000 (00:35 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Dec 2018 20:11:48 +0000 (12:11 -0800)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt

index 7d73882e2c273c57d1277c701b71a1774b22763c..187ce4f599a267eed30f655b81580a89b45ce33c 100644 (file)
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -63,6 +63,7 @@ Currently, these files are in /proc/sys/vm:
  - swappiness
  - user_reserve_kbytes
  - vfs_cache_pressure
+- watermark_boost_factor
  - watermark_scale_factor
  - zone_reclaim_mode
  
@@ -856,6 +857,26 @@ ten times more freeable objects than there are.
  
  =============================================================
  
+watermark_boost_factor:
+
+This factor controls the level of reclaim when memory is being fragmented.
+It defines the percentage of the high watermark of a zone that will be
+reclaimed if pages of different mobility are being mixed within pageblocks.
+The intent is that compaction has less work to do in the future and to
+increase the success rate of future high-order allocations such as SLUB
+allocations, THP and hugetlbfs pages.
+
+To make it sensible with respect to the watermark_scale_factor parameter,
+the unit is in fractions of 10,000. The default value of 15,000 means
+that up to 150% of the high watermark will be reclaimed in the event of
+a pageblock being mixed due to fragmentation. The level of reclaim is
+determined by the number of fragmentation events that occurred in the
+recent past. If this value is smaller than a pageblock then a pageblock's
+worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor
+of 0 will disable the feature.
+
+=============================================================
+
  watermark_scale_factor:
  
  This factor controls the aggressiveness of kswapd. It defines the
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 1d2be4c2d34aa9d0d8b04581e392ad3479dceaef..031b2ce983f9d5322dfeefa725d7af513ea10048 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2256,6 +2256,7 @@ extern void zone_pcp_reset(struct zone *zone);
  
  /* page_alloc.c */
  extern int min_free_kbytes;
+extern int watermark_boost_factor;
  extern int watermark_scale_factor;
  
  /* nommu.c */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index dcf1b66a96ab918088cb8b370e3088365e161d69..5b4bfb90fb942045d92e12fbfd3c63790dbb54e5 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -269,10 +269,10 @@ enum zone_watermarks {
         NR_WMARK
  };
  
-#define min_wmark_pages(z) (z->_watermark[WMARK_MIN])
-#define low_wmark_pages(z) (z->_watermark[WMARK_LOW])
-#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH])
-#define wmark_pages(z, i) (z->_watermark[i])
+#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
+#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
+#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
  
  struct per_cpu_pages {
         int count;              /* number of pages in the list */
@@ -364,6 +364,7 @@ struct zone {
  
         /* zone watermarks, access with *_wmark_pages(zone) macros */
         unsigned long _watermark[NR_WMARK];
+       unsigned long watermark_boost;
  
         unsigned long nr_reserved_highatomic;
  
@@ -890,6 +891,8 @@ static inline int is_highmem(struct zone *zone)
  struct ctl_table;
  int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
                                         void __user *, size_t *, loff_t *);
+int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,
+                                       void __user *, size_t *, loff_t *);
  int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
                                         void __user *, size_t *, loff_t *);
  extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 5fc724e4e454c3304ecaebe7c868eb622f784eb8..1825f712e73bb74809e921503622dd5511b94923 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1462,6 +1462,14 @@ static struct ctl_table vm_table[] = {
                 .proc_handler   = min_free_kbytes_sysctl_handler,
                 .extra1         = &zero,
         },
+       {
+               .procname       = "watermark_boost_factor",
+               .data           = &watermark_boost_factor,
+               .maxlen         = sizeof(watermark_boost_factor),
+               .mode           = 0644,
+               .proc_handler   = watermark_boost_factor_sysctl_handler,
+               .extra1         = &zero,
+       },
         {
                 .procname       = "watermark_scale_factor",
                 .data           = &watermark_scale_factor,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 32b3e121a388b2a4263fd1053a18f07531b3db98..80373eca453dde248cbac5b7285b38197b6eb4fb 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -262,6 +262,7 @@ compound_page_dtor * const compound_page_dtors[] = {
  
  int min_free_kbytes = 1024;
  int user_min_free_kbytes = -1;
+int watermark_boost_factor __read_mostly = 15000;
  int watermark_scale_factor = 10;
  
  static unsigned long nr_kernel_pages __meminitdata;
@@ -2129,6 +2130,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
         return false;
  }
  
+static inline void boost_watermark(struct zone *zone)
+{
+       unsigned long max_boost;
+
+       if (!watermark_boost_factor)
+               return;
+
+       max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+                       watermark_boost_factor, 10000);
+       max_boost = max(pageblock_nr_pages, max_boost);
+
+       zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+               max_boost);
+}
+
  /*
   * This function implements actual steal behaviour. If order is large enough,
   * we can steal whole pageblock. If not, we first move freepages in this
@@ -2138,7 +2154,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
   * itself, so pages freed in the future will be put on the correct free list.
   */
  static void steal_suitable_fallback(struct zone *zone, struct page *page,
-                                       int start_type, bool whole_block)
+               unsigned int alloc_flags, int start_type, bool whole_block)
  {
         unsigned int current_order = page_order(page);
         struct free_area *area;
@@ -2160,6 +2176,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
                 goto single_page;
         }
  
+       /*
+        * Boost watermarks to increase reclaim pressure to reduce the
+        * likelihood of future fallbacks. Wake kswapd now as the node
+        * may be balanced overall and kswapd will not wake naturally.
+        */
+       boost_watermark(zone);
+       if (alloc_flags & ALLOC_KSWAPD)
+               wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+
         /* We are not allowed to try stealing from the whole block */
         if (!whole_block)
                 goto single_page;
@@ -2443,7 +2468,8 @@ do_steal:
         page = list_first_entry(&area->free_list[fallback_mt],
                                                         struct page, lru);
  
-       steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+       steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+                                                               can_steal);
  
         trace_mm_page_alloc_extfrag(page, order, current_order,
                 start_migratetype, fallback_mt);
@@ -7454,6 +7480,7 @@ static void __setup_per_zone_wmarks(void)
  
                 zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
                 zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+               zone->watermark_boost = 0;
  
                 spin_unlock_irqrestore(&zone->lock, flags);
         }
@@ -7554,6 +7581,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
         return 0;
  }
  
+int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
+       void __user *buffer, size_t *length, loff_t *ppos)
+{
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
  int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 24ab1f7394abaafa9e0dccac37597ae65ef77e0f..bd8971a29204d74514df39faa4fd47b417dddddd 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@ struct scan_control {
         /* Can pages be swapped as part of reclaim? */
         unsigned int may_swap:1;
  
+       /* e.g. boosted watermark reclaim leaves slabs alone */
+       unsigned int may_shrinkslab:1;
+
         /*
          * Cgroups are not reclaimed below their configured memory.low,
          * unless we threaten to OOM. If any cgroups are skipped due to
@@ -2756,8 +2759,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                         shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
                         node_lru_pages += lru_pages;
  
-                       shrink_slab(sc->gfp_mask, pgdat->node_id,
+                       if (sc->may_shrinkslab) {
+                               shrink_slab(sc->gfp_mask, pgdat->node_id,
                                     memcg, sc->priority);
+                       }
  
                         /* Record the group's reclaim efficiency */
                         vmpressure(sc->gfp_mask, memcg, false,
@@ -3239,6 +3244,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                 .may_writepage = !laptop_mode,
                 .may_unmap = 1,
                 .may_swap = 1,
+               .may_shrinkslab = 1,
         };
  
         /*
@@ -3283,6 +3289,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
                 .may_unmap = 1,
                 .reclaim_idx = MAX_NR_ZONES - 1,
                 .may_swap = !noswap,
+               .may_shrinkslab = 1,
         };
         unsigned long lru_pages;
  
@@ -3329,6 +3336,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                 .may_writepage = !laptop_mode,
                 .may_unmap = 1,
                 .may_swap = may_swap,
+               .may_shrinkslab = 1,
         };
  
         /*
@@ -3379,6 +3387,30 @@ static void age_active_anon(struct pglist_data *pgdat,
         } while (memcg);
  }
  
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+{
+       int i;
+       struct zone *zone;
+
+       /*
+        * Check for watermark boosts top-down as the higher zones
+        * are more likely to be boosted. Both watermarks and boosts
+        * should not be checked at the same time as reclaim would
+        * start prematurely when there is no boosting and a lower
+        * zone is balanced.
+        */
+       for (i = classzone_idx; i >= 0; i--) {
+               zone = pgdat->node_zones + i;
+               if (!managed_zone(zone))
+                       continue;
+
+               if (zone->watermark_boost)
+                       return true;
+       }
+
+       return false;
+}
+
  /*
   * Returns true if there is an eligible zone balanced for the request order
   * and classzone_idx
@@ -3389,6 +3421,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
         unsigned long mark = -1;
         struct zone *zone;
  
+       /*
+        * Check watermarks bottom-up as lower zones are more likely to
+        * meet watermarks.
+        */
         for (i = 0; i <= classzone_idx; i++) {
                 zone = pgdat->node_zones + i;
  
@@ -3517,14 +3553,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
         unsigned long nr_soft_reclaimed;
         unsigned long nr_soft_scanned;
         unsigned long pflags;
+       unsigned long nr_boost_reclaim;
+       unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
+       bool boosted;
         struct zone *zone;
         struct scan_control sc = {
                 .gfp_mask = GFP_KERNEL,
                 .order = order,
-               .priority = DEF_PRIORITY,
-               .may_writepage = !laptop_mode,
                 .may_unmap = 1,
-               .may_swap = 1,
         };
  
         psi_memstall_enter(&pflags);
@@ -3532,9 +3568,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
  
         count_vm_event(PAGEOUTRUN);
  
+       /*
+        * Account for the reclaim boost. Note that the zone boost is left in
+        * place so that parallel allocations that are near the watermark will
+        * stall or direct reclaim until kswapd is finished.
+        */
+       nr_boost_reclaim = 0;
+       for (i = 0; i <= classzone_idx; i++) {
+               zone = pgdat->node_zones + i;
+               if (!managed_zone(zone))
+                       continue;
+
+               nr_boost_reclaim += zone->watermark_boost;
+               zone_boosts[i] = zone->watermark_boost;
+       }
+       boosted = nr_boost_reclaim;
+
+restart:
+       sc.priority = DEF_PRIORITY;
         do {
                 unsigned long nr_reclaimed = sc.nr_reclaimed;
                 bool raise_priority = true;
+               bool balanced;
                 bool ret;
  
                 sc.reclaim_idx = classzone_idx;
@@ -3561,13 +3616,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 }
  
                 /*
-                * Only reclaim if there are no eligible zones. Note that
-                * sc.reclaim_idx is not used as buffer_heads_over_limit may
-                * have adjusted it.
+                * If the pgdat is imbalanced then ignore boosting and preserve
+                * the watermarks for a later time and restart. Note that the
+                * zone watermarks will still be reset at the end of balancing
+                * on the grounds that the normal reclaim should be enough to
+                * re-evaluate if boosting is required when kswapd next wakes.
+                */
+               balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+               if (!balanced && nr_boost_reclaim) {
+                       nr_boost_reclaim = 0;
+                       goto restart;
+               }
+
+               /*
+                * If boosting is not active then only reclaim if there are no
+                * eligible zones. Note that sc.reclaim_idx is not used as
+                * buffer_heads_over_limit may have adjusted it.
                  */
-               if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+               if (!nr_boost_reclaim && balanced)
                         goto out;
  
+               /* Limit the priority of boosting to avoid reclaim writeback */
+               if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
+                       raise_priority = false;
+
+               /*
+                * Do not writeback or swap pages for boosted reclaim. The
+                * intent is to relieve pressure not issue sub-optimal IO
+                * from reclaim context. If no pages are reclaimed, the
+                * reclaim will be aborted.
+                */
+               sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+               sc.may_swap = !nr_boost_reclaim;
+               sc.may_shrinkslab = !nr_boost_reclaim;
+
                 /*
                  * Do some background aging of the anon list, to give
                  * pages a chance to be referenced before reclaiming. All
@@ -3619,6 +3701,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * progress in reclaiming pages
                  */
                 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+               nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
+
+               /*
+                * If reclaim made no progress for a boost, stop reclaim as
+                * IO cannot be queued and it could be an infinite loop in
+                * extreme circumstances.
+                */
+               if (nr_boost_reclaim && !nr_reclaimed)
+                       break;
+
                 if (raise_priority || !nr_reclaimed)
                         sc.priority--;
         } while (sc.priority >= 1);
@@ -3627,6 +3719,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 pgdat->kswapd_failures++;
  
  out:
+       /* If reclaim was boosted, account for the reclaim done in this pass */
+       if (boosted) {
+               unsigned long flags;
+
+               for (i = 0; i <= classzone_idx; i++) {
+                       if (!zone_boosts[i])
+                               continue;
+
+                       /* Increments are under the zone lock */
+                       zone = pgdat->node_zones + i;
+                       spin_lock_irqsave(&zone->lock, flags);
+                       zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
+                       spin_unlock_irqrestore(&zone->lock, flags);
+               }
+
+               /*
+                * As there is now likely space, wake up kcompactd to defragment
+                * pageblocks.
+                */
+               wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+       }
+
         snapshot_refaults(NULL, pgdat);
         __fs_reclaim_release();
         psi_memstall_leave(&pflags);
@@ -3855,7 +3969,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
  
         /* Hopeless node, leave it to direct reclaim if possible */
         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
-           pgdat_balanced(pgdat, order, classzone_idx)) {
+           (pgdat_balanced(pgdat, order, classzone_idx) &&
+            !pgdat_watermark_boosted(pgdat, classzone_idx))) {
                 /*
                  * There may be plenty of free memory available, but it's too
                  * fragmented for high-order allocations.  Wake up kcompactd
author	Mel Gorman <mgorman@techsingularity.net>
	Fri, 28 Dec 2018 08:35:52 +0000 (00:35 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 28 Dec 2018 20:11:48 +0000 (12:11 -0800)
Documentation/sysctl/vm.txt		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/mmzone.h		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history