mm: page allocator: adjust the per-cpu counter threshold when memory is low

author Mel Gorman <mel@csn.ul.ie>

Thu, 13 Jan 2011 23:45:41 +0000 (15:45 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 14 Jan 2011 01:32:31 +0000 (17:32 -0800)
author Mel Gorman <mel@csn.ul.ie>
Thu, 13 Jan 2011 23:45:41 +0000 (15:45 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Jan 2011 01:32:31 +0000 (17:32 -0800)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 39c24ebe9cfd4e75b8841deec06aa6d78c91c2ad..48906629335c128adcfa86f3ce2527069b8abe67 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,12 +458,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
         return test_bit(ZONE_OOM_LOCKED, &zone->flags);
  }
  
-#ifdef CONFIG_SMP
-unsigned long zone_nr_free_pages(struct zone *zone);
-#else
-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
-#endif /* CONFIG_SMP */
-
  /*
   * The "priority" of VM scanning is how much of the queues we will scan in one
   * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -661,7 +655,9 @@ typedef struct pglist_data {
  extern struct mutex zonelists_mutex;
  void build_all_zonelists(void *data);
  void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+               int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
                 int classzone_idx, int alloc_flags);
  enum memmap_context {
         MEMMAP_EARLY,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h

index eaaea37b3b75dd64b73a34a0e3beb31417bdd0d6..e4cc21cf5870227f5710d3b6ca344f025a05c342 100644 (file)
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
  extern void __dec_zone_state(struct zone *, enum zone_stat_item);
  
  void refresh_cpu_vm_stats(int);
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
  #else /* CONFIG_SMP */
  
  /*
@@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
  #define dec_zone_page_state __dec_zone_page_state
  #define mod_zone_page_state __mod_zone_page_state
  
+static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+
  static inline void refresh_cpu_vm_stats(int cpu) { }
  #endif
  
diff --git a/mm/mmzone.c b/mm/mmzone.c

index e35bfb82c8555b7377334dbea42bfcf588b0bab8..f5b7d1760213e53db3c46e84dde56daf219ea0cd 100644 (file)
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
         return 1;
  }
  #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
-       unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
-       /*
-        * While kswapd is awake, it is considered the zone is under some
-        * memory pressure. Under pressure, there is a risk that
-        * per-cpu-counter-drift will allow the min watermark to be breached
-        * potentially causing a live-lock. While kswapd is awake and
-        * free pages are low, get a better estimate for free pages
-        */
-       if (nr_free_pages < zone->percpu_drift_mark &&
-                       !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
-               return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
-       return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 826ba6922e84a24b53a3c4999345f85a4f3dea02..22a1bb7723e412814961d0053bad647579fb3248 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  #endif /* CONFIG_FAIL_PAGE_ALLOC */
  
  /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
   * of the allocation.
   */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-                     int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags, long free_pages)
  {
         /* free_pages my go negative - that's OK */
         long min = mark;
-       long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
         int o;
  
+       free_pages -= (1 << order) - 1; /* left after alloc, +1 for '<=' */
         if (alloc_flags & ALLOC_HIGH)
                 min -= min / 2;
         if (alloc_flags & ALLOC_HARDER)
                 min -= min / 4;
  
         if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-               return 0;
+               return false;
         for (o = 0; o < order; o++) {
                 /* At the next order, this order's pages become unavailable */
                 free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                 min >>= 1;
  
                 if (free_pages <= min)
-                       return 0;
+                       return false;
         }
-       return 1;
+       return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                       zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+       if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+               free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                                               free_pages);
  }
  
  #ifdef CONFIG_NUMA
@@ -2442,7 +2461,7 @@ void show_free_areas(void)
                         " all_unreclaimable? %s"
                         "\n",
                         zone->name,
-                       K(zone_nr_free_pages(zone)),
+                       K(zone_page_state(zone, NR_FREE_PAGES)),
                         K(min_wmark_pages(zone)),
                         K(low_wmark_pages(zone)),
                         K(high_wmark_pages(zone)),
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 9ca587c692748adbc715b440ff5aa8c89357c511..5da4295e7d672e8804151773e184d65fde1ddd67 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2143,7 +2143,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
                 if (zone->all_unreclaimable)
                         continue;
  
-               if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+               if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
                                                                 0, 0))
                         return 1;
         }
@@ -2230,7 +2230,7 @@ loop_again:
                                 shrink_active_list(SWAP_CLUSTER_MAX, zone,
                                                         &sc, priority, 0);
  
-                       if (!zone_watermark_ok(zone, order,
+                       if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone), 0, 0)) {
                                 end_zone = i;
                                 break;
@@ -2276,7 +2276,7 @@ loop_again:
                          * We put equal pressure on every zone, unless one
                          * zone has way too many pages free already.
                          */
-                       if (!zone_watermark_ok(zone, order,
+                       if (!zone_watermark_ok_safe(zone, order,
                                         8*high_wmark_pages(zone), end_zone, 0))
                                 shrink_zone(priority, zone, &sc);
                         reclaim_state->reclaimed_slab = 0;
@@ -2297,7 +2297,7 @@ loop_again:
                             total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                 sc.may_writepage = 1;
  
-                       if (!zone_watermark_ok(zone, order,
+                       if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone), end_zone, 0)) {
                                 all_zones_ok = 0;
                                 /*
@@ -2305,7 +2305,7 @@ loop_again:
                                  * means that we have a GFP_ATOMIC allocation
                                  * failure risk. Hurry up!
                                  */
-                               if (!zone_watermark_ok(zone, order,
+                               if (!zone_watermark_ok_safe(zone, order,
                                             min_wmark_pages(zone), end_zone, 0))
                                         has_under_min_watermark_zone = 1;
                         } else {
@@ -2448,7 +2448,9 @@ static int kswapd(void *p)
                                  */
                                 if (!sleeping_prematurely(pgdat, order, remaining)) {
                                         trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+                                       restore_pgdat_percpu_threshold(pgdat);
                                         schedule();
+                                       reduce_pgdat_percpu_threshold(pgdat);
                                 } else {
                                         if (remaining)
                                                 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
@@ -2487,16 +2489,17 @@ void wakeup_kswapd(struct zone *zone, int order)
         if (!populated_zone(zone))
                 return;
  
-       pgdat = zone->zone_pgdat;
-       if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                 return;
+       pgdat = zone->zone_pgdat;
         if (pgdat->kswapd_max_order < order)
                 pgdat->kswapd_max_order = order;
-       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
-       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-               return;
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
+       if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+               return;
+
+       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
         wake_up_interruptible(&pgdat->kswapd_wait);
  }
  
diff --git a/mm/vmstat.c b/mm/vmstat.c

index 312d728976f1661c4fa335a1ead46356b3bf091b..bc0f095791b4ff026279c54cf667f50da410ad05 100644 (file)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,6 +83,30 @@ EXPORT_SYMBOL(vm_stat);
  
  #ifdef CONFIG_SMP
  
+static int calculate_pressure_threshold(struct zone *zone)
+{
+       int threshold;
+       int watermark_distance;
+
+       /*
+        * As vmstats are not up to date, there is drift between the estimated
+        * and real values. For high thresholds and a high number of CPUs, it
+        * is possible for the min watermark to be breached while the estimated
+        * value looks fine. The pressure threshold is a reduced value such
+        * that even the maximum amount of drift will not accidentally breach
+        * the min watermark
+        */
+       watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+       threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+       /*
+        * Maximum threshold is 125
+        */
+       threshold = min(125, threshold);
+
+       return threshold;
+}
+
  static int calculate_threshold(struct zone *zone)
  {
         int threshold;
@@ -161,6 +185,48 @@ static void refresh_zone_stat_thresholds(void)
         }
  }
  
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+       struct zone *zone;
+       int cpu;
+       int threshold;
+       int i;
+
+       get_online_cpus();
+       for (i = 0; i < pgdat->nr_zones; i++) {
+               zone = &pgdat->node_zones[i];
+               if (!zone->percpu_drift_mark)
+                       continue;
+
+               threshold = calculate_pressure_threshold(zone);
+               for_each_online_cpu(cpu)
+                       per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+                                                       = threshold;
+       }
+       put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+       struct zone *zone;
+       int cpu;
+       int threshold;
+       int i;
+
+       get_online_cpus();
+       for (i = 0; i < pgdat->nr_zones; i++) {
+               zone = &pgdat->node_zones[i];
+               if (!zone->percpu_drift_mark)
+                       continue;
+
+               threshold = calculate_threshold(zone);
+               for_each_online_cpu(cpu)
+                       per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+                                                       = threshold;
+       }
+       put_online_cpus();
+}
+
  /*
   * For use when we know that interrupts are disabled.
   */
@@ -911,7 +977,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                    "\n        scanned  %lu"
                    "\n        spanned  %lu"
                    "\n        present  %lu",
-                  zone_nr_free_pages(zone),
+                  zone_page_state(zone, NR_FREE_PAGES),
                    min_wmark_pages(zone),
                    low_wmark_pages(zone),
                    high_wmark_pages(zone),
author	Mel Gorman <mel@csn.ul.ie>
	Thu, 13 Jan 2011 23:45:41 +0000 (15:45 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 14 Jan 2011 01:32:31 +0000 (17:32 -0800)
include/linux/mmzone.h		patch \| blob \| history
include/linux/vmstat.h		patch \| blob \| history
mm/mmzone.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history
mm/vmstat.c		patch \| blob \| history