per-zone and reclaim enhancements for memory controller: modifies vmscan.c for isolat...

author KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Thu, 7 Feb 2008 08:14:37 +0000 (00:14 -0800)

committer Linus Torvalds <torvalds@woody.linux-foundation.org>

Thu, 7 Feb 2008 16:42:22 +0000 (08:42 -0800)
author KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Thu, 7 Feb 2008 08:14:37 +0000 (00:14 -0800)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Thu, 7 Feb 2008 16:42:22 +0000 (08:42 -0800)
diff --git a/mm/vmscan.c b/mm/vmscan.c

index be4dfe87be03eef25206710409d9ae4e44143975..a26dabd62fed40c8ec7832dc4656feaf7d3909f9 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -856,7 +856,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
                 __mod_zone_page_state(zone, NR_INACTIVE,
                                                 -(nr_taken - nr_active));
-               zone->pages_scanned += nr_scan;
+               if (scan_global_lru(sc))
+                       zone->pages_scanned += nr_scan;
                 spin_unlock_irq(&zone->lru_lock);
  
                 nr_scanned += nr_scan;
@@ -888,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                 if (current_is_kswapd()) {
                         __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
                         __count_vm_events(KSWAPD_STEAL, nr_freed);
-               } else
+               } else if (scan_global_lru(sc))
                         __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+
                 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
  
                 if (nr_taken == 0)
@@ -942,6 +944,113 @@ static inline int zone_is_near_oom(struct zone *zone)
                                 + zone_page_state(zone, NR_INACTIVE))*3;
  }
  
+/*
+ * Decide whether we should try to reclaim mapped pages (returns 1 or 0).
+ * Handles both global LRU reclaim and mem_cgroup (sc->mem_cgroup) reclaim.
+ */
+static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
+                               int priority)
+{
+       long mapped_ratio;
+       long distress;
+       long swap_tendency;
+       long imbalance;
+       int reclaim_mapped = 0;
+       int prev_priority;
+
+       if (scan_global_lru(sc) && zone_is_near_oom(zone))
+               return 1;
+       /*
+        * `distress' is a measure of how much trouble we're having
+        * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+        */
+       if (scan_global_lru(sc))
+               prev_priority = zone->prev_priority;
+       else
+               prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
+
+       distress = 100 >> min(prev_priority, priority);
+
+       /*
+        * The point of this algorithm is to decide when to start
+        * reclaiming mapped memory instead of just pagecache.  Work out
+        * how much memory is mapped: from global page state, or via
+        * mem_cgroup_calc_mapped_ratio() when reclaiming for a mem_cgroup.
+        */
+       if (scan_global_lru(sc))
+               mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+                               global_page_state(NR_ANON_PAGES)) * 100) /
+                                       vm_total_pages;
+       else
+               mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
+
+       /*
+        * Now decide how much we really want to unmap some pages.  The
+        * mapped ratio is downgraded - just because there's a lot of
+        * mapped memory doesn't necessarily mean that page reclaim
+        * isn't succeeding.
+        *
+        * The distress ratio is important - we don't want to start
+        * going oom.
+        *
+        * A 100% value of vm_swappiness overrides this algorithm
+        * altogether.
+        */
+       swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
+
+       /*
+        * If there's huge imbalance between active and inactive
+        * (think active 100 times larger than inactive) we should
+        * become more permissive, or the system will take too much
+        * cpu before it starts swapping during memory pressure.
+        * Distress is about avoiding early-oom, this is about
+        * making swappiness graceful despite setting it to low
+        * values.
+        *
+        * Avoid div by zero with nr_inactive+1, and max resulting
+        * value is vm_total_pages.
+        */
+       if (scan_global_lru(sc)) {
+               imbalance  = zone_page_state(zone, NR_ACTIVE);
+               imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
+       } else
+               imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
+
+       /*
+        * Reduce the effect of imbalance if swappiness is low,
+        * this means for a swappiness very low, the imbalance
+        * must be much higher than 100 for this logic to make
+        * the difference.
+        *
+        * Max temporary value is vm_total_pages*100.
+        */
+       imbalance *= (vm_swappiness + 1);
+       imbalance /= 100;
+
+       /*
+        * If not much of the ram is mapped, make the imbalance
+        * less relevant; it's high priority we refill the inactive
+        * list with mapped pages only in presence of high ratio of
+        * mapped pages.
+        *
+        * Max temporary value is vm_total_pages*100.
+        */
+       imbalance *= mapped_ratio;
+       imbalance /= 100;
+
+       /* apply imbalance feedback to swap_tendency */
+       swap_tendency += imbalance;
+
+       /*
+        * Now use this metric to decide whether to start moving mapped
+        * memory onto the inactive list.
+        */
+       if (swap_tendency >= 100)
+               reclaim_mapped = 1;
+
+       return reclaim_mapped;
+}
+
  /*
   * This moves pages from the active list to the inactive list.
   *
@@ -959,6 +1068,8 @@ static inline int zone_is_near_oom(struct zone *zone)
   * The downside is that we have to touch page->_count against each page.
   * But we had to alter page->flags anyway.
   */
+
+
  static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                                 struct scan_control *sc, int priority)
  {
@@ -972,100 +1083,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         struct pagevec pvec;
         int reclaim_mapped = 0;
  
-       if (sc->may_swap) {
-               long mapped_ratio;
-               long distress;
-               long swap_tendency;
-               long imbalance;
-
-               if (zone_is_near_oom(zone))
-                       goto force_reclaim_mapped;
-
-               /*
-                * `distress' is a measure of how much trouble we're having
-                * reclaiming pages.  0 -> no problems.  100 -> great trouble.
-                */
-               distress = 100 >> min(zone->prev_priority, priority);
-
-               /*
-                * The point of this algorithm is to decide when to start
-                * reclaiming mapped memory instead of just pagecache.  Work out
-                * how much memory
-                * is mapped.
-                */
-               mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
-                               global_page_state(NR_ANON_PAGES)) * 100) /
-                                       vm_total_pages;
-
-               /*
-                * Now decide how much we really want to unmap some pages.  The
-                * mapped ratio is downgraded - just because there's a lot of
-                * mapped memory doesn't necessarily mean that page reclaim
-                * isn't succeeding.
-                *
-                * The distress ratio is important - we don't want to start
-                * going oom.
-                *
-                * A 100% value of vm_swappiness overrides this algorithm
-                * altogether.
-                */
-               swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
-
-               /*
-                * If there's huge imbalance between active and inactive
-                * (think active 100 times larger than inactive) we should
-                * become more permissive, or the system will take too much
-                * cpu before it start swapping during memory pressure.
-                * Distress is about avoiding early-oom, this is about
-                * making swappiness graceful despite setting it to low
-                * values.
-                *
-                * Avoid div by zero with nr_inactive+1, and max resulting
-                * value is vm_total_pages.
-                */
-               imbalance  = zone_page_state(zone, NR_ACTIVE);
-               imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
-
-               /*
-                * Reduce the effect of imbalance if swappiness is low,
-                * this means for a swappiness very low, the imbalance
-                * must be much higher than 100 for this logic to make
-                * the difference.
-                *
-                * Max temporary value is vm_total_pages*100.
-                */
-               imbalance *= (vm_swappiness + 1);
-               imbalance /= 100;
-
-               /*
-                * If not much of the ram is mapped, makes the imbalance
-                * less relevant, it's high priority we refill the inactive
-                * list with mapped pages only in presence of high ratio of
-                * mapped pages.
-                *
-                * Max temporary value is vm_total_pages*100.
-                */
-               imbalance *= mapped_ratio;
-               imbalance /= 100;
-
-               /* apply imbalance feedback to swap_tendency */
-               swap_tendency += imbalance;
-
-               /*
-                * Now use this metric to decide whether to start moving mapped
-                * memory onto the inactive list.
-                */
-               if (swap_tendency >= 100)
-force_reclaim_mapped:
-                       reclaim_mapped = 1;
-       }
+       if (sc->may_swap)
+               reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
  
         lru_add_drain();
         spin_lock_irq(&zone->lru_lock);
         pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
                                         ISOLATE_ACTIVE, zone,
                                         sc->mem_cgroup, 1);
-       zone->pages_scanned += pgscanned;
+       /*
+        * zone->pages_scanned is used to detect zone OOM;
+        * mem_cgroup keeps track of nr_scan by itself.
+        */
+       if (scan_global_lru(sc))
+               zone->pages_scanned += pgscanned;
+
         __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
         spin_unlock_irq(&zone->lru_lock);
  
@@ -1155,25 +1187,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
         unsigned long nr_to_scan;
         unsigned long nr_reclaimed = 0;
  
-       /*
-        * Add one to `nr_to_scan' just to make sure that the kernel will
-        * slowly sift through the active list.
-        */
-       zone->nr_scan_active +=
-               (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
-       nr_active = zone->nr_scan_active;
-       if (nr_active >= sc->swap_cluster_max)
-               zone->nr_scan_active = 0;
-       else
-               nr_active = 0;
+       if (scan_global_lru(sc)) {
+               /*
+                * Add one to nr_to_scan just to make sure that the kernel
+                * will slowly sift through the active list.
+                */
+               zone->nr_scan_active +=
+                       (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
+               nr_active = zone->nr_scan_active;
+               zone->nr_scan_inactive +=
+                       (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
+               nr_inactive = zone->nr_scan_inactive;
+               if (nr_inactive >= sc->swap_cluster_max)
+                       zone->nr_scan_inactive = 0;
+               else
+                       nr_inactive = 0;
+
+               if (nr_active >= sc->swap_cluster_max)
+                       zone->nr_scan_active = 0;
+               else
+                       nr_active = 0;
+       } else {
+               /*
+                * This reclaim occurs not because of a zone memory shortage
+                * but because the memory controller hit its limit, so don't
+                * modify the zone's reclaim-related data.
+                */
+               nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
+                                       zone, priority);
+
+               nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
+                                       zone, priority);
+       }
  
-       zone->nr_scan_inactive +=
-               (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
-       nr_inactive = zone->nr_scan_inactive;
-       if (nr_inactive >= sc->swap_cluster_max)
-               zone->nr_scan_inactive = 0;
-       else
-               nr_inactive = 0;
  
         while (nr_active || nr_inactive) {
                 if (nr_active) {
@@ -1218,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
         unsigned long nr_reclaimed = 0;
         int i;
  
+
         sc->all_unreclaimable = 1;
         for (i = 0; zones[i] != NULL; i++) {
                 struct zone *zone = zones[i];
  
                 if (!populated_zone(zone))
                         continue;
+               /*
+                * Take care that memory controller reclaim has only a small
+                * influence on the global LRU.
+                */
+               if (scan_global_lru(sc)) {
+                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                               continue;
+                       note_zone_scanning_priority(zone, priority);
  
-               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                       continue;
-
-               note_zone_scanning_priority(zone, priority);
-
-               if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
-                       continue;       /* Let kswapd poll it */
-
-               sc->all_unreclaimable = 0;
+                       if (zone_is_all_unreclaimable(zone) &&
+                                               priority != DEF_PRIORITY)
+                               continue;       /* Let kswapd poll it */
+                       sc->all_unreclaimable = 0;
+               } else {
+                       /*
+                        * Ignore cpuset limitation here. We just want to reduce
+                        * # of used pages by us regardless of memory shortage.
+                        */
+                       sc->all_unreclaimable = 0;
+                       mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
+                                                       priority);
+               }
  
                 nr_reclaimed += shrink_zone(priority, zone, sc);
         }
+
         return nr_reclaimed;
  }
   
@@ -1264,16 +1324,21 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
         unsigned long lru_pages = 0;
         int i;
  
-       count_vm_event(ALLOCSTALL);
-
-       for (i = 0; zones[i] != NULL; i++) {
-               struct zone *zone = zones[i];
+       if (scan_global_lru(sc))
+               count_vm_event(ALLOCSTALL);
+       /*
+        * mem_cgroup will not do shrink_slab.
+        */
+       if (scan_global_lru(sc)) {
+               for (i = 0; zones[i] != NULL; i++) {
+                       struct zone *zone = zones[i];
  
-               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                       continue;
+                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                               continue;
  
-               lru_pages += zone_page_state(zone, NR_ACTIVE)
-                               + zone_page_state(zone, NR_INACTIVE);
+                       lru_pages += zone_page_state(zone, NR_ACTIVE)
+                                       + zone_page_state(zone, NR_INACTIVE);
+               }
         }
  
         for (priority = DEF_PRIORITY; priority >= 0; priority--) {
@@ -1330,14 +1395,19 @@ out:
          */
         if (priority < 0)
                 priority = 0;
-       for (i = 0; zones[i] != NULL; i++) {
-               struct zone *zone = zones[i];
  
-               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                       continue;
+       if (scan_global_lru(sc)) {
+               for (i = 0; zones[i] != NULL; i++) {
+                       struct zone *zone = zones[i];
+
+                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                               continue;
+
+                       zone->prev_priority = priority;
+               }
+       } else
+               mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
  
-               zone->prev_priority = priority;
-       }
         return ret;
  }
author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
	Thu, 7 Feb 2008 08:14:37 +0000 (00:14 -0800)
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>
	Thu, 7 Feb 2008 16:42:22 +0000 (08:42 -0800)