[PATCH] zone_reclaim: dynamic slab reclaim

author Christoph Lameter <clameter@sgi.com>

Tue, 26 Sep 2006 06:31:52 +0000 (23:31 -0700)

committer Linus Torvalds <torvalds@g5.osdl.org>

Tue, 26 Sep 2006 15:48:51 +0000 (08:48 -0700)
author Christoph Lameter <clameter@sgi.com>
Tue, 26 Sep 2006 06:31:52 +0000 (23:31 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Tue, 26 Sep 2006 15:48:51 +0000 (08:48 -0700)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt

index 7cee90223d3a0624296b42d983d786b25462dc56..20d0d797f539ce51d18f903aa4aec942ead16643 100644 (file)
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
  - drop-caches
  - zone_reclaim_mode
  - min_unmapped_ratio
+- min_slab_ratio
  - panic_on_oom
  
  ==============================================================
@@ -138,7 +139,6 @@ This is value ORed together of
  1      = Zone reclaim on
  2      = Zone reclaim writes dirty pages out
  4      = Zone reclaim swaps pages
-8      = Also do a global slab reclaim pass
  
  zone_reclaim_mode is set during bootup to 1 if it is determined that pages
  from remote zones will cause a measurable performance reduction. The
@@ -162,18 +162,13 @@ Allowing regular swap effectively restricts allocations to the local
  node unless explicitly overridden by memory policies or cpuset
  configurations.
  
-It may be advisable to allow slab reclaim if the system makes heavy
-use of files and builds up large slab caches. However, the slab
-shrink operation is global, may take a long time and free slabs
-in all nodes of the system.
-
  =============================================================
  
  min_unmapped_ratio:
  
  This is available only on NUMA kernels.
  
-A percentage of the file backed pages in each zone.  Zone reclaim will only
+A percentage of the total pages in each zone.  Zone reclaim will only
  occur if more than this percentage of pages are file backed and unmapped.
  This is to insure that a minimal amount of local pages is still available for
  file I/O even if the node is overallocated.
@@ -182,6 +177,24 @@ The default is 1 percent.
  
  =============================================================
  
+min_slab_ratio:
+
+This is available only on NUMA kernels.
+
+A percentage of the total pages in each zone.  On zone reclaim
+(i.e. when fallback from the local zone occurs) slabs will be reclaimed
+if more than this percentage of pages in a zone are reclaimable slab pages.
+This ensures that the slab growth stays under control even in NUMA
+systems that rarely perform global reclaim.
+
+The default is 5 percent.
+
+Note that slab reclaim is triggered in a per zone / node fashion.
+The process of reclaiming slab memory is currently not node specific
+and may not be fast.
+
+=============================================================
+
  panic_on_oom
  
  This enables or disables panic on out-of-memory feature.  If this is set to 1,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 08c41b9f92e0e8dc54621e1838be20564cd3c7ab..3693f1a5278839c23243691564a5085af0617313 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -171,6 +171,7 @@ struct zone {
          * zone reclaim becomes active if more unmapped pages exist.
          */
         unsigned long           min_unmapped_pages;
+       unsigned long           min_slab_pages;
         struct per_cpu_pageset  *pageset[NR_CPUS];
  #else
         struct per_cpu_pageset  pageset[NR_CPUS];
@@ -448,6 +449,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file
                                         void __user *, size_t *, loff_t *);
  int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
                         struct file *, void __user *, size_t *, loff_t *);
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
+                       struct file *, void __user *, size_t *, loff_t *);
  
  #include <linux/topology.h>
  /* Returns the number of the current Node. */
diff --git a/include/linux/swap.h b/include/linux/swap.h

index 32db06c8ffe0801fe894afc10c9a8ba0200faee3..a2f5ad7c2d2e2ec700dd3d8b4e7dfac57f9e1136 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -193,6 +193,7 @@ extern long vm_total_pages;
  #ifdef CONFIG_NUMA
  extern int zone_reclaim_mode;
  extern int sysctl_min_unmapped_ratio;
+extern int sysctl_min_slab_ratio;
  extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
  #else
  #define zone_reclaim_mode 0
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h

index 736ed917a4f8df2af5d25e1e8cc567cada1d863d..eca555781d05e1d67e62d070202ced6ac6bcfb84 100644 (file)
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -191,6 +191,7 @@ enum
         VM_MIN_UNMAPPED=32,     /* Set min percent of unmapped pages */
         VM_PANIC_ON_OOM=33,     /* panic at out-of-memory */
         VM_VDSO_ENABLED=34,     /* map VDSO into new processes? */
+       VM_MIN_SLAB=35,          /* Set min percent of reclaimable slab pages */
  };
  
  
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 362a0cc371381cceab3f4a77b03430a604ecdb71..fd43c3e6786b5b19c8ff173b45638c56392c7d86 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -943,6 +943,17 @@ static ctl_table vm_table[] = {
                 .extra1         = &zero,
                 .extra2         = &one_hundred,
         },
+       {
+               .ctl_name       = VM_MIN_SLAB,
+               .procname       = "min_slab_ratio",
+               .data           = &sysctl_min_slab_ratio,
+               .maxlen         = sizeof(sysctl_min_slab_ratio),
+               .mode           = 0644,
+               .proc_handler   = &sysctl_min_slab_ratio_sysctl_handler,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+               .extra2         = &one_hundred,
+       },
  #endif
  #ifdef CONFIG_X86_32
         {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 47e98423b30dd6da8dea3e1bf03e8d7e3bf85cf0..cf913bdd433e29b3b371bd97383f2c3c152248ce 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2005,6 +2005,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
  #ifdef CONFIG_NUMA
                 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
                                                 / 100;
+               zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
  #endif
                 zone->name = zone_names[j];
                 spin_lock_init(&zone->lock);
@@ -2318,6 +2319,22 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
                                 sysctl_min_unmapped_ratio) / 100;
         return 0;
  }
+/*
+ * sysctl handler for min_slab_ratio.  The request is first passed to
+ * proc_dointvec_minmax(); a nonzero result from it is returned unchanged.
+ * Otherwise min_slab_pages of every zone is recomputed as
+ * sysctl_min_slab_ratio percent of zone->present_pages.  The recomputation
+ * is not conditional on 'write', so it also runs when the value is only read.
+ */
+int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
+       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+       struct zone *zone;
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+       if (rc)
+               return rc;
+
+       for_each_zone(zone)
+               zone->min_slab_pages = (zone->present_pages *
+                               sysctl_min_slab_ratio) / 100;
+       return 0;
+}
  #endif
  
  /*
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 349797ba4bac37fa476c5f33af8212afd33f3b3b..089e943c4d38f56c75309386dde288ab35eed7e8 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1527,7 +1527,6 @@ int zone_reclaim_mode __read_mostly;
  #define RECLAIM_ZONE (1<<0)    /* Run shrink_cache on the zone */
  #define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
  #define RECLAIM_SWAP (1<<2)    /* Swap pages out during reclaim */
-#define RECLAIM_SLAB (1<<3)    /* Do a global slab shrink if the zone is out of memory */
  
  /*
   * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1542,6 +1541,12 @@ int zone_reclaim_mode __read_mostly;
   */
  int sysctl_min_unmapped_ratio = 1;
  
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
  /*
   * Try to free up some pages from this zone through reclaim.
   */
@@ -1573,29 +1578,37 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
  
-       /*
-        * Free memory by calling shrink zone with increasing priorities
-        * until we have enough memory freed.
-        */
-       priority = ZONE_RECLAIM_PRIORITY;
-       do {
-               nr_reclaimed += shrink_zone(priority, zone, &sc);
-               priority--;
-       } while (priority >= 0 && nr_reclaimed < nr_pages);
+       if (zone_page_state(zone, NR_FILE_PAGES) -
+               zone_page_state(zone, NR_FILE_MAPPED) >
+               zone->min_unmapped_pages) {
+               /*
+                * Free memory by calling shrink zone with increasing
+                * priorities until we have enough memory freed.
+                */
+               priority = ZONE_RECLAIM_PRIORITY;
+               do {
+                       nr_reclaimed += shrink_zone(priority, zone, &sc);
+                       priority--;
+               } while (priority >= 0 && nr_reclaimed < nr_pages);
+       }
  
-       if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+       if (zone_page_state(zone, NR_SLAB_RECLAIMABLE) > zone->min_slab_pages) {
                 /*
                  * shrink_slab() does not currently allow us to determine how
-                * many pages were freed in this zone. So we just shake the slab
-                * a bit and then go off node for this particular allocation
-                * despite possibly having freed enough memory to allocate in
-                * this zone.  If we freed local memory then the next
-                * allocations will be local again.
+                * many pages were freed in this zone. So we take the current
+                * number of slab pages and shake the slab until it is reduced
+                * by the same nr_pages that we used for reclaiming unmapped
+                * pages.
                  *
-                * shrink_slab will free memory on all zones and may take
-                * a long time.
+                * Note that shrink_slab will free memory on all zones and may
+                * take a long time.
                  */
-               shrink_slab(sc.nr_scanned, gfp_mask, order);
+               unsigned long limit = zone_page_state(zone,
+                               NR_SLAB_RECLAIMABLE) - nr_pages;
+
+               while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+                       zone_page_state(zone, NR_SLAB_RECLAIMABLE) > limit)
+                       ;
         }
  
         p->reclaim_state = NULL;
@@ -1609,7 +1622,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         int node_id;
  
         /*
-        * Zone reclaim reclaims unmapped file backed pages.
+        * Zone reclaim reclaims unmapped file backed pages and
+        * slab pages if we are over the defined limits.
          *
          * A small portion of unmapped file backed pages is needed for
          * file I/O otherwise pages read by file I/O will be immediately
@@ -1618,7 +1632,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
          * unmapped file backed pages.
          */
         if (zone_page_state(zone, NR_FILE_PAGES) -
-           zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages)
+           zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+           && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+                       <= zone->min_slab_pages)
                 return 0;
  
         /*
author	Christoph Lameter <clameter@sgi.com>
	Tue, 26 Sep 2006 06:31:52 +0000 (23:31 -0700)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Tue, 26 Sep 2006 15:48:51 +0000 (08:48 -0700)
Documentation/sysctl/vm.txt		patch \| blob \| history
include/linux/mmzone.h		patch \| blob \| history
include/linux/swap.h		patch \| blob \| history
include/linux/sysctl.h		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history