mm, page_alloc: wakeup kcompactd even if kswapd cannot free more memory
author David Rientjes <rientjes@google.com>
Thu, 5 Apr 2018 23:25:16 +0000 (16:25 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 6 Apr 2018 04:36:27 +0000 (21:36 -0700)
Kswapd will not wake up if per-zone watermarks are not failing or if too
many previous attempts at background reclaim have failed.

This can be true if there is a lot of free memory available.  For high-
order allocations, kswapd is responsible for waking up kcompactd for
background compaction.  If the zone is not below its watermarks or
reclaim has recently failed (lots of free memory, nothing left to
reclaim), kcompactd does not get woken up.

When __GFP_DIRECT_RECLAIM is not allowed, allow kcompactd to still be
woken up even if kswapd will not reclaim.  This allows high-order
allocations, such as thp, to still trigger background compaction even
when the zone has an abundance of free memory.

Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803111659420.209721@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Documentation/trace/postprocess/trace-vmscan-postprocess.pl
include/linux/mmzone.h
include/trace/events/vmscan.h
mm/page_alloc.c
mm/vmscan.c

index ba976805853a546436655269c5a9cc29b6e070ce..66bfd8396877f933a1f70fffc87cbf09e9254772 100644 (file)
@@ -111,7 +111,7 @@ my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flag
 my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
 my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
 my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
-my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
+my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
 my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
 my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
 my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
@@ -201,7 +201,7 @@ $regex_kswapd_sleep = generate_traceevent_regex(
 $regex_wakeup_kswapd = generate_traceevent_regex(
                        "vmscan/mm_vmscan_wakeup_kswapd",
                        $regex_wakeup_kswapd_default,
-                       "nid", "zid", "order");
+                       "nid", "zid", "order", "gfp_flags");
 $regex_lru_isolate = generate_traceevent_regex(
                        "vmscan/mm_vmscan_lru_isolate",
                        $regex_lru_isolate_default,
index 5d935411d3c4b4f9bb95540b2626400904caf651..f11ae29005f1b44cfda0958235043a4e51dbd830 100644 (file)
@@ -776,7 +776,8 @@ static inline bool is_dev_zone(const struct zone *zone)
 #include <linux/memory_hotplug.h>
 
 void build_all_zonelists(pg_data_t *pgdat);
-void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
+                  enum zone_type classzone_idx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         int classzone_idx, unsigned int alloc_flags,
                         long free_pages);
index e0b8b9173e1c1a5fe5a40a0f52aeb4b0ea593771..6570c5b45ba158420058b21d95b4e4f88a136d01 100644 (file)
@@ -78,26 +78,29 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
 
 TRACE_EVENT(mm_vmscan_wakeup_kswapd,
 
-       TP_PROTO(int nid, int zid, int order),
+       TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
 
-       TP_ARGS(nid, zid, order),
+       TP_ARGS(nid, zid, order, gfp_flags),
 
        TP_STRUCT__entry(
-               __field(        int,            nid     )
-               __field(        int,            zid     )
-               __field(        int,            order   )
+               __field(        int,    nid             )
+               __field(        int,    zid             )
+               __field(        int,    order           )
+               __field(        gfp_t,  gfp_flags       )
        ),
 
        TP_fast_assign(
                __entry->nid            = nid;
                __entry->zid            = zid;
                __entry->order          = order;
+               __entry->gfp_flags      = gfp_flags;
        ),
 
-       TP_printk("nid=%d zid=%d order=%d",
+       TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
                __entry->nid,
                __entry->zid,
-               __entry->order)
+               __entry->order,
+               show_gfp_flags(__entry->gfp_flags))
 );
 
 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
index f6005b7c3446db493e4b46179948e67d663cda05..02c1a60d79378aa68670d5d8b7956e6364cc73c3 100644 (file)
@@ -3805,16 +3805,18 @@ retry:
        return page;
 }
 
-static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
+static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
+                            const struct alloc_context *ac)
 {
        struct zoneref *z;
        struct zone *zone;
        pg_data_t *last_pgdat = NULL;
+       enum zone_type high_zoneidx = ac->high_zoneidx;
 
-       for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
-                                       ac->high_zoneidx, ac->nodemask) {
+       for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
+                                       ac->nodemask) {
                if (last_pgdat != zone->zone_pgdat)
-                       wakeup_kswapd(zone, order, ac->high_zoneidx);
+                       wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
                last_pgdat = zone->zone_pgdat;
        }
 }
@@ -4093,7 +4095,7 @@ retry_cpuset:
                goto nopage;
 
        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-               wake_all_kswapds(order, ac);
+               wake_all_kswapds(order, gfp_mask, ac);
 
        /*
         * The adjusted alloc_flags might result in immediate success, so try
@@ -4151,7 +4153,7 @@ retry_cpuset:
 retry:
        /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-               wake_all_kswapds(order, ac);
+               wake_all_kswapds(order, gfp_mask, ac);
 
        reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
        if (reserve_flags)
index 976be140a8ce9bab3deb57f531a81f22f98a6a0f..4390a8d5be41ee497569622e3b0381a851870114 100644 (file)
@@ -3539,16 +3539,21 @@ kswapd_try_sleep:
 }
 
 /*
- * A zone is low on free memory, so wake its kswapd task to service it.
+ * A zone is low on free memory or too fragmented for high-order memory.  If
+ * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
+ * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
+ * has failed or is not needed, still wake up kcompactd if only compaction is
+ * needed.
  */
-void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
+                  enum zone_type classzone_idx)
 {
        pg_data_t *pgdat;
 
        if (!managed_zone(zone))
                return;
 
-       if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
+       if (!cpuset_zone_allowed(zone, gfp_flags))
                return;
        pgdat = zone->zone_pgdat;
        pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
@@ -3557,14 +3562,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
 
-       /* Hopeless node, leave it to direct reclaim */
-       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-               return;
-
-       if (pgdat_balanced(pgdat, order, classzone_idx))
+       /* Hopeless node, leave it to direct reclaim if possible */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
+           pgdat_balanced(pgdat, order, classzone_idx)) {
+               /*
+                * There may be plenty of free memory available, but it's too
+                * fragmented for high-order allocations.  Wake up kcompactd
+                * and rely on compaction_suitable() to determine if it's
+                * needed.  If it fails, it will defer subsequent attempts to
+                * ratelimit its work.
+                */
+               if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
+                       wakeup_kcompactd(pgdat, order, classzone_idx);
                return;
+       }
 
-       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
+       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
+                                     gfp_flags);
        wake_up_interruptible(&pgdat->kswapd_wait);
 }