From f33261d75b88f55a08e6a9648cef73509979bfba Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 25 Jan 2011 15:07:20 -0800 Subject: [PATCH] mm: fix deferred congestion timeout if preferred zone is not allowed Before 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone"), preferred_zone was only used for NUMA statistics, to determine the zoneidx from which to allocate from given the type requested, and whether to utilize memory compaction. wait_iff_congested(), though, uses preferred_zone to determine if the congestion wait should be deferred because its dirty pages are backed by a congested bdi. This incorrectly defers the timeout and busy loops in the page allocator with various cond_resched() calls if preferred_zone is not allowed in the current context, usually consuming 100% of a cpu. This patch ensures preferred_zone is an allowed zone in the fastpath depending on whether current is constrained by its cpuset or nodes in its mempolicy (when the nodemask passed is non-NULL). This is correct since the fastpath allocation always passes ALLOC_CPUSET when trying to allocate memory. In the slowpath, this patch resets preferred_zone to the first zone of the allowed type when the allocation is not constrained by current's cpuset, i.e. it does not pass ALLOC_CPUSET. This patch also ensures preferred_zone is from the set of allowed nodes when called from within direct reclaim since allocations are always constrained by cpusets in this context (it is blockable). Both of these uses of cpuset_current_mems_allowed are protected by get_mems_allowed(). Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++++- mm/vmscan.c | 3 ++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90c1439549fd..f4967910c967 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2034,6 +2034,14 @@ restart: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) + first_zones_zonelist(zonelist, high_zoneidx, NULL, + &preferred_zone); + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2192,7 +2200,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, get_mems_allowed(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); if (!preferred_zone) { put_mems_allowed(); return NULL; diff --git a/mm/vmscan.c b/mm/vmscan.c index f5d90dedebba..148c6e630df2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2083,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *preferred_zone; first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - NULL, &preferred_zone); + &cpuset_current_mems_allowed, + &preferred_zone); wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } } -- 2.30.2