From 66e1707bc34609f626e2e7b4fe7e454c9748bad5 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 7 Feb 2008 00:13:56 -0800 Subject: [PATCH] Memory controller: add per cgroup LRU and reclaim Add the page_cgroup to the per cgroup LRU. The reclaim algorithm has been modified to make the isolate_lru_pages() as a pluggable component. The scan_control data structure now accepts the cgroup on behalf of which reclaims are carried out. try_to_free_pages() has been extended to become cgroup aware. [akpm@linux-foundation.org: fix warning] [Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member] [bunk@kernel.org: make do_try_to_free_pages() static] [hugh@veritas.com: memcgroup: fix try_to_free order] [kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary] Signed-off-by: Pavel Emelianov Signed-off-by: Balbir Singh Cc: Paul Menage Cc: Peter Zijlstra Cc: "Eric W. Biederman" Cc: Nick Piggin Cc: Kirill Korotaev Cc: Herbert Poetzl Cc: David Rientjes Cc: Vaidyanathan Srinivasan Signed-off-by: Lee Schermerhorn Signed-off-by: Hugh Dickins Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 +++ include/linux/res_counter.h | 23 ++++++ include/linux/swap.h | 3 + mm/memcontrol.c | 148 ++++++++++++++++++++++++++++++++++-- mm/swap.c | 2 + mm/vmscan.c | 128 +++++++++++++++++++++++++------ 6 files changed, 286 insertions(+), 30 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f5b47efab48b..9c3c1c97c197 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -32,6 +32,13 @@ extern void page_assign_page_cgroup(struct page *page, extern struct page_cgroup *page_get_page_cgroup(struct page *page); extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm); extern void mem_cgroup_uncharge(struct page_cgroup *pc); +extern void mem_cgroup_move_lists(struct page_cgroup *pc, bool active); +extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_cgroup *mem_cont, + int active); static inline void mem_cgroup_uncharge_page(struct page *page) { @@ -71,6 +78,11 @@ static inline void mem_cgroup_uncharge_page(struct page *page) { } +static inline void mem_cgroup_move_lists(struct page_cgroup *pc, + bool active) +{ +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index eeb3f7749772..5e60a4f34243 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -99,4 +99,27 @@ int res_counter_charge(struct res_counter *counter, unsigned long val); void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); void res_counter_uncharge(struct res_counter *counter, unsigned long val); +static inline bool res_counter_limit_check_locked(struct res_counter *cnt) +{ + if (cnt->usage < cnt->limit) + return true; + + return false; +} + +/* + * Helper function to detect if the cgroup is within it's limit or + * not. It's currently called from cgroup_rss_prepare() + */ +static inline bool res_counter_check_under_limit(struct res_counter *cnt) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = res_counter_limit_check_locked(cnt); + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 353153ea0bd5..4d91bc0e0fd5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -182,6 +183,8 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask); +extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem); +extern int __isolate_lru_page(struct page *page, int mode); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b25df2a9d024..9e9ff914c0f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -22,10 +22,15 @@ #include #include #include +#include #include #include +#include +#include +#include struct cgroup_subsys mem_cgroup_subsys; +static const int MEM_CGROUP_RECLAIM_RETRIES = 5; /* * The memory controller data structure. The memory controller controls both @@ -51,6 +56,10 @@ struct mem_cgroup { */ struct list_head active_list; struct list_head inactive_list; + /* + * spin_lock to protect the per cgroup LRU + */ + spinlock_t lru_lock; }; /* @@ -141,6 +150,94 @@ void __always_inline unlock_page_cgroup(struct page *page) bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); } +void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) +{ + if (active) + list_move(&pc->lru, &pc->mem_cgroup->active_list); + else + list_move(&pc->lru, &pc->mem_cgroup->inactive_list); +} + +/* + * This routine assumes that the appropriate zone's lru lock is already held + */ +void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) +{ + struct mem_cgroup *mem; + if (!pc) + return; + + mem = pc->mem_cgroup; + + spin_lock(&mem->lru_lock); + __mem_cgroup_move_lists(pc, active); + spin_unlock(&mem->lru_lock); +} + +unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_cgroup *mem_cont, + int active) +{ + unsigned long nr_taken = 0; + struct page *page; + unsigned long scan; + LIST_HEAD(pc_list); + struct list_head *src; + struct page_cgroup *pc; + + if (active) + src = &mem_cont->active_list; + else + src = &mem_cont->inactive_list; + + spin_lock(&mem_cont->lru_lock); + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + pc = list_entry(src->prev, struct page_cgroup, lru); + page = pc->page; + VM_BUG_ON(!pc); + + if (PageActive(page) && !active) { + __mem_cgroup_move_lists(pc, true); + scan--; + continue; + } + if (!PageActive(page) && active) { + __mem_cgroup_move_lists(pc, false); + scan--; + continue; + } + + /* + * Reclaim, per zone + * TODO: make the active/inactive lists per zone + */ + if (page_zone(page) != z) + continue; + + /* + * Check if the meta page went away from under us + */ + if (!list_empty(&pc->lru)) + list_move(&pc->lru, &pc_list); + else + continue; + + if (__isolate_lru_page(page, mode) == 0) { + list_move(&page->lru, dst); + nr_taken++; + } + } + + list_splice(&pc_list, src); + spin_unlock(&mem_cont->lru_lock); + + *scanned = scan; + return nr_taken; +} + /* * Charge the memory controller for page usage. * Return @@ -151,6 +248,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) { struct mem_cgroup *mem; struct page_cgroup *pc, *race_pc; + unsigned long flags; + unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; /* * Should page_cgroup's go to their own slab? @@ -159,14 +258,20 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) * to see if the cgroup page already has a page_cgroup associated * with it */ +retry: lock_page_cgroup(page); pc = page_get_page_cgroup(page); /* * The page_cgroup exists and the page has already been accounted */ if (pc) { - atomic_inc(&pc->ref_cnt); - goto done; + if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { + /* this page is under being uncharged ? */ + unlock_page_cgroup(page); + cpu_relax(); + goto retry; + } else + goto done; } unlock_page_cgroup(page); @@ -197,7 +302,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) * If we created the page_cgroup, we should free it on exceeding * the cgroup limit. */ - if (res_counter_charge(&mem->res, 1)) { + while (res_counter_charge(&mem->res, 1)) { + if (try_to_free_mem_cgroup_pages(mem)) + continue; + + /* + * try_to_free_mem_cgroup_pages() might not give us a full + * picture of reclaim. Some pages are reclaimed and might be + * moved to swap cache or just unmapped from the cgroup. + * Check the limit again to see if the reclaim reduced the + * current usage of the cgroup before giving up + */ + if (res_counter_check_under_limit(&mem->res)) + continue; + /* + * Since we control both RSS and cache, we end up with a + * very interesting scenario where we end up reclaiming + * memory (essentially RSS), since the memory is pushed + * to swap cache, we eventually end up adding those + * pages back to our list. Hence we give ourselves a + * few chances before we fail + */ + else if (nr_retries--) { + congestion_wait(WRITE, HZ/10); + continue; + } + css_put(&mem->css); goto free_pc; } @@ -221,14 +351,16 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) pc->page = page; page_assign_page_cgroup(page, pc); + spin_lock_irqsave(&mem->lru_lock, flags); + list_add(&pc->lru, &mem->active_list); + spin_unlock_irqrestore(&mem->lru_lock, flags); + done: unlock_page_cgroup(page); return 0; free_pc: kfree(pc); - return -ENOMEM; err: - unlock_page_cgroup(page); return -ENOMEM; } @@ -240,6 +372,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc) { struct mem_cgroup *mem; struct page *page; + unsigned long flags; if (!pc) return; @@ -252,6 +385,10 @@ void mem_cgroup_uncharge(struct page_cgroup *pc) page_assign_page_cgroup(page, NULL); unlock_page_cgroup(page); res_counter_uncharge(&mem->res, 1); + + spin_lock_irqsave(&mem->lru_lock, flags); + list_del_init(&pc->lru); + spin_unlock_irqrestore(&mem->lru_lock, flags); kfree(pc); } } @@ -310,6 +447,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->res); INIT_LIST_HEAD(&mem->active_list); INIT_LIST_HEAD(&mem->inactive_list); + spin_lock_init(&mem->lru_lock); return &mem->css; } diff --git a/mm/swap.c b/mm/swap.c index 57b7e25a939c..710a20bb9749 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -29,6 +29,7 @@ #include #include #include +#include /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -175,6 +176,7 @@ void activate_page(struct page *page) SetPageActive(page); add_page_to_active_list(zone, page); __count_vm_event(PGACTIVATE); + mem_cgroup_move_lists(page_get_page_cgroup(page), true); } spin_unlock_irq(&zone->lru_lock); } diff --git a/mm/vmscan.c b/mm/vmscan.c index e5a9597e3bbc..7408a8a7d882 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -68,6 +69,15 @@ struct scan_control { int all_unreclaimable; int order; + + /* Which cgroup do we reclaim from */ + struct mem_cgroup *mem_cgroup; + + /* Pluggable isolate pages callback */ + unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, + unsigned long *scanned, int order, int mode, + struct zone *z, struct mem_cgroup *mem_cont, + int active); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -626,7 +636,7 @@ keep: * * returns 0 on success, -ve errno on failure. */ -static int __isolate_lru_page(struct page *page, int mode) +int __isolate_lru_page(struct page *page, int mode) { int ret = -EINVAL; @@ -760,6 +770,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, return nr_taken; } +static unsigned long isolate_pages_global(unsigned long nr, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_cgroup *mem_cont, + int active) +{ + if (active) + return isolate_lru_pages(nr, &z->active_list, dst, + scanned, order, mode); + else + return isolate_lru_pages(nr, &z->inactive_list, dst, + scanned, order, mode); +} + /* * clear_active_flags() is a helper for shrink_active_list(), clearing * any active bits from the pages in the list. @@ -801,11 +826,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_freed; unsigned long nr_active; - nr_taken = isolate_lru_pages(sc->swap_cluster_max, - &zone->inactive_list, + nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, (sc->order > PAGE_ALLOC_COSTLY_ORDER)? - ISOLATE_BOTH : ISOLATE_INACTIVE); + ISOLATE_BOTH : ISOLATE_INACTIVE, + zone, sc->mem_cgroup, 0); nr_active = clear_active_flags(&page_list); __count_vm_events(PGDEACTIVATE, nr_active); @@ -1018,8 +1043,9 @@ force_reclaim_mapped: lru_add_drain(); spin_lock_irq(&zone->lru_lock); - pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, - &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); + pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + sc->mem_cgroup, 1); zone->pages_scanned += pgscanned; __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); spin_unlock_irq(&zone->lru_lock); @@ -1051,6 +1077,7 @@ force_reclaim_mapped: ClearPageActive(page); list_move(&page->lru, &zone->inactive_list); + mem_cgroup_move_lists(page_get_page_cgroup(page), false); pgmoved++; if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); @@ -1079,6 +1106,7 @@ force_reclaim_mapped: SetPageLRU(page); VM_BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); + mem_cgroup_move_lists(page_get_page_cgroup(page), true); pgmoved++; if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); @@ -1206,7 +1234,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones, * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. */ -unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) +static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, + struct scan_control *sc) { int priority; int ret = 0; @@ -1215,14 +1244,6 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; int i; - struct scan_control sc = { - .gfp_mask = gfp_mask, - .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, - .may_swap = 1, - .swappiness = vm_swappiness, - .order = order, - }; count_vm_event(ALLOCSTALL); @@ -1237,17 +1258,22 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) } for (priority = DEF_PRIORITY; priority >= 0; priority--) { - sc.nr_scanned = 0; + sc->nr_scanned = 0; if (!priority) disable_swap_token(); - nr_reclaimed += shrink_zones(priority, zones, &sc); - shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); + nr_reclaimed += shrink_zones(priority, zones, sc); + /* + * Don't shrink slabs when reclaiming memory from + * over limit cgroups + */ + if (sc->mem_cgroup == NULL) + shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } - total_scanned += sc.nr_scanned; - if (nr_reclaimed >= sc.swap_cluster_max) { + total_scanned += sc->nr_scanned; + if (nr_reclaimed >= sc->swap_cluster_max) { ret = 1; goto out; } @@ -1259,18 +1285,18 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc.swap_cluster_max + - sc.swap_cluster_max / 2) { + if (total_scanned > sc->swap_cluster_max + + sc->swap_cluster_max / 2) { wakeup_pdflush(laptop_mode ? 0 : total_scanned); - sc.may_writepage = 1; + sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ - if (sc.nr_scanned && priority < DEF_PRIORITY - 2) + if (sc->nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); } /* top priority shrink_caches still had more to do? don't OOM, then */ - if (!sc.all_unreclaimable) + if (!sc->all_unreclaimable && sc->mem_cgroup == NULL) ret = 1; out: /* @@ -1293,6 +1319,54 @@ out: return ret; } +unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) +{ + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, + .swappiness = vm_swappiness, + .order = order, + .mem_cgroup = NULL, + .isolate_pages = isolate_pages_global, + }; + + return do_try_to_free_pages(zones, gfp_mask, &sc); +} + +#ifdef CONFIG_CGROUP_MEM_CONT + +#ifdef CONFIG_HIGHMEM +#define ZONE_USERPAGES ZONE_HIGHMEM +#else +#define ZONE_USERPAGES ZONE_NORMAL +#endif + +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_writepage = !laptop_mode, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = vm_swappiness, + .order = 0, + .mem_cgroup = mem_cont, + .isolate_pages = mem_cgroup_isolate_pages, + }; + int node; + struct zone **zones; + + for_each_online_node(node) { + zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones; + if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) + return 1; + } + return 0; +} +#endif + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at pages_high. @@ -1328,6 +1402,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, .order = order, + .mem_cgroup = NULL, + .isolate_pages = isolate_pages_global, }; /* * temp_priority is used to remember the scanning priority at which @@ -1649,6 +1725,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) .swap_cluster_max = nr_pages, .may_writepage = 1, .swappiness = vm_swappiness, + .isolate_pages = isolate_pages_global, }; current->reclaim_state = &reclaim_state; @@ -1834,6 +1911,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, + .isolate_pages = isolate_pages_global, }; unsigned long slab_reclaimable; -- 2.30.2