struct wb_domain cgwb_domain;
#endif
+#ifdef CONFIG_INET
+ unsigned long socket_pressure;
+#endif
+
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
}
+#define mem_cgroup_from_counter(counter, member) \
+ container_of(counter, struct mem_cgroup, member)
+
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+/**
+ * parent_mem_cgroup - find the accounting parent of a memcg
+ * @memcg: memcg whose parent to find
+ *
+ * Returns the parent memcg, or NULL if this is the root or the memory
+ * controller is in legacy no-hierarchy mode.
+ */
+static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
+{
+ if (!memcg->memory.parent)
+ return NULL;
+ return mem_cgroup_from_counter(memcg->memory.parent, memory);
+}
+
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
struct mem_cgroup *root)
{
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
#ifdef CONFIG_MEMCG_KMEM
- return memcg->tcp_mem.memory_pressure;
-#else
- return false;
+ if (memcg->tcp_mem.memory_pressure)
+ return true;
#endif
+ do {
+ if (time_before(jiffies, memcg->socket_pressure))
+ return true;
+ } while ((memcg = parent_mem_cgroup(memcg)));
+ return false;
}
#else
#define mem_cgroup_sockets_enabled 0
struct vmpressure {
unsigned long scanned;
unsigned long reclaimed;
+
+ unsigned long tree_scanned;
+ unsigned long tree_reclaimed;
/* The lock is used to keep the scanned/reclaimed above in sync. */
struct spinlock sr_lock;
struct mem_cgroup;
#ifdef CONFIG_MEMCG
-extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
#else
-static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed) {}
static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
int prio) {}
return ret;
}
-#define mem_cgroup_from_counter(counter, member) \
- container_of(counter, struct mem_cgroup, member)
-
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
kfree(memcg);
}
-/*
- * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
- */
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
-{
- if (!memcg->memory.parent)
- return NULL;
- return mem_cgroup_from_counter(memcg->memory.parent, memory);
-}
-EXPORT_SYMBOL(parent_mem_cgroup);
-
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
+#ifdef CONFIG_INET
+ memcg->socket_pressure = jiffies;
#endif
return &memcg->css;
};
static bool vmpressure_event(struct vmpressure *vmpr,
- unsigned long scanned, unsigned long reclaimed)
+ enum vmpressure_levels level)
{
struct vmpressure_event *ev;
- enum vmpressure_levels level;
bool signalled = false;
- level = vmpressure_calc_level(scanned, reclaimed);
-
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned;
unsigned long reclaimed;
+ enum vmpressure_levels level;
spin_lock(&vmpr->sr_lock);
/*
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
- scanned = vmpr->scanned;
+ scanned = vmpr->tree_scanned;
if (!scanned) {
spin_unlock(&vmpr->sr_lock);
return;
}
- reclaimed = vmpr->reclaimed;
- vmpr->scanned = 0;
- vmpr->reclaimed = 0;
+ reclaimed = vmpr->tree_reclaimed;
+ vmpr->tree_scanned = 0;
+ vmpr->tree_reclaimed = 0;
spin_unlock(&vmpr->sr_lock);
+ level = vmpressure_calc_level(scanned, reclaimed);
+
do {
- if (vmpressure_event(vmpr, scanned, reclaimed))
+ if (vmpressure_event(vmpr, level))
break;
/*
* If not handled, propagate the event upward into the
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
+ * @tree: legacy subtree mode
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
if (!scanned)
return;
- spin_lock(&vmpr->sr_lock);
- vmpr->scanned += scanned;
- vmpr->reclaimed += reclaimed;
- scanned = vmpr->scanned;
- spin_unlock(&vmpr->sr_lock);
+ if (tree) {
+ spin_lock(&vmpr->sr_lock);
+ vmpr->tree_scanned += scanned;
+ vmpr->tree_reclaimed += reclaimed;
+ scanned = vmpr->scanned;
+ spin_unlock(&vmpr->sr_lock);
- if (scanned < vmpressure_win)
- return;
- schedule_work(&vmpr->work);
+ if (scanned < vmpressure_win)
+ return;
+ schedule_work(&vmpr->work);
+ } else {
+ enum vmpressure_levels level;
+
+ /* For now, no users for root-level efficiency */
+ if (memcg == root_mem_cgroup)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->scanned += scanned;
+ reclaimed = vmpr->reclaimed += reclaimed;
+ if (scanned < vmpressure_win) {
+ spin_unlock(&vmpr->sr_lock);
+ return;
+ }
+ vmpr->scanned = vmpr->reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ if (level > VMPRESSURE_LOW) {
+ /*
+ * Let the socket buffer allocator know that
+ * we are having trouble reclaiming LRU pages.
+ *
+ * For hysteresis keep the pressure state
+ * asserted for a second in which subsequent
+ * pressure events can occur.
+ */
+ memcg->socket_pressure = jiffies + HZ;
+ }
+ }
}
/**
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, vmpressure_win, 0);
+ vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
/**
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
unsigned long lru_pages;
+ unsigned long reclaimed;
unsigned long scanned;
struct lruvec *lruvec;
int swappiness;
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
+ reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
memcg, sc->nr_scanned - scanned,
lru_pages);
+ /* Record the group's reclaim efficiency */
+ vmpressure(sc->gfp_mask, memcg, false,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
reclaim_state->reclaimed_slab = 0;
}
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+ /* Record the subtree's reclaim efficiency */
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);