/* list_lru_walk_cb must always return one of these */
enum lru_status {
LRU_REMOVED, /* item removed from list */
+ LRU_REMOVED_RETRY, /* item removed, but lock has been
+ dropped and reacquired */
LRU_ROTATE, /* item referenced, give another pass */
LRU_SKIP, /* item cannot be locked, skip */
LRU_RETRY, /* item not freeable. May drop the lock
    internally, but has to return locked. */
};
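
For illustration, a schematic isolate callback showing how these return values are meant to be used. This is a sketch, not part of the patch: my_object, my_trylock(), my_object_in_use(), my_unlock() and my_free() are hypothetical helpers; the real example in this series is shadow_lru_isolate() further down.

static enum lru_status my_isolate(struct list_head *item,
				  spinlock_t *lru_lock, void *cb_arg)
{
	/* my_object embeds a struct list_head named lru */
	struct my_object *obj = container_of(item, struct my_object, lru);

	if (!my_trylock(obj))
		return LRU_SKIP;	/* cannot be locked, skip it */

	if (my_object_in_use(obj)) {
		my_unlock(obj);
		return LRU_ROTATE;	/* referenced, give it another pass */
	}

	/* Remove the item, then drop the lru lock for the expensive part */
	list_del_init(item);
	spin_unlock(lru_lock);
	my_unlock(obj);
	my_free(obj);
	spin_lock(lru_lock);		/* must return with the lock held */
	return LRU_REMOVED_RETRY;	/* removed, but lru lock was dropped */
}

Had the lru lock not been dropped, plain LRU_REMOVED would be the correct return; LRU_RETRY is for items that turn out not to be freeable.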
void list_lru_destroy(struct list_lru *lru);
-int list_lru_init(struct list_lru *lru);
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
+static inline int list_lru_init(struct list_lru *lru)
+{
+ return list_lru_init_key(lru, NULL);
+}
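
A hypothetical caller of the new initializer, mirroring what workingset_init() does later in this patch: pass a dedicated lock_class_key when the lru's per-node locks need their own lockdep class, e.g. because they nest inside another lock.

static struct lock_class_key my_lru_key;	/* hypothetical example */
static struct list_lru my_lru;

static int __init my_lru_setup(void)
{
	return list_lru_init_key(&my_lru, &my_lru_key);
}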
/**
* list_lru_add: add an element to the lru list's tail
#endif
WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE,
+ WORKINGSET_NODERECLAIM,
NR_ANON_TRANSPARENT_HUGEPAGES,
NR_FREE_CMA_PAGES,
NR_VM_ZONE_STAT_ITEMS };
#define RADIX_TREE_TAG_LONGS \
((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
+ RADIX_TREE_MAP_SHIFT))
+
+/* Height component in node->path */
+#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1)
+#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)
+
+/* Internally used bits of node->count */
+#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
+#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
+
struct radix_tree_node {
- unsigned int height; /* Height from the bottom */
+ unsigned int path; /* Offset in parent & height from the bottom */
unsigned int count;
union {
- struct radix_tree_node *parent; /* Used when ascending tree */
- struct rcu_head rcu_head; /* Used when freeing node */
+ struct {
+ /* Used when ascending tree */
+ struct radix_tree_node *parent;
+ /* For tree user */
+ void *private_data;
+ };
+ /* Used when freeing node */
+ struct rcu_head rcu_head;
};
+ /* For tree user */
+ struct list_head private_list;
void __rcu *slots[RADIX_TREE_MAP_SIZE];
unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};
-#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
- RADIX_TREE_MAP_SHIFT))
-
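
To see how the new node->path field encodes both values, here is a small standalone userspace sketch of the packing. It is not part of the patch and assumes RADIX_TREE_MAP_SHIFT of 6, the common default:

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define RADIX_TREE_MAP_SHIFT	6	/* assumption for this sketch */
#define RADIX_TREE_INDEX_BITS	(8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH	DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
					     RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_HEIGHT_SHIFT	(RADIX_TREE_MAX_PATH + 1)
#define RADIX_TREE_HEIGHT_MASK	((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)

int main(void)
{
	unsigned int height = 3;	/* height from the bottom */
	unsigned int offset = 42;	/* slot index in the parent node */
	unsigned int path;

	/* Pack, as the insertion paths in the patch do */
	path = height;
	path |= offset << RADIX_TREE_HEIGHT_SHIFT;

	/* Unpack, as the lookup and deletion paths do */
	assert((path & RADIX_TREE_HEIGHT_MASK) == height);
	assert(path >> RADIX_TREE_HEIGHT_SHIFT == offset);

	printf("path=%#x -> height=%u, offset in parent=%u\n",
	       path, height, offset);
	return 0;
}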
/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
struct radix_tree_root {
unsigned int height;
struct radix_tree_node **nodep, void ***slotp);
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
void *workingset_eviction(struct address_space *mapping, struct page *page);
bool workingset_refault(void *shadow);
void workingset_activation(struct page *page);
+extern struct list_lru workingset_shadow_nodes;
+
+static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
+{
+ return node->count & RADIX_TREE_COUNT_MASK;
+}
+
+static inline void workingset_node_pages_inc(struct radix_tree_node *node)
+{
+ node->count++;
+}
+
+static inline void workingset_node_pages_dec(struct radix_tree_node *node)
+{
+ node->count--;
+}
+
+static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
+{
+ return node->count >> RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
+{
+ node->count += 1U << RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
+{
+ node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+}
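
These helpers split node->count into two counters: the number of real pages in the low RADIX_TREE_COUNT_SHIFT bits and the number of shadow entries above them. A standalone userspace sketch of that bookkeeping, again assuming RADIX_TREE_MAP_SHIFT of 6 and not part of the patch:

#include <assert.h>

#define RADIX_TREE_MAP_SHIFT	6	/* assumption for this sketch */
#define RADIX_TREE_COUNT_SHIFT	(RADIX_TREE_MAP_SHIFT + 1)
#define RADIX_TREE_COUNT_MASK	((1UL << RADIX_TREE_COUNT_SHIFT) - 1)

int main(void)
{
	unsigned int count = 0;

	/* Two pages inserted into the node: pages live in the low bits */
	count++;
	count++;

	/* One page evicted and replaced by a shadow entry */
	count--;
	count += 1U << RADIX_TREE_COUNT_SHIFT;

	assert((count & RADIX_TREE_COUNT_MASK) == 1);	/* pages left */
	assert((count >> RADIX_TREE_COUNT_SHIFT) == 1);	/* shadow entries */
	return 0;
}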
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
/* Increase the height. */
newheight = root->height+1;
- node->height = newheight;
+ BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
+ node->path = newheight;
node->count = 1;
node->parent = NULL;
slot = root->rnode;
/* Have to add a child node. */
if (!(slot = radix_tree_node_alloc(root)))
return -ENOMEM;
- slot->height = height;
+ slot->path = height;
slot->parent = node;
if (node) {
rcu_assign_pointer(node->slots[offset], slot);
node->count++;
+ slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
} else
rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
}
}
node = indirect_to_ptr(node);
- height = node->height;
+ height = node->path & RADIX_TREE_HEIGHT_MASK;
if (index > radix_tree_maxindex(height))
return NULL;
return (index == 0);
node = indirect_to_ptr(node);
- height = node->height;
+ height = node->path & RADIX_TREE_HEIGHT_MASK;
if (index > radix_tree_maxindex(height))
return 0;
{
unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
struct radix_tree_node *rnode, *node;
- unsigned long index, offset;
+ unsigned long index, offset, height;
if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
return NULL;
return NULL;
restart:
- shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT;
+ height = rnode->path & RADIX_TREE_HEIGHT_MASK;
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
offset = index >> shift;
/* Index outside of the tree */
unsigned int shift, height;
unsigned long i;
- height = slot->height;
+ height = slot->path & RADIX_TREE_HEIGHT_MASK;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
for ( ; height > 1; height--) {
}
node = indirect_to_ptr(node);
- max_index = radix_tree_maxindex(node->height);
+ max_index = radix_tree_maxindex(node->path &
+ RADIX_TREE_HEIGHT_MASK);
if (cur_index > max_index) {
rcu_read_unlock();
break;
*
* Returns %true if @node was freed, %false otherwise.
*/
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node)
{
bool deleted = false;
parent = node->parent;
if (parent) {
- index >>= RADIX_TREE_MAP_SHIFT;
+ unsigned int offset;
- parent->slots[index & RADIX_TREE_MAP_MASK] = NULL;
+ offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
+ parent->slots[offset] = NULL;
parent->count--;
} else {
root_tag_clear_all(root);
node->slots[offset] = NULL;
node->count--;
- __radix_tree_delete_node(root, index, node);
+ __radix_tree_delete_node(root, node);
return entry;
}
EXPORT_SYMBOL(radix_tree_tagged);
static void
-radix_tree_node_ctor(void *node)
+radix_tree_node_ctor(void *arg)
{
- memset(node, 0, sizeof(struct radix_tree_node));
+ struct radix_tree_node *node = arg;
+
+ memset(node, 0, sizeof(*node));
+ INIT_LIST_HEAD(&node->private_list);
}
static __init unsigned long __maxindex(unsigned int height)
static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
- if (shadow) {
- void **slot;
+ struct radix_tree_node *node;
+ unsigned long index;
+ unsigned int offset;
+ unsigned int tag;
+ void **slot;
- slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
- radix_tree_replace_slot(slot, shadow);
+ VM_BUG_ON(!PageLocked(page));
+
+ __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+ if (shadow) {
mapping->nrshadows++;
/*
* Make sure the nrshadows update is committed before
* same time and miss a shadow entry.
*/
smp_wmb();
- } else
- radix_tree_delete(&mapping->page_tree, page->index);
+ }
mapping->nrpages--;
+
+ if (!node) {
+ /* Clear direct pointer tags in root node */
+ mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+ radix_tree_replace_slot(slot, shadow);
+ return;
+ }
+
+ /* Clear tree tags for the removed page */
+ index = page->index;
+ offset = index & RADIX_TREE_MAP_MASK;
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+ if (test_bit(offset, node->tags[tag]))
+ radix_tree_tag_clear(&mapping->page_tree, index, tag);
+ }
+
+ /* Delete page, swap shadow entry */
+ radix_tree_replace_slot(slot, shadow);
+ workingset_node_pages_dec(node);
+ if (shadow)
+ workingset_node_shadows_inc(node);
+ else
+ if (__radix_tree_delete_node(&mapping->page_tree, node))
+ return;
+
+ /*
+ * Track node that only contains shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already tracked. The
+ * list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_pages(node) &&
+ list_empty(&node->private_list)) {
+ node->private_data = mapping;
+ list_lru_add(&workingset_shadow_nodes, &node->private_list);
+ }
}
/*
static int page_cache_tree_insert(struct address_space *mapping,
struct page *page, void **shadowp)
{
+ struct radix_tree_node *node;
void **slot;
int error;
- slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
- if (slot) {
+ error = __radix_tree_create(&mapping->page_tree, page->index,
+ &node, &slot);
+ if (error)
+ return error;
+ if (*slot) {
void *p;
p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
if (!radix_tree_exceptional_entry(p))
return -EEXIST;
- radix_tree_replace_slot(slot, page);
- mapping->nrshadows--;
- mapping->nrpages++;
if (shadowp)
*shadowp = p;
- return 0;
+ mapping->nrshadows--;
+ if (node)
+ workingset_node_shadows_dec(node);
}
- error = radix_tree_insert(&mapping->page_tree, page->index, page);
- if (!error)
- mapping->nrpages++;
- return error;
+ radix_tree_replace_slot(slot, page);
+ mapping->nrpages++;
+ if (node) {
+ workingset_node_pages_inc(node);
+ /*
+ * Don't track node that contains actual pages.
+ *
+ * Avoid acquiring the list_lru lock if already
+ * untracked. The list_empty() test is safe as
+ * node->private_list is protected by
+ * mapping->tree_lock.
+ */
+ if (!list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes,
+ &node->private_list);
+ }
+ return 0;
}
static int __add_to_page_cache_locked(struct page *page,
ret = isolate(item, &nlru->lock, cb_arg);
switch (ret) {
+ case LRU_REMOVED_RETRY:
+ assert_spin_locked(&nlru->lock);
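+ /* fall through */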
case LRU_REMOVED:
if (--nlru->nr_items == 0)
node_clear(nid, lru->active_nodes);
WARN_ON_ONCE(nlru->nr_items < 0);
isolated++;
+ /*
+ * If the lru lock has been dropped, our list
+ * traversal is now invalid and so we have to
+ * restart from scratch.
+ */
+ if (ret == LRU_REMOVED_RETRY)
+ goto restart;
break;
case LRU_ROTATE:
list_move_tail(item, &nlru->list);
* The lru lock has been dropped, our list traversal is
* now invalid and so we have to restart from scratch.
*/
+ assert_spin_locked(&nlru->lock);
goto restart;
default:
BUG();
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
-int list_lru_init(struct list_lru *lru)
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
nodes_clear(lru->active_nodes);
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
+ if (key)
+ lockdep_set_class(&lru->node[i].lock, key);
INIT_LIST_HEAD(&lru->node[i].list);
lru->node[i].nr_items = 0;
}
return 0;
}
-EXPORT_SYMBOL_GPL(list_lru_init);
+EXPORT_SYMBOL_GPL(list_lru_init_key);
void list_lru_destroy(struct list_lru *lru)
{
static void clear_exceptional_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
+ struct radix_tree_node *node;
+ void **slot;
+
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return;
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
- if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry)
- mapping->nrshadows--;
+ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+ goto unlock;
+ if (*slot != entry)
+ goto unlock;
+ radix_tree_replace_slot(slot, NULL);
+ mapping->nrshadows--;
+ if (!node)
+ goto unlock;
+ workingset_node_shadows_dec(node);
+ /*
+ * Don't track node without shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already untracked.
+ * The list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_shadows(node) &&
+ !list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes, &node->private_list);
+ __radix_tree_delete_node(&mapping->page_tree, node);
+unlock:
spin_unlock_irq(&mapping->tree_lock);
}
#endif
"workingset_refault",
"workingset_activate",
+ "workingset_nodereclaim",
"nr_anon_transparent_hugepages",
"nr_free_cma",
"nr_dirty_threshold",
{
atomic_long_inc(&page_zone(page)->inactive_age);
}
+
+/*
+ * Shadow entries reflect the share of the working set that does not
+ * fit into memory, so their number depends on the access pattern of
+ * the workload. In most cases, they will refault or get reclaimed
+ * along with the inode, but a (malicious) workload that streams
+ * through files with a total size several times that of available
+ * memory, while preventing the inodes from being reclaimed, can
+ * create excessive amounts of shadow nodes. To keep a lid on this,
+ * track shadow nodes and reclaim them when they grow way past the
+ * point where they would still be useful.
+ */
+
+struct list_lru workingset_shadow_nodes;
+
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long shadow_nodes;
+ unsigned long max_nodes;
+ unsigned long pages;
+
+ /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ local_irq_disable();
+ shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+ local_irq_enable();
+
+ pages = node_present_pages(sc->nid);
+ /*
+ * Active cache pages are limited to 50% of memory, and shadow
+ * entries that represent a refault distance bigger than that
+ * do not have any effect. Limit the number of shadow nodes
+ * such that shadow entries do not exceed the number of active
+ * cache pages, assuming a worst-case node population density
+ * of 1/8th on average.
+ *
+ * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+ * each, this will reclaim shadow entries when they consume
+ * ~2% of available memory:
+ *
+ * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+ */
+ max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+
+ if (shadow_nodes <= max_nodes)
+ return 0;
+
+ return shadow_nodes - max_nodes;
+}
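
As a quick sanity check of the limit above: with the assumed worst-case density of 1/8th, the node cap translates exactly into the 50%-of-memory bound on shadow entries. A standalone userspace sketch, not part of the patch, again assuming RADIX_TREE_MAP_SHIFT of 6; the memory size is just an example:

#include <assert.h>
#include <stdio.h>

#define RADIX_TREE_MAP_SHIFT	6	/* assumption for this sketch */
#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)

int main(void)
{
	unsigned long pages = 4UL << 20;	/* e.g. 16G worth of 4k pages */
	unsigned long active_limit = pages / 2;	/* active cache: 50% of memory */
	unsigned long worst_case_fill = RADIX_TREE_MAP_SIZE / 8; /* 1/8 density */
	unsigned long max_nodes;

	/* Same formula as count_shadow_nodes() */
	max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);

	/* The node cap bounds shadow entries by the active cache limit */
	assert(max_nodes * worst_case_fill == active_limit);

	printf("pages=%lu -> max_nodes=%lu, max shadow entries=%lu\n",
	       pages, max_nodes, max_nodes * worst_case_fill);
	return 0;
}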
+
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct address_space *mapping;
+ struct radix_tree_node *node;
+ unsigned int i;
+ int ret;
+
+ /*
+ * Page cache insertions and deletions synchronously maintain
+ * the shadow node LRU under the mapping->tree_lock and the
+ * lru_lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the lru_lock pins any
+ * address_space that has radix tree nodes on the LRU.
+ *
+ * We can then safely transition to the mapping->tree_lock to
+ * pin only the address_space of the particular node we want
+ * to reclaim, take the node off-LRU, and drop the lru_lock.
+ */
+
+ node = container_of(item, struct radix_tree_node, private_list);
+ mapping = node->private_data;
+
+ /* Coming from the list, invert the lock order */
+ if (!spin_trylock(&mapping->tree_lock)) {
+ spin_unlock(lru_lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
+
+ list_del_init(item);
+ spin_unlock(lru_lock);
+
+ /*
+ * The node should contain only shadow entries and no pages, so
+ * we expect to be able to remove them all and delete and free
+ * the empty node afterwards.
+ */
+
+ BUG_ON(!node->count);
+ BUG_ON(node->count & RADIX_TREE_COUNT_MASK);
+
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[i]) {
+ BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+ node->slots[i] = NULL;
+ BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
+ node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+ BUG_ON(!mapping->nrshadows);
+ mapping->nrshadows--;
+ }
+ }
+ BUG_ON(node->count);
+ inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+ if (!__radix_tree_delete_node(&mapping->page_tree, node))
+ BUG();
+
+ spin_unlock(&mapping->tree_lock);
+ ret = LRU_REMOVED_RETRY;
+out:
+ local_irq_enable();
+ cond_resched();
+ local_irq_disable();
+ spin_lock(lru_lock);
+ return ret;
+}
+
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long ret;
+
+ /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ local_irq_disable();
+ ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
+ shadow_lru_isolate, NULL, &sc->nr_to_scan);
+ local_irq_enable();
+ return ret;
+}
+
+static struct shrinker workingset_shadow_shrinker = {
+ .count_objects = count_shadow_nodes,
+ .scan_objects = scan_shadow_nodes,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_NUMA_AWARE,
+};
+
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * mapping->tree_lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
+static int __init workingset_init(void)
+{
+ int ret;
+
+ ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+ if (ret)
+ goto err;
+ ret = register_shrinker(&workingset_shadow_shrinker);
+ if (ret)
+ goto err_list_lru;
+ return 0;
+err_list_lru:
+ list_lru_destroy(&workingset_shadow_nodes);
+err:
+ return ret;
+}
+module_init(workingset_init);