lib/radix-tree.c: make radix_tree_node_alloc() work correctly within interrupt
author Jan Kara <jack@suse.cz>
Wed, 11 Sep 2013 21:26:05 +0000 (14:26 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 11 Sep 2013 22:59:36 +0000 (15:59 -0700)
When users of radix_tree_preload() run in interrupt context (block/blk-ioc.c
is one such possible user), the following race can happen:

radix_tree_preload()
...
radix_tree_insert()
  radix_tree_node_alloc()
    if (rtp->nr) {
      ret = rtp->nodes[rtp->nr - 1];
<interrupt>
...
radix_tree_preload()
...
radix_tree_insert()
  radix_tree_node_alloc()
    if (rtp->nr) {
      ret = rtp->nodes[rtp->nr - 1];

And we give out one radix tree node twice.  That clearly results in radix
tree corruption with different results (usually OOPS) depending on which
two users of radix tree race.

We fix the problem by making radix_tree_node_alloc() always allocate fresh
radix tree nodes when in interrupt.  Using preloading when in interrupt
doesn't make sense since all the allocations have to be atomic anyway and
we cannot steal nodes from process-context users because some users rely
on radix_tree_insert() succeeding after radix_tree_preload().
The in_interrupt() check is somewhat ugly, but we cannot simply key off the
passed gfp_mask, as that is acquired from root_gfp_mask() and is thus the
same for all preload users.

Another part of the fix is to avoid node preallocation in
radix_tree_preload() when the passed gfp_mask doesn't allow waiting.  Again,
preallocation in such a case doesn't make sense, and if preallocation
happened in interrupt we could possibly leak some allocated nodes.  However,
some users of radix_tree_preload() require the following radix_tree_insert()
to succeed.  To avoid unexpected effects for these users,
radix_tree_preload() only warns if the passed gfp mask doesn't allow waiting,
and we provide a new function, radix_tree_maybe_preload(), for those users
which get different gfp masks from different call sites and which are
prepared to handle radix_tree_insert() failure.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
block/blk-ioc.c
fs/fscache/page.c
include/linux/radix-tree.h
lib/radix-tree.c
mm/filemap.c
mm/shmem.c
mm/swap_state.c

index 4464c823cff2a0f3228a2dd5d54da3c9cdcbbde4..46cd7bd18b347c580bfee1f4b86fb59321070e56 100644 (file)
@@ -367,7 +367,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
        if (!icq)
                return NULL;
 
-       if (radix_tree_preload(gfp_mask) < 0) {
+       if (radix_tree_maybe_preload(gfp_mask) < 0) {
                kmem_cache_free(et->icq_cache, icq);
                return NULL;
        }
index 8702b732109ad38139dfce55c2ff9e9a61923430..73899c1c34494555d73dd5714ecb21eb74c0296d 100644 (file)
@@ -913,7 +913,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
                (1 << FSCACHE_OP_WAITING) |
                (1 << FSCACHE_OP_UNUSE_COOKIE);
 
-       ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+       ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM);
        if (ret < 0)
                goto nomem_free;
 
index ffc444c38b0ab64ab999da3d670dde338e669265..403940787be18230a5201799d57020ed46177cab 100644 (file)
@@ -231,6 +231,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
                                unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
+int radix_tree_maybe_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag);
index e7964296fd50551020872d430009e890d0e97b5d..7811ed3b4e701c2e0d82368a8bae457279ca3246 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
+#include <linux/hardirq.h>             /* in_interrupt() */
 
 
 #ifdef __KERNEL__
@@ -207,7 +208,12 @@ radix_tree_node_alloc(struct radix_tree_root *root)
        struct radix_tree_node *ret = NULL;
        gfp_t gfp_mask = root_gfp_mask(root);
 
-       if (!(gfp_mask & __GFP_WAIT)) {
+       /*
+        * Preload code isn't irq safe and it doesn't make sense to use
+        * preloading in the interrupt anyway as all the allocations have to
+        * be atomic. So just do normal allocation when in interrupt.
+        */
+       if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) {
                struct radix_tree_preload *rtp;
 
                /*
@@ -264,7 +270,7 @@ radix_tree_node_free(struct radix_tree_node *node)
  * To make use of this facility, the radix tree must be initialised without
  * __GFP_WAIT being passed to INIT_RADIX_TREE().
  */
-int radix_tree_preload(gfp_t gfp_mask)
+static int __radix_tree_preload(gfp_t gfp_mask)
 {
        struct radix_tree_preload *rtp;
        struct radix_tree_node *node;
@@ -288,8 +294,39 @@ int radix_tree_preload(gfp_t gfp_mask)
 out:
        return ret;
 }
+
+/*
+ * Load up this CPU's radix_tree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail.  On
+ * success, return zero, with preemption disabled.  On error, return -ENOMEM
+ * with preemption not disabled.
+ *
+ * To make use of this facility, the radix tree must be initialised without
+ * __GFP_WAIT being passed to INIT_RADIX_TREE().
+ */
+int radix_tree_preload(gfp_t gfp_mask)
+{
+       /* Warn on non-sensical use... */
+       WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT));
+       return __radix_tree_preload(gfp_mask);
+}
 EXPORT_SYMBOL(radix_tree_preload);
 
+/*
+ * The same as above function, except we don't guarantee preloading happens.
+ * We do it, if we decide it helps. On success, return zero with preemption
+ * disabled. On error, return -ENOMEM with preemption not disabled.
+ */
+int radix_tree_maybe_preload(gfp_t gfp_mask)
+{
+       if (gfp_mask & __GFP_WAIT)
+               return __radix_tree_preload(gfp_mask);
+       /* Preloading doesn't help anything with this gfp mask, skip it */
+       preempt_disable();
+       return 0;
+}
+EXPORT_SYMBOL(radix_tree_maybe_preload);
+
 /*
  *     Return the maximum key which can be store into a
  *     radix tree with height HEIGHT.
index 731a2c24532df32ca532184e7c714911a338dd9c..e607728db4a8f011ba80739a1f1cea17aff023bc 100644 (file)
@@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
        if (error)
                goto out;
 
-       error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+       error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
        if (error == 0) {
                page_cache_get(page);
                page->mapping = mapping;
index 526149846d0a82370eaf96c32489a45152fd4f66..a1b8bf4391c27739b009e63b0c6105361edcebe8 100644 (file)
@@ -1205,7 +1205,7 @@ repeat:
                                                gfp & GFP_RECLAIM_MASK);
                if (error)
                        goto decused;
-               error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+               error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
                if (!error) {
                        error = shmem_add_to_page_cache(page, mapping, index,
                                                        gfp, NULL);
index f24ab0dff554262e1da6866b866a4584871f8d9c..e6f15f8ca2af339ce9e90159bae0e6a800f06819 100644 (file)
@@ -122,7 +122,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 {
        int error;
 
-       error = radix_tree_preload(gfp_mask);
+       error = radix_tree_maybe_preload(gfp_mask);
        if (!error) {
                error = __add_to_swap_cache(page, entry);
                radix_tree_preload_end();
@@ -328,7 +328,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                /*
                 * call radix_tree_preload() while we can wait.
                 */
-               err = radix_tree_preload(gfp_mask & GFP_KERNEL);
+               err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
                if (err)
                        break;