From e11f8b7b6c4ea13bf8af6b8f42b45e15b554a92b Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 7 Apr 2017 16:04:57 -0700 Subject: [PATCH] dax: fix radix tree insertion race While running generic/340 in my test setup I hit the following race. It can happen with kernels that support FS DAX PMDs, so v4.10 thru v4.11-rc5. Thread 1 Thread 2 -------- -------- dax_iomap_pmd_fault() grab_mapping_entry() spin_lock_irq() get_unlocked_mapping_entry() 'entry' is NULL, can't call lock_slot() spin_unlock_irq() radix_tree_preload() dax_iomap_pmd_fault() grab_mapping_entry() spin_lock_irq() get_unlocked_mapping_entry() ... lock_slot() spin_unlock_irq() dax_pmd_insert_mapping() spin_lock_irq() __radix_tree_insert() fails with -EEXIST The issue is that we have to drop mapping->tree_lock while calling radix_tree_preload(), but since we didn't have a radix tree entry to lock (unlike in the pmd_downgrade case) we have no protection against Thread 2 coming along and inserting a PMD at the same index. For 4k entries we handled this with a special-case response to -EEXIST coming from the __radix_tree_insert(), but this doesn't save us for PMDs because the -EEXIST case can also mean that we collided with a 4k entry in the radix tree at a different index, but one that is covered by our PMD range. So, correctly handle both the 4k and 2M collision cases by explicitly re-checking the radix tree for an entry at our index once we reacquire mapping->tree_lock. This patch has made it through a clean xfstests run with the current v4.11-rc5 based linux/master, and it also ran generic/340 500 times in a loop. It used to fail within the first 10 iterations. Link: http://lkml.kernel.org/r/20170406212944.2866-1-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: "Darrick J. Wong" Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Jan Kara Cc: Matthew Wilcox Cc: [4.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index de622d4282a6..85abd741253d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -373,6 +373,22 @@ restart: } spin_lock_irq(&mapping->tree_lock); + if (!entry) { + /* + * We needed to drop the page_tree lock while calling + * radix_tree_preload() and we didn't have an entry to + * lock. See if another thread inserted an entry at + * our index during this time. + */ + entry = __radix_tree_lookup(&mapping->page_tree, index, + NULL, &slot); + if (entry) { + radix_tree_preload_end(); + spin_unlock_irq(&mapping->tree_lock); + goto restart; + } + } + if (pmd_downgrade) { radix_tree_delete(&mapping->page_tree, index); mapping->nrexceptional--; @@ -388,19 +404,12 @@ restart: if (err) { spin_unlock_irq(&mapping->tree_lock); /* - * Someone already created the entry? This is a - * normal failure when inserting PMDs in a range - * that already contains PTEs. In that case we want - * to return -EEXIST immediately. - */ - if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD)) - goto restart; - /* - * Our insertion of a DAX PMD entry failed, most - * likely because it collided with a PTE sized entry - * at a different index in the PMD range. We haven't - * inserted anything into the radix tree and have no - * waiters to wake. + * Our insertion of a DAX entry failed, most likely + * because we were inserting a PMD entry and it + * collided with a PTE sized entry at a different + * index in the PMD range. We haven't inserted + * anything into the radix tree and have no waiters to + * wake. */ return ERR_PTR(err); } -- 2.30.2