swap: fix races exposed by swap discard

author Shaohua Li <shli@kernel.org>

Wed, 11 Sep 2013 21:20:31 +0000 (14:20 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 11 Sep 2013 22:57:16 +0000 (15:57 -0700)
author Shaohua Li <shli@kernel.org>
Wed, 11 Sep 2013 21:20:31 +0000 (14:20 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 11 Sep 2013 22:57:16 +0000 (15:57 -0700)
diff --git a/mm/swapfile.c b/mm/swapfile.c

index dac47c66055c47f5c359bf545af69233b452b089..98e52e373bd80be9a35f4197556994d40c2acc7d 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -370,7 +370,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
                  * instead of free it immediately. The cluster will be freed
                  * after discard.
                  */
-               if (p->flags & SWP_PAGE_DISCARD) {
+               if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+                                (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
                         swap_cluster_schedule_discard(p, idx);
                         return;
                 }
@@ -1288,7 +1289,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                         else
                                 continue;
                 }
-               count = si->swap_map[i];
+               count = ACCESS_ONCE(si->swap_map[i]);
                 if (count && swap_count(count) != SWAP_MAP_BAD)
                         break;
         }
@@ -1308,7 +1309,11 @@ int try_to_unuse(unsigned int type, bool frontswap,
  {
         struct swap_info_struct *si = swap_info[type];
         struct mm_struct *start_mm;
-       unsigned char *swap_map;
+       volatile unsigned char *swap_map; /* swap_map is accessed without
+                                          * locking. Mark it as volatile
+                                          * to prevent the compiler from
+                                          * doing something odd.
+                                          */
         unsigned char swcount;
         struct page *page;
         swp_entry_t entry;
@@ -1359,7 +1364,15 @@ int try_to_unuse(unsigned int type, bool frontswap,
                          * reused since sys_swapoff() already disabled
                          * allocation from here, or alloc_page() failed.
                          */
-                       if (!*swap_map)
+                       swcount = *swap_map;
+                       /*
+                        * We don't hold the lock here, so the swap entry could
+                        * be SWAP_MAP_BAD (while its cluster is being
+                        * discarded). Instead of failing, we can just skip the
+                        * swap entry, because swapoff will wait for the
+                        * discard to finish anyway.
+                        */
+                       if (!swcount || swcount == SWAP_MAP_BAD)
                                 continue;
                         retval = -ENOMEM;
                         break;
@@ -2543,6 +2556,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
                 goto unlock_out;
  
         count = p->swap_map[offset];
+
+       /*
+        * swapin_readahead() doesn't check if a swap entry is valid, so the
+        * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+        */
+       if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
+               err = -ENOENT;
+               goto unlock_out;
+       }
+
         has_cache = count & SWAP_HAS_CACHE;
         count &= ~SWAP_HAS_CACHE;
         err = 0;
author	Shaohua Li <shli@kernel.org>
	Wed, 11 Sep 2013 21:20:31 +0000 (14:20 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 11 Sep 2013 22:57:16 +0000 (15:57 -0700)