hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race

author Mike Kravetz <mike.kravetz@oracle.com>

Fri, 28 Dec 2018 08:39:42 +0000 (00:39 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 28 Dec 2018 20:11:52 +0000 (12:11 -0800)
author Mike Kravetz <mike.kravetz@oracle.com>
Fri, 28 Dec 2018 08:39:42 +0000 (00:39 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Dec 2018 20:11:52 +0000 (12:11 -0800)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c

index 32920a10100e23fc60f53cf36c882278ae972cee..a2fcea5f8225c1ec0228623b1f332ee02bba9c1c 100644 (file)
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
   * truncation is indicated by end of range being LLONG_MAX
   *     In this case, we first scan the range and release found pages.
   *     After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- *     maps and global counts.  Page faults can not race with truncation
- *     in this routine.  hugetlb_no_page() prevents page faults in the
- *     truncated range.  It checks i_size before allocation, and again after
- *     with the page table lock for the page held.  The same lock must be
- *     acquired to unmap a page.
+ *     maps and global counts.
   * hole punch is indicated if end is not LLONG_MAX
   *     In the hole punch case we scan the range and release found pages.
   *     Only when releasing a page is the associated region/reserv map
   *     deleted.  The region/reserv map for ranges without associated
- *     pages are not modified.  Page faults can race with hole punch.
- *     This is indicated if we find a mapped page.
+ *     pages are not modified.
+ *
+ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
+ * races with page faults.
+ *
   * Note: If the passed end of range value is beyond the end of file, but
   * not LLONG_MAX this routine still performs a hole punch operation.
   */
@@ -423,32 +422,14 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
  
                 for (i = 0; i < pagevec_count(&pvec); ++i) {
                         struct page *page = pvec.pages[i];
-                       u32 hash;
  
                         index = page->index;
-                       hash = hugetlb_fault_mutex_hash(h, current->mm,
-                                                       &pseudo_vma,
-                                                       mapping, index, 0);
-                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
                         /*
-                        * If page is mapped, it was faulted in after being
-                        * unmapped in caller.  Unmap (again) now after taking
-                        * the fault mutex.  The mutex will prevent faults
-                        * until we finish removing the page.
-                        *
-                        * This race can only happen in the hole punch case.
-                        * Getting here in a truncate operation is a bug.
+                        * A mapped page is impossible as callers should unmap
+                        * all references before calling.  And, i_mmap_rwsem
+                        * prevents the creation of additional mappings.
                          */
-                       if (unlikely(page_mapped(page))) {
-                               BUG_ON(truncate_op);
-
-                               i_mmap_lock_write(mapping);
-                               hugetlb_vmdelete_list(&mapping->i_mmap,
-                                       index * pages_per_huge_page(h),
-                                       (index + 1) * pages_per_huge_page(h));
-                               i_mmap_unlock_write(mapping);
-                       }
+                       VM_BUG_ON(page_mapped(page));
  
                         lock_page(page);
                         /*
@@ -470,7 +451,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                         }
  
                         unlock_page(page);
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                 }
                 huge_pagevec_release(&pvec);
                 cond_resched();
@@ -482,9 +462,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
  
  static void hugetlbfs_evict_inode(struct inode *inode)
  {
+       struct address_space *mapping = inode->i_mapping;
         struct resv_map *resv_map;
  
+       /*
+        * The vfs layer guarantees that there are no other users of this
+        * inode.  Therefore, it would be safe to call remove_inode_hugepages
+        * without holding i_mmap_rwsem.  We acquire and hold here to be
+        * consistent with other callers.  Since there will be no contention
+        * on the semaphore, overhead is negligible.
+        */
+       i_mmap_lock_write(mapping);
         remove_inode_hugepages(inode, 0, LLONG_MAX);
+       i_mmap_unlock_write(mapping);
+
         resv_map = (struct resv_map *)inode->i_mapping->private_data;
         /* root inode doesn't have the resv_map, so we should check it */
         if (resv_map)
@@ -505,8 +496,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
         i_mmap_lock_write(mapping);
         if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
                 hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-       i_mmap_unlock_write(mapping);
         remove_inode_hugepages(inode, offset, LLONG_MAX);
+       i_mmap_unlock_write(mapping);
         return 0;
  }
  
@@ -540,8 +531,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                         hugetlb_vmdelete_list(&mapping->i_mmap,
                                                 hole_start >> PAGE_SHIFT,
                                                 hole_end  >> PAGE_SHIFT);
-               i_mmap_unlock_write(mapping);
                 remove_inode_hugepages(inode, hole_start, hole_end);
+               i_mmap_unlock_write(mapping);
                 inode_unlock(inode);
         }
  
@@ -624,7 +615,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                 /* addr is the offset within the file (zero based) */
                 addr = index * hpage_size;
  
-               /* mutex taken here, fault path and hole punch */
+               /*
+                * fault mutex taken here, protects against fault path
+                * and hole punch.  inode_lock previously taken protects
+                * against truncation.
+                */
                 hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
                                                 index, addr);
                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 87fd3ab809c68feab81c67a19837c5b8a5557ce8..e37efd5d831830123ca117a0efec6eead901eaa7 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         }
  
         /*
-        * Use page lock to guard against racing truncation
-        * before we get page_table_lock.
+        * We can not race with truncation due to holding i_mmap_rwsem.
+        * Check once here for faults beyond end of file.
          */
+       size = i_size_read(mapping->host) >> huge_page_shift(h);
+       if (idx >= size)
+               goto out;
+
  retry:
         page = find_lock_page(mapping, idx);
         if (!page) {
-               size = i_size_read(mapping->host) >> huge_page_shift(h);
-               if (idx >= size)
-                       goto out;
-
                 /*
                  * Check for page in userfault range
                  */
@@ -3854,9 +3854,6 @@ retry:
         }
  
         ptl = huge_pte_lock(h, mm, ptep);
-       size = i_size_read(mapping->host) >> huge_page_shift(h);
-       if (idx >= size)
-               goto backout;
  
         ret = 0;
         if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3959,8 +3956,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  
         /*
          * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-        * until finished with ptep.  This prevents huge_pmd_unshare from
-        * being called elsewhere and making the ptep no longer valid.
+        * until finished with ptep.  This serves two purposes:
+        * 1) It prevents huge_pmd_unshare from being called elsewhere
+        *    and making the ptep no longer valid.
+        * 2) It synchronizes us with file truncation.
          *
          * ptep could have already been assigned via huge_pte_offset.  That
          * is OK, as huge_pte_alloc will return the same value unless
author	Mike Kravetz <mike.kravetz@oracle.com>
	Fri, 28 Dec 2018 08:39:42 +0000 (00:39 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 28 Dec 2018 20:11:52 +0000 (12:11 -0800)
fs/hugetlbfs/inode.c		patch | blob | history
mm/hugetlb.c		patch | blob | history