mm/hugetlb: fix huge page reservation leak in private mapping error paths

author Mike Kravetz <mike.kravetz@oracle.com>
Thu, 10 Nov 2016 18:46:32 +0000 (10:46 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 11 Nov 2016 16:12:37 +0000 (08:12 -0800)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index ec49d9ef1eefd0155f099d813ed547c16172d69e..418bf01a50ed1f9dde0ca6c083aafa86c1869f23 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1826,11 +1826,17 @@ static void return_unused_surplus_pages(struct hstate *h,
   * is not the case is if a reserve map was changed between calls.  It
   * is the responsibility of the caller to notice the difference and
   * take appropriate action.
+ *
+ * vma_add_reservation is used in error paths where a reservation must
+ * be restored when a newly allocated huge page must be freed.  It is
+ * to be called after calling vma_needs_reservation to determine if a
+ * reservation exists.
   */
  enum vma_resv_mode {
         VMA_NEEDS_RESV,
         VMA_COMMIT_RESV,
         VMA_END_RESV,
+       VMA_ADD_RESV,   /* passed by vma_add_reservation() on error paths */
  };
  static long __vma_reservation_common(struct hstate *h,
                                 struct vm_area_struct *vma, unsigned long addr,
@@ -1856,6 +1862,14 @@ static long __vma_reservation_common(struct hstate *h,
                 region_abort(resv, idx, idx + 1);
                 ret = 0;
                 break;
+       case VMA_ADD_RESV:
+               if (vma->vm_flags & VM_MAYSHARE)
+                       ret = region_add(resv, idx, idx + 1);
+               else {
+                       region_abort(resv, idx, idx + 1);
+                       ret = region_del(resv, idx, idx + 1);
+               }
+               break;
         default:
                 BUG();
         }
@@ -1903,6 +1917,56 @@ static void vma_end_reservation(struct hstate *h,
         (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
  }
  
+static long vma_add_reservation(struct hstate *h,
+                       struct vm_area_struct *vma, unsigned long addr)
+{      /* Error-path helper: returns the VMA_ADD_RESV result unchanged. */
+       return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
+}
+
+/*
+ * This routine is called to restore a reservation on error paths.  In the
+ * specific error paths, a huge page was allocated (via alloc_huge_page)
+ * and is about to be freed.  If a reservation for the page existed,
+ * alloc_huge_page would have consumed the reservation and set PagePrivate
+ * in the newly allocated page.  When the page is freed via free_huge_page,
+ * the global reservation count will be incremented if PagePrivate is set.
+ * However, free_huge_page cannot adjust the reserve map.  Adjust the
+ * reserve map here to be consistent with global reserve count adjustments
+ * to be made by free_huge_page.
+ *
+ * @h:       hstate handed through to the reservation helpers
+ * @vma:     mapping whose reserve map is adjusted
+ * @address: address whose reservation is being restored (passed to the
+ *           reservation helpers together with @vma)
+ * @page:    the huge page about to be freed; nothing is done unless
+ *           PagePrivate is set on it
+ *
+ * Both call sites added by this patch call put_page() on @page
+ * immediately after this routine returns.
+ *
+ * Outcome by the value vma_needs_reservation() returns:
+ *   < 0   reserve map manipulation failed; PagePrivate is cleared
+ *   > 0   vma_add_reservation() is called; if that fails, PagePrivate
+ *         is cleared as well
+ *   == 0  vma_end_reservation() closes out the vma_needs_reservation()
+ *         call
+ */
+static void restore_reserve_on_error(struct hstate *h,
+                       struct vm_area_struct *vma, unsigned long address,
+                       struct page *page)
+{
+       if (unlikely(PagePrivate(page))) {
+               long rc = vma_needs_reservation(h, vma, address);
+
+               if (unlikely(rc < 0)) {
+                       /*
+                        * Rare out of memory condition in reserve map
+                        * manipulation.  Clear PagePrivate so that
+                        * global reserve count will not be incremented
+                        * by free_huge_page.  This will make it appear
+                        * as though the reservation for this page was
+                        * consumed.  This may prevent the task from
+                        * faulting in the page at a later time.  This
+                        * is better than inconsistent global huge page
+                        * accounting of reserve counts.
+                        */
+                       ClearPagePrivate(page);
+               } else if (rc) {
+                       rc = vma_add_reservation(h, vma, address);
+                       if (unlikely(rc < 0))
+                               /*
+                                * See above comment about rare out of
+                                * memory condition.
+                                */
+                               ClearPagePrivate(page);
+               } else
+                       vma_end_reservation(h, vma, address);
+       }
+}
+
  struct page *alloc_huge_page(struct vm_area_struct *vma,
                                     unsigned long addr, int avoid_reserve)
  {
@@ -3498,6 +3562,7 @@ retry_avoidcopy:
         spin_unlock(ptl);
         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
  out_release_all:
+       restore_reserve_on_error(h, vma, address, new_page);
         put_page(new_page);
  out_release_old:
         put_page(old_page);
@@ -3680,6 +3745,7 @@ backout:
         spin_unlock(ptl);
  backout_unlocked:
         unlock_page(page);
+       restore_reserve_on_error(h, vma, address, page);
         put_page(page);
         goto out;
  }
author	Mike Kravetz <mike.kravetz@oracle.com>
	Thu, 10 Nov 2016 18:46:32 +0000 (10:46 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 11 Nov 2016 16:12:37 +0000 (08:12 -0800)