userfaultfd: hugetlbfs: add UFFDIO_COPY support for shared mappings

author Mike Kravetz <mike.kravetz@oracle.com>

Wed, 22 Feb 2017 23:43:43 +0000 (15:43 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 23 Feb 2017 00:41:28 +0000 (16:41 -0800)
author Mike Kravetz <mike.kravetz@oracle.com>
Wed, 22 Feb 2017 23:43:43 +0000 (15:43 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 23 Feb 2017 00:41:28 +0000 (16:41 -0800)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index f6c7ff316daf5a7451b2bd3daef648d6f0b8491b..30e7709a5121981c6792dfe2294e8137adc1ece5 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3992,6 +3992,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                             unsigned long src_addr,
                             struct page **pagep)
  {
+       int vm_shared = dst_vma->vm_flags & VM_SHARED;
         struct hstate *h = hstate_vma(dst_vma);
         pte_t _dst_pte;
         spinlock_t *ptl;
@@ -4028,6 +4029,18 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         __SetPageUptodate(page);
         set_page_huge_active(page);
  
+       /*
+        * If shared, add to page cache
+        */
+       if (vm_shared) {
+               struct address_space *mapping = dst_vma->vm_file->f_mapping;
+               pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+               ret = huge_add_to_page_cache(page, mapping, idx);
+               if (ret)
+                       goto out_release_nounlock;
+       }
+
         ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
         spin_lock(ptl);
  
@@ -4035,8 +4048,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         if (!huge_pte_none(huge_ptep_get(dst_pte)))
                 goto out_release_unlock;
  
-       ClearPagePrivate(page);
-       hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+       if (vm_shared) {
+               page_dup_rmap(page, true);
+       } else {
+               ClearPagePrivate(page);
+               hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+       }
  
         _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
         if (dst_vma->vm_flags & VM_WRITE)
@@ -4053,11 +4070,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         update_mmu_cache(dst_vma, dst_addr, dst_pte);
  
         spin_unlock(ptl);
+       if (vm_shared)
+               unlock_page(page);
         ret = 0;
  out:
         return ret;
  out_release_unlock:
         spin_unlock(ptl);
+out_release_nounlock:
+       if (vm_shared)
+               unlock_page(page);
         put_page(page);
         goto out;
  }
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c

index a0817cc470b0067001b5bb08016faa3248251693..1e5c2f94e8a3299a47c3ba5bda9e6f59ea8afbf0 100644 (file)
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -154,6 +154,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                               unsigned long len,
                                               bool zeropage)
  {
+       int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+       int vm_shared = dst_vma->vm_flags & VM_SHARED;
         ssize_t err;
         pte_t *dst_pte;
         unsigned long src_addr, dst_addr;
@@ -204,14 +206,14 @@ retry:
                         goto out_unlock;
  
                 /*
-                * Make sure the vma is not shared, that the remaining dst
-                * range is both valid and fully within a single existing vma.
+                * Make sure the remaining dst range is both valid and
+                * fully within a single existing vma.
                  */
-               if (dst_vma->vm_flags & VM_SHARED)
-                       goto out_unlock;
                 if (dst_start < dst_vma->vm_start ||
                     dst_start + len > dst_vma->vm_end)
                         goto out_unlock;
+
+               vm_shared = dst_vma->vm_flags & VM_SHARED;
         }
  
         if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
@@ -225,11 +227,13 @@ retry:
                 goto out_unlock;
  
         /*
-        * Ensure the dst_vma has a anon_vma.
+        * If not shared, ensure the dst_vma has an anon_vma.
          */
         err = -ENOMEM;
-       if (unlikely(anon_vma_prepare(dst_vma)))
-               goto out_unlock;
+       if (!vm_shared) {
+               if (unlikely(anon_vma_prepare(dst_vma)))
+                       goto out_unlock;
+       }
  
         h = hstate_vma(dst_vma);
  
@@ -266,6 +270,7 @@ retry:
                                                 dst_addr, src_addr, &page);
  
                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+               vm_alloc_shared = vm_shared;
  
                 cond_resched();
  
@@ -305,18 +310,49 @@ out:
         if (page) {
                 /*
                  * We encountered an error and are about to free a newly
-                * allocated huge page.  It is possible that there was a
-                * reservation associated with the page that has been
-                * consumed.  See the routine restore_reserve_on_error
-                * for details.  Unfortunately, we can not call
-                * restore_reserve_on_error now as it would require holding
-                * mmap_sem.  Clear the PagePrivate flag so that the global
+                * allocated huge page.
+                *
+                * Reservation handling is very subtle, and is different for
+                * private and shared mappings.  See the routine
+                * restore_reserve_on_error for details.  Unfortunately, we
+                * can not call restore_reserve_on_error now as it would
+                * require holding mmap_sem.
+                *
+                * If a reservation for the page existed in the reservation
+                * map of a private mapping, the map was modified to indicate
+                * the reservation was consumed when the page was allocated.
+                * We clear the PagePrivate flag now so that the global
                  * reserve count will not be incremented in free_huge_page.
                  * The reservation map will still indicate the reservation
                  * was consumed and possibly prevent later page allocation.
-                * This is better than leaking a global reservation.
+                * This is better than leaking a global reservation.  If no
+                * reservation existed, it is still safe to clear PagePrivate
+                * as no adjustments to reservation counts were made during
+                * allocation.
+                *
+                * The reservation map for shared mappings indicates which
+                * pages have reservations.  When a huge page is allocated
+                * for an address with a reservation, no change is made to
+                * the reserve map.  In this case PagePrivate will be set
+                * to indicate that the global reservation count should be
+                * incremented when the page is freed.  This is the desired
+                * behavior.  However, when a huge page is allocated for an
+                * address without a reservation a reservation entry is added
+                * to the reservation map, and PagePrivate will not be set.
+                * When the page is freed, the global reserve count will NOT
+                * be incremented and it will appear as though we have leaked
+                * a reserved page.  In this case, set PagePrivate so that the
+                * global reserve count will be incremented to match the
+                * reservation map entry which was created.
+                *
+                * Note that vm_alloc_shared is based on the flags of the vma
+                * for which the page was originally allocated.  dst_vma could
+                * be different or NULL on error.
                  */
-               ClearPagePrivate(page);
+               if (vm_alloc_shared)
+                       SetPagePrivate(page);
+               else
+                       ClearPagePrivate(page);
                 put_page(page);
         }
         BUG_ON(copied < 0);
@@ -372,8 +408,14 @@ retry:
         dst_vma = find_vma(dst_mm, dst_start);
         if (!dst_vma)
                 goto out_unlock;
-       if (!vma_is_shmem(dst_vma) && dst_vma->vm_flags & VM_SHARED)
+       /*
+        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+        * it will overwrite vm_ops, so vma_is_anonymous must return false.
+        */
+       if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+           dst_vma->vm_flags & VM_SHARED))
                 goto out_unlock;
+
         if (dst_start < dst_vma->vm_start ||
             dst_start + len > dst_vma->vm_end)
                 goto out_unlock;
author	Mike Kravetz <mike.kravetz@oracle.com>
	Wed, 22 Feb 2017 23:43:43 +0000 (15:43 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 23 Feb 2017 00:41:28 +0000 (16:41 -0800)
mm/hugetlb.c		patch | blob | history
mm/userfaultfd.c		patch | blob | history