userfaultfd: hugetlbfs: gup: support VM_FAULT_RETRY
author     Andrea Arcangeli <aarcange@redhat.com>
           Wed, 22 Feb 2017 23:43:13 +0000 (15:43 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 23 Feb 2017 00:41:28 +0000 (16:41 -0800)
Add support for VM_FAULT_RETRY to follow_hugetlb_page() so that the
get_user_pages_unlocked/locked and "nonblocking"/FOLL_NOWAIT features
work on hugetlbfs.

This is required for fully functional userfaultfd non-present support on
hugetlbfs.
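
For context, here is what the retry protocol looks like from a gup
caller's point of view.  This is a minimal sketch modelled on what
get_user_pages_unlocked() does internally in this era; the helper name
pin_user_range and the FOLL_WRITE choice are illustrative, not part of
the patch:

	/*
	 * Sketch: drive the VM_FAULT_RETRY protocol that this commit
	 * extends to hugetlbfs.  Passing &locked lets the fault side
	 * set FAULT_FLAG_ALLOW_RETRY, drop mmap_sem and sleep (e.g. in
	 * handle_userfault()) instead of failing the fault outright.
	 */
	static long pin_user_range(unsigned long start, unsigned long nr_pages,
				   struct page **pages)
	{
		struct mm_struct *mm = current->mm;
		int locked = 1;
		long ret;

		down_read(&mm->mmap_sem);
		ret = get_user_pages_locked(start, nr_pages, FOLL_WRITE,
					    pages, &locked);
		/* locked == 0 means the fault path dropped mmap_sem. */
		if (locked)
			up_read(&mm->mmap_sem);
		return ret;
	}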

Link: http://lkml.kernel.org/r/20161216144821.5183-25-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/hugetlb.h
mm/gup.c
mm/hugetlb.c

index aab2fff3e26998e26e0cf6deffd3a1c9544c3388..503099d8aada5351b2e30b04cf79e651d57a23d7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -65,7 +65,8 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
                         struct page **, struct vm_area_struct **,
-                        unsigned long *, unsigned long *, long, unsigned int);
+                        unsigned long *, unsigned long *, long, unsigned int,
+                        int *);
 void unmap_hugepage_range(struct vm_area_struct *,
                          unsigned long, unsigned long, struct page *);
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
@@ -136,7 +137,7 @@ static inline unsigned long hugetlb_total_pages(void)
        return 0;
 }
 
-#define follow_hugetlb_page(m,v,p,vs,a,b,i,w)  ({ BUG(); 0; })
+#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n)        ({ BUG(); 0; })
 #define follow_huge_addr(mm, addr, write)      ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
 static inline void hugetlb_report_meminfo(struct seq_file *m)
index 55315555489d02c411534a958f1a31f79c9fafb5..40abe4c903834573d3249326b957553b72bd8eb9 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -572,7 +572,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        if (is_vm_hugetlb_page(vma)) {
                                i = follow_hugetlb_page(mm, vma, pages, vmas,
                                                &start, &nr_pages, i,
-                                               gup_flags);
+                                               gup_flags, nonblocking);
                                continue;
                        }
                }
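
The normal-page path in mm/gup.c already performs this gup-flag to
fault-flag translation in faultin_page(); the mm/hugetlb.c hunk that
follows brings follow_hugetlb_page() in line with it.  Condensed and
paraphrased from the v4.10-era faultin_page() (FOLL_REMOTE and most
error handling omitted; see the tree for the authoritative version):

	unsigned int fault_flags = 0;
	int ret;

	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (nonblocking)	/* caller tolerates a dropped mmap_sem */
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
	if (*flags & FOLL_NOWAIT)	/* may retry, but must never sleep */
		fault_flags |= FAULT_FLAG_ALLOW_RETRY |
			       FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		/* second attempt after a retry: do not retry again */
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
		fault_flags |= FAULT_FLAG_TRIED;
	}

	ret = handle_mm_fault(vma, address, fault_flags);
	if (ret & VM_FAULT_RETRY) {
		if (nonblocking)
			*nonblocking = 0;	/* mmap_sem is no longer held */
		return -EBUSY;
	}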
index a4b29054cc3f683d0b0329bf8c66ea48da0e22c3..f6c7ff316daf5a7451b2bd3daef648d6f0b8491b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4065,7 +4065,7 @@ out_release_unlock:
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, unsigned long *nr_pages,
-                        long i, unsigned int flags)
+                        long i, unsigned int flags, int *nonblocking)
 {
        unsigned long pfn_offset;
        unsigned long vaddr = *position;
@@ -4128,16 +4128,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                    ((flags & FOLL_WRITE) &&
                      !huge_pte_write(huge_ptep_get(pte)))) {
                        int ret;
+                       unsigned int fault_flags = 0;
 
                        if (pte)
                                spin_unlock(ptl);
-                       ret = hugetlb_fault(mm, vma, vaddr,
-                               (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
-                       if (!(ret & VM_FAULT_ERROR))
-                               continue;
-
-                       remainder = 0;
-                       break;
+                       if (flags & FOLL_WRITE)
+                               fault_flags |= FAULT_FLAG_WRITE;
+                       if (nonblocking)
+                               fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+                       if (flags & FOLL_NOWAIT)
+                               fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+                                       FAULT_FLAG_RETRY_NOWAIT;
+                       if (flags & FOLL_TRIED) {
+                               VM_WARN_ON_ONCE(fault_flags &
+                                               FAULT_FLAG_ALLOW_RETRY);
+                               fault_flags |= FAULT_FLAG_TRIED;
+                       }
+                       ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
+                       if (ret & VM_FAULT_ERROR) {
+                               remainder = 0;
+                               break;
+                       }
+                       if (ret & VM_FAULT_RETRY) {
+                               if (nonblocking)
+                                       *nonblocking = 0;
+                               *nr_pages = 0;
+                               /*
+                                * VM_FAULT_RETRY must not return an
+                                * error, it will return zero
+                                * instead.
+                                *
+                                * No need to update "position" as the
+                                * caller will not check it after
+                                * *nr_pages is set to 0.
+                                */
+                               return i;
+                       }
+                       continue;
                }
 
                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
@@ -4166,6 +4193,11 @@ same_page:
                spin_unlock(ptl);
        }
        *nr_pages = remainder;
+       /*
+        * setting position is actually required only if remainder is
+        * not zero but it's faster not to add a "if (remainder)"
+        * branch.
+        */
        *position = vaddr;
 
        return i ? i : -EFAULT;
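
The early return in the VM_FAULT_RETRY branch above works because of
how __get_user_pages() consumes the result: zeroing *nr_pages
terminates its main loop, so the pages pinned so far ("i") are
reported, and *nonblocking == 0 tells the next layer up that mmap_sem
was dropped.  Paraphrased shape of the caller loop in mm/gup.c of this
era (details elided):

	do {
		/* ... look up vma, fault in normal pages ... */
		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags, nonblocking);
			continue;	/* nr_pages == 0 ends the loop */
		}
		/* ... */
	} while (nr_pages);
	return i;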