mm/gup: cache dev_pagemap while pinning pages
author Keith Busch <keith.busch@intel.com>
Fri, 26 Oct 2018 22:10:28 +0000 (15:10 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Oct 2018 23:38:15 +0000 (16:38 -0700)
Getting pages from ZONE_DEVICE memory needs to check the backing device's
liveness, which is tracked in the device's dev_pagemap metadata.  This
metadata is stored in a radix tree and looking it up adds measurable
software overhead.

This patch avoids repeating this relatively costly lookup by caching the
last dev_pagemap used while getting user pages.  The gup_benchmark kernel
selftest reports that this reduces the time to get user pages to as low as
1/3 of the previous time.

Link: http://lkml.kernel.org/r/20181012173040.15669-1-keith.busch@intel.com
Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/huge_mm.h
include/linux/mm.h
mm/gup.c
mm/huge_memory.c
mm/nommu.c

index fdcb45999b26338a197cef58c9c46f929b62bbc8..4663ee96cf5981198de6b6c841faf5331ff67b07 100644 (file)
@@ -213,9 +213,9 @@ static inline int hpage_nr_pages(struct page *page)
 }
 
 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-               pmd_t *pmd, int flags);
+               pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-               pud_t *pud, int flags);
+               pud_t *pud, int flags, struct dev_pagemap **pgmap);
 
 extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
 
@@ -344,13 +344,13 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 }
 
 static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
-               unsigned long addr, pmd_t *pmd, int flags)
+       unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
 {
        return NULL;
 }
 
 static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
-               unsigned long addr, pud_t *pud, int flags)
+       unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
 {
        return NULL;
 }
index a023c5ce71fadcb07fabe439c6f9f3ef0f850990..1e52b8fd168593fb85356afedb07a29b66d95aac 100644 (file)
@@ -2536,16 +2536,8 @@ static inline vm_fault_t vmf_error(int err)
        return VM_FAULT_SIGBUS;
 }
 
-struct page *follow_page_mask(struct vm_area_struct *vma,
-                             unsigned long address, unsigned int foll_flags,
-                             unsigned int *page_mask);
-
-static inline struct page *follow_page(struct vm_area_struct *vma,
-               unsigned long address, unsigned int foll_flags)
-{
-       unsigned int unused_page_mask;
-       return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
-}
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+                        unsigned int foll_flags);
 
 #define FOLL_WRITE     0x01    /* check pte is writable */
 #define FOLL_TOUCH     0x02    /* mark page accessed */
index 08eb350e0f35f36c8008bb8be8af3d2309b32025..841d7ef5359195b2de3758b098155788c230e543 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
 
 #include "internal.h"
 
+struct follow_page_context {
+       struct dev_pagemap *pgmap;
+       unsigned int page_mask;
+};
+
 static struct page *no_page_table(struct vm_area_struct *vma,
                unsigned int flags)
 {
@@ -71,10 +76,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 }
 
 static struct page *follow_page_pte(struct vm_area_struct *vma,
-               unsigned long address, pmd_t *pmd, unsigned int flags)
+               unsigned long address, pmd_t *pmd, unsigned int flags,
+               struct dev_pagemap **pgmap)
 {
        struct mm_struct *mm = vma->vm_mm;
-       struct dev_pagemap *pgmap = NULL;
        struct page *page;
        spinlock_t *ptl;
        pte_t *ptep, pte;
@@ -116,8 +121,8 @@ retry:
                 * Only return device mapping pages in the FOLL_GET case since
                 * they are only valid while holding the pgmap reference.
                 */
-               pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
-               if (pgmap)
+               *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
+               if (*pgmap)
                        page = pte_page(pte);
                else
                        goto no_page;
@@ -152,15 +157,8 @@ retry:
                goto retry;
        }
 
-       if (flags & FOLL_GET) {
+       if (flags & FOLL_GET)
                get_page(page);
-
-               /* drop the pgmap reference now that we hold the page */
-               if (pgmap) {
-                       put_dev_pagemap(pgmap);
-                       pgmap = NULL;
-               }
-       }
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
                    !pte_dirty(pte) && !PageDirty(page))
@@ -210,7 +208,8 @@ no_page:
 
 static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                                    unsigned long address, pud_t *pudp,
-                                   unsigned int flags, unsigned int *page_mask)
+                                   unsigned int flags,
+                                   struct follow_page_context *ctx)
 {
        pmd_t *pmd, pmdval;
        spinlock_t *ptl;
@@ -258,13 +257,13 @@ retry:
        }
        if (pmd_devmap(pmdval)) {
                ptl = pmd_lock(mm, pmd);
-               page = follow_devmap_pmd(vma, address, pmd, flags);
+               page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
                spin_unlock(ptl);
                if (page)
                        return page;
        }
        if (likely(!pmd_trans_huge(pmdval)))
-               return follow_page_pte(vma, address, pmd, flags);
+               return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 
        if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
                return no_page_table(vma, flags);
@@ -284,7 +283,7 @@ retry_locked:
        }
        if (unlikely(!pmd_trans_huge(*pmd))) {
                spin_unlock(ptl);
-               return follow_page_pte(vma, address, pmd, flags);
+               return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
        if (flags & FOLL_SPLIT) {
                int ret;
@@ -307,18 +306,18 @@ retry_locked:
                }
 
                return ret ? ERR_PTR(ret) :
-                       follow_page_pte(vma, address, pmd, flags);
+                       follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
        page = follow_trans_huge_pmd(vma, address, pmd, flags);
        spin_unlock(ptl);
-       *page_mask = HPAGE_PMD_NR - 1;
+       ctx->page_mask = HPAGE_PMD_NR - 1;
        return page;
 }
 
-
 static struct page *follow_pud_mask(struct vm_area_struct *vma,
                                    unsigned long address, p4d_t *p4dp,
-                                   unsigned int flags, unsigned int *page_mask)
+                                   unsigned int flags,
+                                   struct follow_page_context *ctx)
 {
        pud_t *pud;
        spinlock_t *ptl;
@@ -344,7 +343,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
        }
        if (pud_devmap(*pud)) {
                ptl = pud_lock(mm, pud);
-               page = follow_devmap_pud(vma, address, pud, flags);
+               page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
                spin_unlock(ptl);
                if (page)
                        return page;
@@ -352,13 +351,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
        if (unlikely(pud_bad(*pud)))
                return no_page_table(vma, flags);
 
-       return follow_pmd_mask(vma, address, pud, flags, page_mask);
+       return follow_pmd_mask(vma, address, pud, flags, ctx);
 }
 
-
 static struct page *follow_p4d_mask(struct vm_area_struct *vma,
                                    unsigned long address, pgd_t *pgdp,
-                                   unsigned int flags, unsigned int *page_mask)
+                                   unsigned int flags,
+                                   struct follow_page_context *ctx)
 {
        p4d_t *p4d;
        struct page *page;
@@ -378,7 +377,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
                        return page;
                return no_page_table(vma, flags);
        }
-       return follow_pud_mask(vma, address, p4d, flags, page_mask);
+       return follow_pud_mask(vma, address, p4d, flags, ctx);
 }
 
 /**
@@ -396,13 +395,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
  */
 struct page *follow_page_mask(struct vm_area_struct *vma,
                              unsigned long address, unsigned int flags,
-                             unsigned int *page_mask)
+                             struct follow_page_context *ctx)
 {
        pgd_t *pgd;
        struct page *page;
        struct mm_struct *mm = vma->vm_mm;
 
-       *page_mask = 0;
+       ctx->page_mask = 0;
 
        /* make this handle hugepd */
        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
@@ -431,7 +430,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
                return no_page_table(vma, flags);
        }
 
-       return follow_p4d_mask(vma, address, pgd, flags, page_mask);
+       return follow_p4d_mask(vma, address, pgd, flags, ctx);
+}
+
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+                        unsigned int foll_flags)
+{
+       struct follow_page_context ctx = { NULL };
+       struct page *page;
+
+       page = follow_page_mask(vma, address, foll_flags, &ctx);
+       if (ctx.pgmap)
+               put_dev_pagemap(ctx.pgmap);
+       return page;
 }
 
 static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -659,9 +670,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                unsigned int gup_flags, struct page **pages,
                struct vm_area_struct **vmas, int *nonblocking)
 {
-       long i = 0;
-       unsigned int page_mask;
+       long ret = 0, i = 0;
        struct vm_area_struct *vma = NULL;
+       struct follow_page_context ctx = { NULL };
 
        if (!nr_pages)
                return 0;
@@ -691,12 +702,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                                pages ? &pages[i] : NULL);
                                if (ret)
                                        return i ? : ret;
-                               page_mask = 0;
+                               ctx.page_mask = 0;
                                goto next_page;
                        }
 
-                       if (!vma || check_vma_flags(vma, gup_flags))
-                               return i ? : -EFAULT;
+                       if (!vma || check_vma_flags(vma, gup_flags)) {
+                               ret = -EFAULT;
+                               goto out;
+                       }
                        if (is_vm_hugetlb_page(vma)) {
                                i = follow_hugetlb_page(mm, vma, pages, vmas,
                                                &start, &nr_pages, i,
@@ -709,23 +722,26 @@ retry:
                 * If we have a pending SIGKILL, don't keep faulting pages and
                 * potentially allocating memory.
                 */
-               if (unlikely(fatal_signal_pending(current)))
-                       return i ? i : -ERESTARTSYS;
+               if (unlikely(fatal_signal_pending(current))) {
+                       ret = -ERESTARTSYS;
+                       goto out;
+               }
                cond_resched();
-               page = follow_page_mask(vma, start, foll_flags, &page_mask);
+
+               page = follow_page_mask(vma, start, foll_flags, &ctx);
                if (!page) {
-                       int ret;
                        ret = faultin_page(tsk, vma, start, &foll_flags,
                                        nonblocking);
                        switch (ret) {
                        case 0:
                                goto retry;
+                       case -EBUSY:
+                               ret = 0;
+                               /* FALLTHRU */
                        case -EFAULT:
                        case -ENOMEM:
                        case -EHWPOISON:
-                               return i ? i : ret;
-                       case -EBUSY:
-                               return i;
+                               goto out;
                        case -ENOENT:
                                goto next_page;
                        }
@@ -737,27 +753,31 @@ retry:
                         */
                        goto next_page;
                } else if (IS_ERR(page)) {
-                       return i ? i : PTR_ERR(page);
+                       ret = PTR_ERR(page);
+                       goto out;
                }
                if (pages) {
                        pages[i] = page;
                        flush_anon_page(vma, page, start);
                        flush_dcache_page(page);
-                       page_mask = 0;
+                       ctx.page_mask = 0;
                }
 next_page:
                if (vmas) {
                        vmas[i] = vma;
-                       page_mask = 0;
+                       ctx.page_mask = 0;
                }
-               page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+               page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
                if (page_increm > nr_pages)
                        page_increm = nr_pages;
                i += page_increm;
                start += page_increm * PAGE_SIZE;
                nr_pages -= page_increm;
        } while (nr_pages);
-       return i;
+out:
+       if (ctx.pgmap)
+               put_dev_pagemap(ctx.pgmap);
+       return i ? i : ret;
 }
 
 static bool vma_permits_fault(struct vm_area_struct *vma,
index 8ea1b36bd452d795da8ac2714634c6bc70a45b99..25c7d7509cf49efe4bc29ed811f05df8de9b6a48 100644 (file)
@@ -852,11 +852,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 }
 
 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-               pmd_t *pmd, int flags)
+               pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
 {
        unsigned long pfn = pmd_pfn(*pmd);
        struct mm_struct *mm = vma->vm_mm;
-       struct dev_pagemap *pgmap;
        struct page *page;
 
        assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -886,12 +885,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                return ERR_PTR(-EEXIST);
 
        pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
-       pgmap = get_dev_pagemap(pfn, NULL);
-       if (!pgmap)
+       *pgmap = get_dev_pagemap(pfn, *pgmap);
+       if (!*pgmap)
                return ERR_PTR(-EFAULT);
        page = pfn_to_page(pfn);
        get_page(page);
-       put_dev_pagemap(pgmap);
 
        return page;
 }
@@ -1000,11 +998,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 }
 
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-               pud_t *pud, int flags)
+               pud_t *pud, int flags, struct dev_pagemap **pgmap)
 {
        unsigned long pfn = pud_pfn(*pud);
        struct mm_struct *mm = vma->vm_mm;
-       struct dev_pagemap *pgmap;
        struct page *page;
 
        assert_spin_locked(pud_lockptr(mm, pud));
@@ -1028,12 +1025,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                return ERR_PTR(-EEXIST);
 
        pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-       pgmap = get_dev_pagemap(pfn, NULL);
-       if (!pgmap)
+       *pgmap = get_dev_pagemap(pfn, *pgmap);
+       if (!*pgmap)
                return ERR_PTR(-EFAULT);
        page = pfn_to_page(pfn);
        get_page(page);
-       put_dev_pagemap(pgmap);
 
        return page;
 }
index e4aac33216aec0f05022f17578b6dee03e25a4e4..749276beb1094d61b28da6a5e82c26a2696647ab 100644 (file)
@@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        return ret;
 }
 
-struct page *follow_page_mask(struct vm_area_struct *vma,
-                             unsigned long address, unsigned int flags,
-                             unsigned int *page_mask)
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+                        unsigned int foll_flags)
 {
-       *page_mask = 0;
        return NULL;
 }