mm: mempolicy: Add MPOL_MF_LAZY

author Lee Schermerhorn <lee.schermerhorn@hp.com>

Thu, 25 Oct 2012 12:16:32 +0000 (14:16 +0200)

committer Mel Gorman <mgorman@suse.de>

Tue, 11 Dec 2012 14:42:43 +0000 (14:42 +0000)
author Lee Schermerhorn <lee.schermerhorn@hp.com>
Thu, 25 Oct 2012 12:16:32 +0000 (14:16 +0200)
committer Mel Gorman <mgorman@suse.de>
Tue, 11 Dec 2012 14:42:43 +0000 (14:42 +0000)
diff --git a/include/linux/mm.h b/include/linux/mm.h

index fa1615211159c8a7f404c38e7fb5b5ecc7a27238..471185e29babc16f55fdd02dac5634277089f323 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
  }
  #endif
  
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+void change_prot_numa(struct vm_area_struct *vma,
+                       unsigned long start, unsigned long end);
+#endif
+
  struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
  int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                         unsigned long pfn, unsigned long size, pgprot_t);
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h

index 472de8a5d37eb5d6a8e3a98a374da8c75fc19c20..6a1baae3775d0eb5a2bdad79b5eb01f3bb09977b 100644 (file)
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -49,9 +49,16 @@ enum mpol_rebind_step {
  
  /* Flags for mbind */
  #define MPOL_MF_STRICT (1<<0)  /* Verify existing pages in the mapping */
-#define MPOL_MF_MOVE   (1<<1)  /* Move pages owned by this process to conform to mapping */
-#define MPOL_MF_MOVE_ALL (1<<2)        /* Move every page to conform to mapping */
-#define MPOL_MF_INTERNAL (1<<3)        /* Internal flags start here */
+#define MPOL_MF_MOVE    (1<<1) /* Move pages owned by this process to conform
+                                  to policy */
+#define MPOL_MF_MOVE_ALL (1<<2)        /* Move every page to conform to policy */
+#define MPOL_MF_LAZY    (1<<3) /* Modifies '_MOVE:  lazy migrate on fault */
+#define MPOL_MF_INTERNAL (1<<4)        /* Internal flags start here */
+
+#define MPOL_MF_VALID  (MPOL_MF_STRICT   |     \
+                        MPOL_MF_MOVE     |     \
+                        MPOL_MF_MOVE_ALL |     \
+                        MPOL_MF_LAZY)
  
  /*
   * Internal flags that share the struct mempolicy flags word with
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index df1466d3d2d868091b1558719fc676516e0508f3..51d3ebd8561ed68f56914a88a2ff0e787b4dc428 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
  #include <linux/syscalls.h>
  #include <linux/ctype.h>
  #include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
  
  #include <asm/tlbflush.h>
  #include <asm/uaccess.h>
@@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
         return 0;
  }
  
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * Here we search for not shared page mappings (mapcount == 1) and we
+ * set up the pmd/pte_numa on those mappings so the very next access
+ * will fire a NUMA hinting page fault.
+ */
+static int
+change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte, *_pte;
+       struct page *page;
+       unsigned long _address, end;
+       spinlock_t *ptl;
+       int ret = 0;
+
+       VM_BUG_ON(address & ~PAGE_MASK);
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               goto out;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               goto out;
+
+       pmd = pmd_offset(pud, address);
+       if (pmd_none(*pmd))
+               goto out;
+
+       if (pmd_trans_huge_lock(pmd, vma) == 1) {
+               int page_nid;
+               ret = HPAGE_PMD_NR;
+
+               VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+               if (pmd_numa(*pmd)) {
+                       spin_unlock(&mm->page_table_lock);
+                       goto out;
+               }
+
+               page = pmd_page(*pmd);
+
+               /* only check non-shared pages */
+               if (page_mapcount(page) != 1) {
+                       spin_unlock(&mm->page_table_lock);
+                       goto out;
+               }
+
+               page_nid = page_to_nid(page);
+
+               if (pmd_numa(*pmd)) {
+                       spin_unlock(&mm->page_table_lock);
+                       goto out;
+               }
+
+               set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
+               ret += HPAGE_PMD_NR;
+               /* defer TLB flush to lower the overhead */
+               spin_unlock(&mm->page_table_lock);
+               goto out;
+       }
+
+       if (pmd_trans_unstable(pmd))
+               goto out;
+       VM_BUG_ON(!pmd_present(*pmd));
+
+       end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK);
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       for (_address = address, _pte = pte; _address < end;
+            _pte++, _address += PAGE_SIZE) {
+               pte_t pteval = *_pte;
+               if (!pte_present(pteval))
+                       continue;
+               if (pte_numa(pteval))
+                       continue;
+               page = vm_normal_page(vma, _address, pteval);
+               if (unlikely(!page))
+                       continue;
+               /* only check non-shared pages */
+               if (page_mapcount(page) != 1)
+                       continue;
+
+               set_pte_at(mm, _address, _pte, pte_mknuma(pteval));
+
+               /* defer TLB flush to lower the overhead */
+               ret++;
+       }
+       pte_unmap_unlock(pte, ptl);
+
+       if (ret && !pmd_numa(*pmd)) {
+               spin_lock(&mm->page_table_lock);
+               set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
+               spin_unlock(&mm->page_table_lock);
+               /* defer TLB flush to lower the overhead */
+       }
+
+out:
+       return ret;
+}
+
+/* Assumes mmap_sem is held */
+void
+change_prot_numa(struct vm_area_struct *vma,
+                       unsigned long address, unsigned long end)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       int progress = 0;
+
+       while (address < end) {
+               VM_BUG_ON(address < vma->vm_start ||
+                         address + PAGE_SIZE > vma->vm_end);
+
+               progress += change_prot_numa_range(mm, vma, address);
+               address = (address + PMD_SIZE) & PMD_MASK;
+       }
+
+       /*
+        * Flush the TLB for the mm to start the NUMA hinting
+        * page faults after we finish scanning this vma part
+        * if there were any PTE updates
+        */
+       if (progress) {
+               mmu_notifier_invalidate_range_start(vma->vm_mm, address, end);
+               flush_tlb_range(vma, address, end);
+               mmu_notifier_invalidate_range_end(vma->vm_mm, address, end);
+       }
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+                       unsigned long addr, unsigned long end)
+{
+       return 0;
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
  /*
   * Check if all pages in a range are on a set of nodes.
   * If pagelist != NULL then isolate pages from the LRU and
@@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                 return ERR_PTR(-EFAULT);
         prev = NULL;
         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+               unsigned long endvma = vma->vm_end;
+
+               if (endvma > end)
+                       endvma = end;
+               if (vma->vm_start > start)
+                       start = vma->vm_start;
+
                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
                         if (!vma->vm_next && vma->vm_end < end)
                                 return ERR_PTR(-EFAULT);
                         if (prev && prev->vm_end < vma->vm_start)
                                 return ERR_PTR(-EFAULT);
                 }
-               if (!is_vm_hugetlb_page(vma) &&
-                   ((flags & MPOL_MF_STRICT) ||
+
+               if (is_vm_hugetlb_page(vma))
+                       goto next;
+
+               if (flags & MPOL_MF_LAZY) {
+                       change_prot_numa(vma, start, endvma);
+                       goto next;
+               }
+
+               if ((flags & MPOL_MF_STRICT) ||
                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-                               vma_migratable(vma)))) {
-                       unsigned long endvma = vma->vm_end;
+                     vma_migratable(vma))) {
  
-                       if (endvma > end)
-                               endvma = end;
-                       if (vma->vm_start > start)
-                               start = vma->vm_start;
                         err = check_pgd_range(vma, start, endvma, nodes,
                                                 flags, private);
                         if (err) {
@@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                                 break;
                         }
                 }
+next:
                 prev = vma;
         }
         return first;
@@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len,
         int err;
         LIST_HEAD(pagelist);
  
-       if (flags & ~(unsigned long)(MPOL_MF_STRICT |
-                                    MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+       if (flags & ~(unsigned long)MPOL_MF_VALID)
                 return -EINVAL;
         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                 return -EPERM;
@@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len,
         if (IS_ERR(new))
                 return PTR_ERR(new);
  
+       if (flags & MPOL_MF_LAZY)
+               new->flags |= MPOL_F_MOF;
+
         /*
          * If we are using the default policy then operation
          * on discontinuous address spaces is okay after all
@@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len,
         vma = check_range(mm, start, end, nmask,
                           flags | MPOL_MF_INVERT, &pagelist);
  
-       err = PTR_ERR(vma);
-       if (!IS_ERR(vma)) {
-               int nr_failed = 0;
-
+       err = PTR_ERR(vma);     /* maybe ... */
+       if (!IS_ERR(vma) && mode != MPOL_NOOP)
                 err = mbind_range(mm, start, end, new);
  
+       if (!err) {
+               int nr_failed = 0;
+
                 if (!list_empty(&pagelist)) {
+                       WARN_ON_ONCE(flags & MPOL_MF_LAZY);
                         nr_failed = migrate_pages(&pagelist, new_vma_page,
                                                 (unsigned long)vma,
                                                 false, MIGRATE_SYNC,
@@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len,
                                 putback_lru_pages(&pagelist);
                 }
  
-               if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+               if (nr_failed && (flags & MPOL_MF_STRICT))
                         err = -EIO;
         } else
                 putback_lru_pages(&pagelist);
author	Lee Schermerhorn <lee.schermerhorn@hp.com>
	Thu, 25 Oct 2012 12:16:32 +0000 (14:16 +0200)
committer	Mel Gorman <mgorman@suse.de>
	Tue, 11 Dec 2012 14:42:43 +0000 (14:42 +0000)
include/linux/mm.h		patch \| blob \| history
include/uapi/linux/mempolicy.h		patch \| blob \| history
mm/mempolicy.c		patch \| blob \| history