mm, hugetlb: do not rely on overcommit limit during migration

author Michal Hocko <mhocko@suse.com>

Thu, 1 Feb 2018 00:20:48 +0000 (16:20 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 1 Feb 2018 01:18:40 +0000 (17:18 -0800)
author Michal Hocko <mhocko@suse.com>
Thu, 1 Feb 2018 00:20:48 +0000 (16:20 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Feb 2018 01:18:40 +0000 (17:18 -0800)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index 944e6e8bd5723d1cea2d96b1e8893686073bb074..66992348531e0513ec976ef960dd92743fb5a68c 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                 long freed);
  bool isolate_huge_page(struct page *page, struct list_head *list);
  void putback_active_hugepage(struct page *page);
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
  void free_huge_page(struct page *page);
  void hugetlb_fix_reserve_counts(struct inode *inode);
  extern struct mutex *hugetlb_fault_mutex_table;
@@ -157,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                 unsigned long address, unsigned long end, pgprot_t newprot);
  
  bool is_hugetlb_entry_migration(pte_t pte);
+
  #else /* !CONFIG_HUGETLB_PAGE */
  
  static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
@@ -197,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
         return false;
  }
  #define putback_active_hugepage(p)     do {} while (0)
+#define move_hugetlb_state(old, new, reason)   do {} while (0)
  
  static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                 unsigned long address, unsigned long end, pgprot_t newprot)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 360765156c7caf2b9b8f80cbc72e221e07143321..f260ffa263633a35e14a380f1a35d9aa47f7afa5 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -34,6 +34,7 @@
  #include <linux/hugetlb_cgroup.h>
  #include <linux/node.h>
  #include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
  #include "internal.h"
  
  int hugetlb_max_hstate __read_mostly;
@@ -1219,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
         ClearPagePrivate(&page[1]);
  }
  
+/*
+ * Internal hugetlb-specific page flag, kept in page[2].mapping ((void *)-1U
+ * means "temporary"). Do not use outside of the hugetlb code.
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+       if (!PageHuge(page))
+               return false;
+
+       return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+       page[2].mapping = (void *)-1U;  /* value PageHugeTemporary() tests */
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+       page[2].mapping = NULL;         /* no longer matches the -1U marker */
+}
+
  void free_huge_page(struct page *page)
  {
         /*
@@ -1253,7 +1276,11 @@ void free_huge_page(struct page *page)
         if (restore_reserve)
                 h->resv_huge_pages++;
  
-       if (h->surplus_huge_pages_node[nid]) {
+       if (PageHugeTemporary(page)) {
+               list_del(&page->lru);
+               ClearPageHugeTemporary(page);
+               update_and_free_page(h, page);
+       } else if (h->surplus_huge_pages_node[nid]) {
                 /* remove the page from active list */
                 list_del(&page->lru);
                 update_and_free_page(h, page);
@@ -1507,7 +1534,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
         return rc;
  }
  
-static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                 int nid, nodemask_t *nmask)
  {
         struct page *page;
@@ -1571,6 +1601,28 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
         return page;
  }
  
+static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nmask)
+{
+       struct page *page;
+
+       if (hstate_is_gigantic(h))      /* gigantic hstates get no page here */
+               return NULL;
+
+       page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+       if (!page)
+               return NULL;
+
+       /*
+        * Not accounted as surplus: these pages are only temporary and are
+        * released on the last reference (free_huge_page() tests the flag).
+        */
+       prep_new_huge_page(h, page, page_to_nid(page));
+       SetPageHugeTemporary(page);
+
+       return page;
+}
+
  /*
   * Use the VMA's mpolicy to allocate a huge page from the buddy.
   */
@@ -1585,17 +1637,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
         nodemask_t *nodemask;
  
         nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
-       page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+       page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
         mpol_cond_put(mpol);
  
         return page;
  }
  
-/*
- * This allocation function is useful in the context where vma is irrelevant.
- * E.g. soft-offlining uses this function because it only cares physical
- * address of error page.
- */
+/* page migration callback function */
  struct page *alloc_huge_page_node(struct hstate *h, int nid)
  {
         gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1610,12 +1658,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
         spin_unlock(&hugetlb_lock);
  
         if (!page)
-               page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
+               page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
  
         return page;
  }
  
-
+/* page migration callback function */
  struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                 nodemask_t *nmask)
  {
@@ -1633,9 +1681,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
         }
         spin_unlock(&hugetlb_lock);
  
-       /* No reservations, try to overcommit */
-
-       return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+       return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
  }
  
  /*
@@ -1663,7 +1709,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
  retry:
         spin_unlock(&hugetlb_lock);
         for (i = 0; i < needed; i++) {
-               page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+               page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h),
                                 NUMA_NO_NODE, NULL);
                 if (!page) {
                         alloc_ok = false;
@@ -2260,7 +2306,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
          * First take pages out of surplus state.  Then make up the
          * remaining difference by allocating fresh huge pages.
          *
-        * We might race with __alloc_buddy_huge_page() here and be unable
+        * We might race with __alloc_surplus_huge_page() here and be unable
          * to convert a surplus huge page to a normal huge page. That is
          * not critical, though, it just means the overall size of the
          * pool might be one hugepage larger than it needs to be, but
@@ -2303,7 +2349,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
          * By placing pages into the surplus state independent of the
          * overcommit value, we are allowing the surplus pool size to
          * exceed overcommit. There are few sane options here. Since
-        * __alloc_buddy_huge_page() is checking the global counter,
+        * __alloc_surplus_huge_page() is checking the global counter,
          * though, we'll note that we're not allowed to exceed surplus
          * and won't grow the pool anywhere else. Not until one of the
          * sysctls are changed, or the surplus pages go out of use.
@@ -4779,3 +4825,36 @@ void putback_active_hugepage(struct page *page)
         spin_unlock(&hugetlb_lock);
         put_page(page);
  }
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+       struct hstate *h = page_hstate(oldpage);
+
+       hugetlb_cgroup_migrate(oldpage, newpage);
+       set_page_owner_migrate_reason(newpage, reason);
+
+       /*
+        * Transfer the temporary state of the new huge page. This is the
+        * reverse of other transitions because the new page is going to
+        * be final while the old one will be freed, so the old page takes
+        * over the temporary status.
+        *
+        * Also note that we have to transfer the per-node surplus state
+        * here as well, otherwise the global surplus count will not match
+        * the per-node counts.
+        */
+       if (PageHugeTemporary(newpage)) {
+               int old_nid = page_to_nid(oldpage);
+               int new_nid = page_to_nid(newpage);
+
+               SetPageHugeTemporary(oldpage);
+               ClearPageHugeTemporary(newpage);
+
+               spin_lock(&hugetlb_lock);
+               if (h->surplus_huge_pages_node[old_nid]) {
+                       h->surplus_huge_pages_node[old_nid]--;
+                       h->surplus_huge_pages_node[new_nid]++;
+               }
+               spin_unlock(&hugetlb_lock);
+       }
+}
diff --git a/mm/migrate.c b/mm/migrate.c

index 4d0be47a322a8a33491c94072100ebcec829becd..1e5525a256910629d2dc2d15d14f13d06f2aa1d7 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1323,9 +1323,8 @@ put_anon:
                 put_anon_vma(anon_vma);
  
         if (rc == MIGRATEPAGE_SUCCESS) {
-               hugetlb_cgroup_migrate(hpage, new_hpage);
+               move_hugetlb_state(hpage, new_hpage, reason);
                 put_new_page = NULL;
-               set_page_owner_migrate_reason(new_hpage, reason);
         }
  
         unlock_page(hpage);
author	Michal Hocko <mhocko@suse.com>
	Thu, 1 Feb 2018 00:20:48 +0000 (16:20 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 1 Feb 2018 01:18:40 +0000 (17:18 -0800)
include/linux/hugetlb.h		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/migrate.c		patch \| blob \| history