int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-
-/*
- * Determine how many pages need to be initialized during early boot
- * (non-deferred initialization).
- * The value of first_deferred_pfn will be set later, once non-deferred pages
- * are initialized, but for now set it ULONG_MAX.
- */
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
- phys_addr_t start_addr, end_addr;
- unsigned long max_pgcnt;
- unsigned long reserved;
-
- /*
- * Initialise at least 2G of a node but also take into account that
- * two large system hashes that can take up 1GB for 0.25TB/node.
- */
- max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
- (pgdat->node_spanned_pages >> 8));
-
- /*
- * Compensate the all the memblock reservations (e.g. crash kernel)
- * from the initial estimation to make sure we will initialize enough
- * memory to boot.
- */
- start_addr = PFN_PHYS(pgdat->node_start_pfn);
- end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
- reserved = memblock_reserved_memory_within(start_addr, end_addr);
- max_pgcnt += PHYS_PFN(reserved);
-
- pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
- pgdat->first_deferred_pfn = ULONG_MAX;
-}
-
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
return true;
}
#else
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-}
-
static inline bool early_page_uninitialised(unsigned long pfn)
{
return false;
pgdat_init_report_one_done();
return 0;
}
+
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone. This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+static noinline bool __init
+deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ int zid = zone_idx(zone);
+ int nid = zone_to_nid(zone);
+ pg_data_t *pgdat = NODE_DATA(nid);
+ unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+ unsigned long nr_pages = 0;
+ unsigned long first_init_pfn, spfn, epfn, t, flags;
+ unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+ phys_addr_t spa, epa;
+ u64 i;
+
+ /* Only the last zone may have deferred pages */
+ if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+ return false;
+
+ pgdat_resize_lock(pgdat, &flags);
+
+ /*
+ * If deferred pages have been initialized while we were waiting for
+ * the lock, return true, as the zone was grown. The caller will retry
+ * this zone. We won't return to this function since the caller also
+ * has this static branch.
+ */
+ if (!static_branch_unlikely(&deferred_pages)) {
+ pgdat_resize_unlock(pgdat, &flags);
+ return true;
+ }
+
+ /*
+ * If someone grew this zone while we were waiting for spinlock, return
+ * true, as there might be enough pages already.
+ */
+ if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+ pgdat_resize_unlock(pgdat, &flags);
+ return true;
+ }
+
+ first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
+
+ if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+ pgdat_resize_unlock(pgdat, &flags);
+ return false;
+ }
+
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+ spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+ epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+
+ while (spfn < epfn && nr_pages < nr_pages_needed) {
+ t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
+ first_deferred_pfn = min(t, epfn);
+ nr_pages += deferred_init_pages(nid, zid, spfn,
+ first_deferred_pfn);
+ spfn = first_deferred_pfn;
+ }
+
+ if (nr_pages >= nr_pages_needed)
+ break;
+ }
+
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+ spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+ epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
+ deferred_free_pages(nid, zid, spfn, epfn);
+
+ if (first_deferred_pfn == epfn)
+ break;
+ }
+ pgdat->first_deferred_pfn = first_deferred_pfn;
+ pgdat_resize_unlock(pgdat, &flags);
+
+ return nr_pages > 0;
+}
+
+/*
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have refdata wrapper to avoid warning,
+ * and to ensure that the function body gets unloaded.
+ */
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ return deferred_grow_zone(zone, order);
+}
+
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
/* Block until all are initialised */
wait_for_completion(&pgdat_init_all_done_comp);
+ /*
+ * We initialized the rest of the deferred pages. Permanently disable
+ * on-demand struct page initialization.
+ */
+ static_branch_disable(&deferred_pages);
+
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
ac_classzone_idx(ac), alloc_flags)) {
int ret;
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /*
+ * Watermark failed for this zone, but see if we can
+ * grow this zone if it contains deferred pages.
+ */
+ if (static_branch_unlikely(&deferred_pages)) {
+ if (_deferred_grow_zone(zone, order))
+ goto try_this_zone;
+ }
+#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
reserve_highatomic_pageblock(page, zone, order);
return page;
+ } else {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /* Try again if zone has deferred pages */
+ if (static_branch_unlikely(&deferred_pages)) {
+ if (_deferred_grow_zone(zone, order))
+ goto try_this_zone;
+ }
+#endif
}
}
alloc_node_mem_map(pgdat);
- reset_deferred_meminit(pgdat);
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
+ pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+ pgdat->node_spanned_pages);
+ pgdat->first_deferred_pfn = ULONG_MAX;
+#endif
free_area_init_core(pgdat);
}