mm: add PageWaiters indicating tasks are waiting for a page bit
author Nicholas Piggin <npiggin@gmail.com>
Sun, 25 Dec 2016 03:00:30 +0000 (13:00 +1000)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 25 Dec 2016 19:54:48 +0000 (11:54 -0800)
Add a new page flag, PageWaiters, to indicate the page waitqueue has
tasks waiting. This can be tested rather than testing waitqueue_active
which requires another cacheline load.

This bit is always set when the page has tasks on page_waitqueue(page),
and is set and cleared under the waitqueue lock. It may be set when
there are no tasks on the waitqueue, which will cause a harmless extra
wakeup check that will clear the bit.

The generic bit-waitqueue infrastructure is no longer used for pages.
Instead, waitqueues are used directly with a custom key type. The
generic code was not flexible enough to have PageWaiters manipulation
under the waitqueue lock (which simplifies concurrency).

This improves the performance of page lock intensive microbenchmarks by
2-3%.

Putting two bits in the same word opens the opportunity to remove the
memory barrier between clearing the lock bit and testing the waiters
bit, after some work on the arch primitives (e.g., ensuring memory
operand widths match and cover both bits).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Andrew Lutomirski <luto@kernel.org>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/mm.h
include/linux/page-flags.h
include/linux/pagemap.h
include/linux/writeback.h
include/trace/events/mmflags.h
init/main.c
mm/filemap.c
mm/internal.h
mm/swap.c

index 4424784ac37495b38d735f7d86fd6b574b2fa16f..fe6b4036664a9a7c82fe4a22be93288928eceac1 100644 (file)
@@ -1758,6 +1758,8 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
        return ptl;
 }
 
+extern void __init pagecache_init(void);
+
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, unsigned long * zones_size,
                unsigned long zone_start_pfn, unsigned long *zholes_size);
index a57c909a15e419943ee1fc565c6104017fe6fb11..c56b39890a412abfec4acc31e404781215ae3ff6 100644 (file)
@@ -73,6 +73,7 @@
  */
 enum pageflags {
        PG_locked,              /* Page is locked. Don't touch. */
+       PG_waiters,             /* Page has waiters, check its waitqueue */
        PG_error,
        PG_referenced,
        PG_uptodate,
@@ -169,6 +170,9 @@ static __always_inline int PageCompound(struct page *page)
  *     for compound page all operations related to the page flag applied to
  *     head page.
  *
+ * PF_ONLY_HEAD:
+ *     for compound page, callers only ever operate on the head page.
+ *
  * PF_NO_TAIL:
  *     modifications of the page flag must be done on small or head pages,
  *     checks can be done on tail pages too.
@@ -178,6 +182,9 @@ static __always_inline int PageCompound(struct page *page)
  */
 #define PF_ANY(page, enforce)  page
 #define PF_HEAD(page, enforce) compound_head(page)
+#define PF_ONLY_HEAD(page, enforce) ({                                 \
+               VM_BUG_ON_PGFLAGS(PageTail(page), page);                \
+               page;})
 #define PF_NO_TAIL(page, enforce) ({                                   \
                VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);     \
                compound_head(page);})
@@ -255,6 +262,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
        TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
 
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
 PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
 PAGEFLAG(Referenced, referenced, PF_HEAD)
        TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
@@ -743,6 +751,7 @@ static inline int page_has_private(struct page *page)
 
 #undef PF_ANY
 #undef PF_HEAD
+#undef PF_ONLY_HEAD
 #undef PF_NO_TAIL
 #undef PF_NO_COMPOUND
 #endif /* !__GENERATING_BOUNDS_H */
index f29f80f81dbf93b76c88003f4fbc27caa1c51576..324c8dbad1e13d049a7fed3e9ccac1210c4c5ed8 100644 (file)
@@ -486,22 +486,14 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
  * and for filesystems which need to wait on PG_private.
  */
 extern void wait_on_page_bit(struct page *page, int bit_nr);
-
 extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
-extern int wait_on_page_bit_killable_timeout(struct page *page,
-                                            int bit_nr, unsigned long timeout);
-
-static inline int wait_on_page_locked_killable(struct page *page)
-{
-       if (!PageLocked(page))
-               return 0;
-       return wait_on_page_bit_killable(compound_head(page), PG_locked);
-}
+extern void wake_up_page_bit(struct page *page, int bit_nr);
 
-extern wait_queue_head_t *page_waitqueue(struct page *page);
 static inline void wake_up_page(struct page *page, int bit)
 {
-       __wake_up_bit(page_waitqueue(page), &page->flags, bit);
+       if (!PageWaiters(page))
+               return;
+       wake_up_page_bit(page, bit);
 }
 
 /* 
@@ -517,6 +509,13 @@ static inline void wait_on_page_locked(struct page *page)
                wait_on_page_bit(compound_head(page), PG_locked);
 }
 
+static inline int wait_on_page_locked_killable(struct page *page)
+{
+       if (!PageLocked(page))
+               return 0;
+       return wait_on_page_bit_killable(compound_head(page), PG_locked);
+}
+
 /* 
  * Wait for a page to complete writeback
  */
index c78f9f0920b51b61bc8d954438e02eca5b4a6513..5527d910ba3d12ee622cd23bd5aa8f62f2926043 100644 (file)
@@ -375,7 +375,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
 void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
-void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
 bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
index 30c2adbdebe85b9bcb32a4499fb19ef701f848f6..9e687ca9a307b1344889186e46dd5e62a85f614f 100644 (file)
@@ -81,6 +81,7 @@
 
 #define __def_pageflag_names                                           \
        {1UL << PG_locked,              "locked"        },              \
+       {1UL << PG_waiters,             "waiters"       },              \
        {1UL << PG_error,               "error"         },              \
        {1UL << PG_referenced,          "referenced"    },              \
        {1UL << PG_uptodate,            "uptodate"      },              \
index c81c9fa21bc770896c737fd408ef372e37d79c1f..b0c9d6facef9a5aced55d1443b40029a660011e8 100644 (file)
@@ -647,9 +647,8 @@ asmlinkage __visible void __init start_kernel(void)
        security_init();
        dbg_late_init();
        vfs_caches_init();
+       pagecache_init();
        signals_init();
-       /* rootfs populating might need page-writeback */
-       page_writeback_init();
        proc_root_init();
        nsfs_init();
        cpuset_init();
index 32be3c8f3a112d2fd7b3be42b327d727bf240507..82f26cde830c4b70df30cfa47c7e21dbe6d05a7f 100644 (file)
@@ -739,45 +739,159 @@ EXPORT_SYMBOL(__page_cache_alloc);
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-wait_queue_head_t *page_waitqueue(struct page *page)
+#define PAGE_WAIT_TABLE_BITS 8
+#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+
+static wait_queue_head_t *page_waitqueue(struct page *page)
 {
-       return bit_waitqueue(page, 0);
+       return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
 }
-EXPORT_SYMBOL(page_waitqueue);
 
-void wait_on_page_bit(struct page *page, int bit_nr)
+void __init pagecache_init(void)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+       int i;
 
-       if (test_bit(bit_nr, &page->flags))
-               __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
-                                                       TASK_UNINTERRUPTIBLE);
+       for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
+               init_waitqueue_head(&page_wait_table[i]);
+
+       page_writeback_init();
 }
-EXPORT_SYMBOL(wait_on_page_bit);
 
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+struct wait_page_key {
+       struct page *page;
+       int bit_nr;
+       int page_match;
+};
+
+struct wait_page_queue {
+       struct page *page;
+       int bit_nr;
+       wait_queue_t wait;
+};
+
+static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+       struct wait_page_key *key = arg;
+       struct wait_page_queue *wait_page
+               = container_of(wait, struct wait_page_queue, wait);
+
+       if (wait_page->page != key->page)
+              return 0;
+       key->page_match = 1;
 
-       if (!test_bit(bit_nr, &page->flags))
+       if (wait_page->bit_nr != key->bit_nr)
+               return 0;
+       if (test_bit(key->bit_nr, &key->page->flags))
                return 0;
 
-       return __wait_on_bit(page_waitqueue(page), &wait,
-                            bit_wait_io, TASK_KILLABLE);
+       return autoremove_wake_function(wait, mode, sync, key);
 }
 
-int wait_on_page_bit_killable_timeout(struct page *page,
-                                      int bit_nr, unsigned long timeout)
+void wake_up_page_bit(struct page *page, int bit_nr)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+       wait_queue_head_t *q = page_waitqueue(page);
+       struct wait_page_key key;
+       unsigned long flags;
 
-       wait.key.timeout = jiffies + timeout;
-       if (!test_bit(bit_nr, &page->flags))
-               return 0;
-       return __wait_on_bit(page_waitqueue(page), &wait,
-                            bit_wait_io_timeout, TASK_KILLABLE);
+       key.page = page;
+       key.bit_nr = bit_nr;
+       key.page_match = 0;
+
+       spin_lock_irqsave(&q->lock, flags);
+       __wake_up_locked_key(q, TASK_NORMAL, &key);
+       /*
+        * It is possible for other pages to have collided on the waitqueue
+        * hash, so in that case check for a page match. That prevents a long-
+        * term waiter on another page from keeping PageWaiters set on this one.
+        *
+        * It is still possible to miss a case here, when we woke page waiters
+        * and removed them from the waitqueue, but there are still other
+        * page waiters.
+        */
+       if (!waitqueue_active(q) || !key.page_match) {
+               ClearPageWaiters(page);
+               /*
+                * It's possible to miss clearing Waiters here, when we woke
+                * our page waiters, but the hashed waitqueue has waiters for
+                * other pages on it.
+                *
+                * That's okay, it's a rare case. The next waker will clear it.
+                */
+       }
+       spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(wake_up_page_bit);
+
+static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+               struct page *page, int bit_nr, int state, bool lock)
+{
+       struct wait_page_queue wait_page;
+       wait_queue_t *wait = &wait_page.wait;
+       int ret = 0;
+
+       init_wait(wait);
+       wait->func = wake_page_function;
+       wait_page.page = page;
+       wait_page.bit_nr = bit_nr;
+
+       for (;;) {
+               spin_lock_irq(&q->lock);
+
+               if (likely(list_empty(&wait->task_list))) {
+                       if (lock)
+                               __add_wait_queue_tail_exclusive(q, wait);
+                       else
+                               __add_wait_queue(q, wait);
+                       SetPageWaiters(page);
+               }
+
+               set_current_state(state);
+
+               spin_unlock_irq(&q->lock);
+
+               if (likely(test_bit(bit_nr, &page->flags))) {
+                       io_schedule();
+                       if (unlikely(signal_pending_state(state, current))) {
+                               ret = -EINTR;
+                               break;
+                       }
+               }
+
+               if (lock) {
+                       if (!test_and_set_bit_lock(bit_nr, &page->flags))
+                               break;
+               } else {
+                       if (!test_bit(bit_nr, &page->flags))
+                               break;
+               }
+       }
+
+       finish_wait(q, wait);
+
+       /*
+        * A signal could leave PageWaiters set. Clearing it here if
+        * !waitqueue_active would be possible (by open-coding finish_wait),
+        * but still fail to catch it in the case of wait hash collision. We
+        * already can fail to clear wait hash collision cases, so don't
+        * bother with signals either.
+        */
+
+       return ret;
+}
+
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+       wait_queue_head_t *q = page_waitqueue(page);
+       wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+       wait_queue_head_t *q = page_waitqueue(page);
+       return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
 }
-EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);
 
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
@@ -793,6 +907,7 @@ void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
 
        spin_lock_irqsave(&q->lock, flags);
        __add_wait_queue(q, waiter);
+       SetPageWaiters(page);
        spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL_GPL(add_page_wait_queue);
@@ -874,23 +989,19 @@ EXPORT_SYMBOL_GPL(page_endio);
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
  */
-void __lock_page(struct page *page)
+void __lock_page(struct page *__page)
 {
-       struct page *page_head = compound_head(page);
-       DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
-       __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
-                                                       TASK_UNINTERRUPTIBLE);
+       struct page *page = compound_head(__page);
+       wait_queue_head_t *q = page_waitqueue(page);
+       wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
 }
 EXPORT_SYMBOL(__lock_page);
 
-int __lock_page_killable(struct page *page)
+int __lock_page_killable(struct page *__page)
 {
-       struct page *page_head = compound_head(page);
-       DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
-       return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
-                                       bit_wait_io, TASK_KILLABLE);
+       struct page *page = compound_head(__page);
+       wait_queue_head_t *q = page_waitqueue(page);
+       return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
index 44d68895a9b9b29deefbed2be37e52eb94ba9f33..7aa2ea0a8623c2bac9bb4d29889ad163b58a4195 100644 (file)
@@ -36,6 +36,8 @@
 /* Do not use these with a slab allocator */
 #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
 
+void page_writeback_init(void);
+
 int do_swap_page(struct vm_fault *vmf);
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
index 4dcf852e1e6d8f2e9f0eeca9ee39f620ea972957..844baedd24292f0803ddadb2a2be03a754ec150b 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -69,6 +69,7 @@ static void __page_cache_release(struct page *page)
                del_page_from_lru_list(page, lruvec, page_off_lru(page));
                spin_unlock_irqrestore(zone_lru_lock(zone), flags);
        }
+       __ClearPageWaiters(page);
        mem_cgroup_uncharge(page);
 }
 
@@ -784,6 +785,7 @@ void release_pages(struct page **pages, int nr, bool cold)
 
                /* Clear Active bit in case of parallel mark_page_accessed */
                __ClearPageActive(page);
+               __ClearPageWaiters(page);
 
                list_add(&page->lru, &pages_to_free);
        }