x86/mm: Add barriers and document switch_mm()-vs-flush synchronization

author Andy Lutomirski <luto@kernel.org>

Wed, 6 Jan 2016 20:21:01 +0000 (12:21 -0800)

committer Ingo Molnar <mingo@kernel.org>

Mon, 11 Jan 2016 11:03:15 +0000 (12:03 +0100)
author Andy Lutomirski <luto@kernel.org>
Wed, 6 Jan 2016 20:21:01 +0000 (12:21 -0800)
committer Ingo Molnar <mingo@kernel.org>
Mon, 11 Jan 2016 11:03:15 +0000 (12:03 +0100)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h

index 379cd3658799f3fb0623e5dfca5c3e7fde4a7a23..1edc9cd198b8538bd7d70da33b369d0d7e5743a8 100644 (file)
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,34 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  #endif
                 cpumask_set_cpu(cpu, mm_cpumask(next));
  
-               /* Re-load page tables */
+               /*
+                * Re-load page tables.
+                *
+                * This logic has an ordering constraint:
+                *
+                *  CPU 0: Write to a PTE for 'next'
+                *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+                *  CPU 1: set bit 1 in next's mm_cpumask
+                *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+                *
+                * We need to prevent an outcome in which CPU 1 observes
+                * the new PTE value and CPU 0 observes bit 1 clear in
+                * mm_cpumask.  (If that occurs, then the IPI will never
+                * be sent, and CPU 0's TLB will contain a stale entry.)
+                *
+                * The bad outcome can occur if either CPU's load is
+                * reordered before that CPU's store, so both CPUs much
+                * execute full barriers to prevent this from happening.
+                *
+                * Thus, switch_mm needs a full barrier between the
+                * store to mm_cpumask and any operation that could load
+                * from next->pgd.  This barrier synchronizes with
+                * remote TLB flushers.  Fortunately, load_cr3 is
+                * serializing and thus acts as a full barrier.
+                *
+                */
                 load_cr3(next->pgd);
+
                 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
  
                 /* Stop flush ipis for the previous mm */
@@ -156,10 +182,15 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                          * schedule, protecting us from simultaneous changes.
                          */
                         cpumask_set_cpu(cpu, mm_cpumask(next));
+
                         /*
                          * We were in lazy tlb mode and leave_mm disabled
                          * tlb flush IPI delivery. We must reload CR3
                          * to make sure to use no freed page tables.
+                        *
+                        * As above, this is a barrier that forces
+                        * TLB repopulation to be ordered after the
+                        * store to mm_cpumask.
                          */
                         load_cr3(next->pgd);
                         trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c

index 8ddb5d0d66fb6f6353e735760489a47a0b98dea7..8f4cc3dfac322a2911ab2b2e8471c6e8e87d8af0 100644 (file)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
         preempt_disable();
  
         count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+       /* This is an implicit full barrier that synchronizes with switch_mm. */
         local_flush_tlb();
+
         trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
         if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
                 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
         unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
  
         preempt_disable();
-       if (current->active_mm != mm)
+       if (current->active_mm != mm) {
+               /* Synchronize with switch_mm. */
+               smp_mb();
+
                 goto out;
+       }
  
         if (!current->mm) {
                 leave_mm(smp_processor_id());
+
+               /* Synchronize with switch_mm. */
+               smp_mb();
+
                 goto out;
         }
  
         if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
                 base_pages_to_flush = (end - start) >> PAGE_SHIFT;
  
+       /*
+        * Both branches below are implicit full barriers (MOV to CR or
+        * INVLPG) that synchronize with switch_mm.
+        */
         if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
                 base_pages_to_flush = TLB_FLUSH_ALL;
                 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
         preempt_disable();
  
         if (current->active_mm == mm) {
-               if (current->mm)
+               if (current->mm) {
+                       /*
+                        * Implicit full barrier (INVLPG) that synchronizes
+                        * with switch_mm.
+                        */
                         __flush_tlb_one(start);
-               else
+               } else {
                         leave_mm(smp_processor_id());
+
+                       /* Synchronize with switch_mm. */
+                       smp_mb();
+               }
         }
  
         if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
author	Andy Lutomirski <luto@kernel.org>
	Wed, 6 Jan 2016 20:21:01 +0000 (12:21 -0800)
committer	Ingo Molnar <mingo@kernel.org>
	Mon, 11 Jan 2016 11:03:15 +0000 (12:03 +0100)
arch/x86/include/asm/mmu_context.h		patch \| blob \| history
arch/x86/mm/tlb.c		patch \| blob \| history