xen: use iret directly when possible

author Jeremy Fitzhardinge <jeremy@xensource.com>

Wed, 18 Jul 2007 01:37:07 +0000 (18:37 -0700)

committer Jeremy Fitzhardinge <jeremy@goop.org>

Wed, 18 Jul 2007 15:47:46 +0000 (08:47 -0700)
author Jeremy Fitzhardinge <jeremy@xensource.com>
Wed, 18 Jul 2007 01:37:07 +0000 (18:37 -0700)
committer Jeremy Fitzhardinge <jeremy@goop.org>
Wed, 18 Jul 2007 15:47:46 +0000 (08:47 -0700)
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c

index a7c2947b39661507e8cfcefea522519ed6a5ea4c..25f7eb513928d7727ce07fb8a5ceb45f97cb6260 100644 (file)
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -61,6 +61,7 @@ void foo(void)
         OFFSET(TI_addr_limit, thread_info, addr_limit);
         OFFSET(TI_restart_block, thread_info, restart_block);
         OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+       OFFSET(TI_cpu, thread_info, cpu);
         BLANK();
  
         OFFSET(GDS_size, Xgt_desc_struct, size);
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S

index ffb23654427073fd241e66b2f0402b09f358f2f2..32980b8349357492b195a890a9486ee0c60d9141 100644 (file)
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1030,7 +1030,21 @@ ENTRY(xen_hypervisor_callback)
         CFI_ADJUST_CFA_OFFSET 4
         SAVE_ALL
         TRACE_IRQS_OFF
-       mov %esp, %eax
+
+       /* Check to see if we got the event in the critical
+          region in xen_iret_direct, after we've reenabled
+          events and checked for pending events.  This simulates
+          iret instruction's behaviour where it delivers a
+          pending interrupt when enabling interrupts. */
+       movl PT_EIP(%esp),%eax
+       cmpl $xen_iret_start_crit,%eax
+       jb   1f
+       cmpl $xen_iret_end_crit,%eax
+       jae  1f
+
+       call xen_iret_crit_fixup
+
+1:     mov %esp, %eax
         call xen_evtchn_do_upcall
         jmp  ret_from_intr
         CFI_ENDPROC
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c

index 4fa62a4cb7cc7d7a5df980dd50ba37bd9f318f87..9a8c1181c001cec6c6105bd40dd2875e9e312074 100644 (file)
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -838,6 +838,7 @@ void __init xen_setup_vcpu_info_placement(void)
                 paravirt_ops.irq_disable = xen_irq_disable_direct;
                 paravirt_ops.irq_enable = xen_irq_enable_direct;
                 paravirt_ops.read_cr2 = xen_read_cr2_direct;
+               paravirt_ops.iret = xen_iret_direct;
         }
  }
  
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S

index dc4d36d51bc15740f99bd5daf63261323c5405e0..1a43b60c0c62533cf481d50591166d6530b35f32 100644 (file)
--- a/arch/i386/xen/xen-asm.S
+++ b/arch/i386/xen/xen-asm.S
@@ -12,15 +12,21 @@
   */
  
  #include <linux/linkage.h>
+
  #include <asm/asm-offsets.h>
  #include <asm/thread_info.h>
  #include <asm/percpu.h>
-#include <asm/asm-offsets.h>
  #include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
  
  #define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
  #define ENDPATCH(x)    .globl x##_end; x##_end=.
  
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
  /*
         Enable events.  This clears the event mask and tests the pending
         event status with one and operation.  If there are pending
@@ -81,13 +87,12 @@ ENDPATCH(xen_save_fl_direct)
   */
  ENTRY(xen_restore_fl_direct)
         testb $X86_EFLAGS_IF>>8, %ah
-       setz %al
-       movb %al, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+       setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
         /* Preempt here doesn't matter because that will deal with
            any pending interrupts.  The pending check may end up being
            run on the wrong CPU, but that doesn't hurt. */
  
-       /* check for pending but unmasked */
+       /* check for unmasked and pending */
         cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
         jz 1f
  2:     call check_events
@@ -97,6 +102,178 @@ ENDPATCH(xen_restore_fl_direct)
         ENDPROC(xen_restore_fl_direct)
         RELOC(xen_restore_fl_direct, 2b+1)
  
+/*
+       This is run where a normal iret would be run, with the same stack setup:
+             8: eflags
+             4: cs
+       esp-> 0: eip
+
+       This attempts to make sure that any pending events are dealt
+       with on return to usermode, but there is a small window in
+       which an event can happen just before entering usermode.  If
+       the nested interrupt ends up setting one of the TIF_WORK_MASK
+       pending work flags, they will not be tested again before
+       returning to usermode. This means that a process can end up
+       with pending work, which will be unprocessed until the process
+       enters and leaves the kernel again, which could be an
+       unbounded amount of time.  This means that a pending signal or
+       reschedule event could be indefinitely delayed.
+
+       The fix is to notice a nested interrupt in the critical
+       window, and if one occurs, then fold the nested interrupt into
+       the current interrupt stack frame, and re-process it
+       iteratively rather than recursively.  This means that it will
+       exit via the normal path, and all pending work will be dealt
+       with appropriately.
+
+       Because the nested interrupt handler needs to deal with the
+       current stack state in whatever form it's in, we keep things
+       simple by only using a single register which is pushed/popped
+       on the stack.
+
+       Non-direct iret could be done in the same way, but it would
+       require an annoying amount of code duplication.  We'll assume
+       that direct mode will be the common case once the hypervisor
+       support becomes commonplace.
+ */
+ENTRY(xen_iret_direct)
+       /* test eflags for special cases */
+       testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+       jnz hyper_iret
+
+       push %eax
+       ESP_OFFSET=4    # bytes pushed onto stack
+
+       /* Store vcpu_info pointer for easy access.  Do it this
+          way to avoid having to reload %fs */
+#ifdef CONFIG_SMP
+       GET_THREAD_INFO(%eax)
+       movl TI_cpu(%eax),%eax
+       movl __per_cpu_offset(,%eax,4),%eax
+       lea per_cpu__xen_vcpu_info(%eax),%eax
+#else
+       movl $per_cpu__xen_vcpu_info, %eax
+#endif
+
+       /* check IF state we're restoring */
+       testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+       /* Maybe enable events.  Once this happens we could get a
+          recursive event, so the critical region starts immediately
+          afterwards.  However, if that happens we don't end up
+          resuming the code, so we don't have to be worried about
+          being preempted to another CPU. */
+       setz XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+       /* check for unmasked and pending */
+       cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+
+       /* If there's something pending, mask events again so we
+          can jump back into xen_hypervisor_callback */
+       sete XEN_vcpu_info_mask(%eax)
+
+       popl %eax
+
+       /* From this point on the registers are restored and the stack
+          updated, so we don't need to worry about it if we're preempted */
+iret_restore_end:
+
+       /* Jump to hypervisor_callback after fixing up the stack.
+          Events are masked, so jumping out of the critical
+          region is OK. */
+       je xen_hypervisor_callback
+
+       iret
+xen_iret_end_crit:
+
+hyper_iret:
+       /* put this out of line since it's very rarely used */
+       jmp hypercall_page + __HYPERVISOR_iret * 32
+
+       .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+   This is called by xen_hypervisor_callback in entry.S when it sees
+   that the EIP at the time of interrupt was between xen_iret_start_crit
+   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
+   a more refined determination of what to do.
+
+   The stack format at this point is:
+       ----------------
+        ss             : (ss/esp may be present if we came from usermode)
+        esp            :
+        eflags         }  outer exception info
+        cs             }
+        eip            }
+       ---------------- <- edi (copy dest)
+        eax            :  outer eax if it hasn't been restored
+       ----------------
+        eflags         }  nested exception info
+        cs             }   (no ss/esp because we're nested
+        eip            }    from the same ring)
+        orig_eax       }<- esi (copy src)
+        - - - - - - - -
+        fs             }
+        es             }
+        ds             }  SAVE_ALL state
+        eax            }
+         :             :
+        ebx            }
+       ----------------
+        return addr     <- esp
+       ----------------
+
+   In order to deliver the nested exception properly, we need to shift
+   everything from the return addr up to the error code so it
+   sits just under the outer exception info.  This means that when we
+   handle the exception, we do it in the context of the outer exception
+   rather than starting a new one.
+
+   The only caveat is that if the outer eax hasn't been
+   restored yet (ie, it's still on stack), we need to insert
+   its value into the SAVE_ALL state before going on, since
+   it's usermode state which we eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+       /* offsets +4 for return address */
+
+       /*
+          Paranoia: Make sure we're really coming from kernel space.
+          One could imagine a case where userspace jumps into the
+          critical range address, but just before the CPU delivers a GP,
+          it decides to deliver an interrupt instead.  Unlikely?
+          Definitely.  Easy to avoid?  Yes.  The Intel documents
+          explicitly say that the reported EIP for a bad jump is the
+          jump instruction itself, not the destination, but some virtual
+          environments get this wrong.
+        */
+       movl PT_CS+4(%esp), %ecx
+       andl $SEGMENT_RPL_MASK, %ecx
+       cmpl $USER_RPL, %ecx
+       je 2f
+
+       lea PT_ORIG_EAX+4(%esp), %esi
+       lea PT_EFLAGS+4(%esp), %edi
+
+       /* If eip is before iret_restore_end then stack
+          hasn't been restored yet. */
+       cmp $iret_restore_end, %eax
+       jae 1f
+
+       movl 0+4(%edi),%eax             /* copy EAX */
+       movl %eax, PT_EAX+4(%esp)
+
+       lea ESP_OFFSET(%edi),%edi       /* move dest up over saved regs */
+
+       /* set up the copy */
+1:     std
+       mov $(PT_EIP+4) / 4, %ecx       /* copy ret+saved regs up to orig_eax */
+       rep movsl
+       cld
+
+       lea 4(%edi),%esp                /* point esp to new frame */
+2:     ret
  
  
  /*
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h

index 33e4c8a1628912ddde75ad00d04bdf08ebf404d8..b9aaea45f07f386f55366b1f8ec3e9e65a150ae9 100644 (file)
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -67,4 +67,5 @@ DECL_ASM(void, xen_irq_disable_direct, void);
  DECL_ASM(unsigned long, xen_save_fl_direct, void);
  DECL_ASM(void, xen_restore_fl_direct, unsigned long);
  
+void xen_iret_direct(void);
  #endif /* XEN_OPS_H */
author	Jeremy Fitzhardinge <jeremy@xensource.com>
	Wed, 18 Jul 2007 01:37:07 +0000 (18:37 -0700)
committer	Jeremy Fitzhardinge <jeremy@goop.org>
	Wed, 18 Jul 2007 15:47:46 +0000 (08:47 -0700)
arch/i386/kernel/asm-offsets.c		patch \| blob \| history
arch/i386/kernel/entry.S		patch \| blob \| history
arch/i386/xen/enlighten.c		patch \| blob \| history
arch/i386/xen/xen-asm.S		patch \| blob \| history
arch/i386/xen/xen-ops.h		patch \| blob \| history