Optimize/cleanup BPIALL workaround
author    Dimitris Papastamos <dimitris.papastamos@arm.com>
          Thu, 11 Jan 2018 15:29:36 +0000 (15:29 +0000)
committer Dimitris Papastamos <dimitris.papastamos@arm.com>
          Mon, 29 Jan 2018 09:58:57 +0000 (09:58 +0000)
In the initial implementation of this workaround, we used a dedicated
workaround context to save/restore state.  This patch reduces the memory
footprint, as no additional context is needed.

Additionally, this patch reduces memory loads and stores by 20%,
reduces the instruction count, and exploits static branch prediction to
optimize the SMC path.
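
A rough sketch of the dispatch change on the return path (illustrative
only; the authoritative code is in the diff below).  Previously the
common SMC/sync case sat behind a chain of taken branches on x0; now it
falls through a single not-taken branch on w2, which static branch
prediction favours:

    /* Before: three conditional branches guard the sync (SMC) fall-through */
    tbnz    x0, #1, workaround_bpiall_vbar1_irq
    tbnz    x0, #2, workaround_bpiall_vbar1_fiq
    tbnz    x0, #3, workaround_bpiall_vbar1_serror
    /* sync exception handling */

    /* After: a single forward branch, not taken on the SMC path */
    tbz     w2, #0, workaround_not_sync
    /* sync exception handling falls through immediately */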

Change-Id: Ia9f6bf06fbf8a9037cfe7f1f1fb32e8aec38ec7d
Signed-off-by: Dimitris Papastamos <dimitris.papastamos@arm.com>
include/lib/el3_runtime/aarch64/context.h
lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S

diff --git a/include/lib/el3_runtime/aarch64/context.h b/include/lib/el3_runtime/aarch64/context.h
index 5e212ec3fe44ae18c8384778426f9eaafe3487e1..5f6bdc97af6a322a7d1407ba11139036dddd8a6d 100644
--- a/include/lib/el3_runtime/aarch64/context.h
+++ b/include/lib/el3_runtime/aarch64/context.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2018, ARM Limited and Contributors. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
 #define CTX_GPREG_SP_EL0       U(0xf8)
 #define CTX_GPREGS_END         U(0x100)
 
-#if WORKAROUND_CVE_2017_5715
-#define CTX_CVE_2017_5715_OFFSET       (CTX_GPREGS_OFFSET + CTX_GPREGS_END)
-#define CTX_CVE_2017_5715_QUAD0                U(0x0)
-#define CTX_CVE_2017_5715_QUAD1                U(0x8)
-#define        CTX_CVE_2017_5715_QUAD2         U(0x10)
-#define CTX_CVE_2017_5715_QUAD3                U(0x18)
-#define CTX_CVE_2017_5715_QUAD4                U(0x20)
-#define CTX_CVE_2017_5715_QUAD5                U(0x28)
-#define CTX_CVE_2017_5715_END          U(0x30)
-#else
-#define CTX_CVE_2017_5715_OFFSET       CTX_GPREGS_OFFSET
-#define CTX_CVE_2017_5715_END          CTX_GPREGS_END
-#endif
-
 /*******************************************************************************
  * Constants that allow assembler code to access members of the 'el3_state'
  * structure at their correct offsets. Note that some of the registers are only
  * 32-bits wide but are stored as 64-bit values for convenience
  ******************************************************************************/
-#define CTX_EL3STATE_OFFSET    (CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_END)
+#define CTX_EL3STATE_OFFSET    (CTX_GPREGS_OFFSET + CTX_GPREGS_END)
 #define CTX_SCR_EL3            U(0x0)
 #define CTX_RUNTIME_SP         U(0x8)
 #define CTX_SPSR_EL3           U(0x10)
 
 /* Constants to determine the size of individual context structures */
 #define CTX_GPREG_ALL          (CTX_GPREGS_END >> DWORD_SHIFT)
-#if WORKAROUND_CVE_2017_5715
-#define CTX_CVE_2017_5715_ALL  (CTX_CVE_2017_5715_END >> DWORD_SHIFT)
-#endif
 #define CTX_SYSREG_ALL         (CTX_SYSREGS_END >> DWORD_SHIFT)
 #if CTX_INCLUDE_FPREGS
 #define CTX_FPREG_ALL          (CTX_FPREGS_END >> DWORD_SHIFT)
  */
 DEFINE_REG_STRUCT(gp_regs, CTX_GPREG_ALL);
 
-#if WORKAROUND_CVE_2017_5715
-DEFINE_REG_STRUCT(cve_2017_5715_regs, CTX_CVE_2017_5715_ALL);
-#endif
-
 /*
  * AArch64 EL1 system register context structure for preserving the
  * architectural state during switches from one security state to
@@ -263,9 +242,6 @@ DEFINE_REG_STRUCT(el3_state, CTX_EL3STATE_ALL);
  */
 typedef struct cpu_context {
        gp_regs_t gpregs_ctx;
-#if WORKAROUND_CVE_2017_5715
-       cve_2017_5715_regs_t cve_2017_5715_regs_ctx;
-#endif
        el3_state_t el3state_ctx;
        el1_sys_regs_t sysregs_ctx;
 #if CTX_INCLUDE_FPREGS
diff --git a/lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S b/lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S
index cd29266ed7d73303700ed70d5f775ba85503286f..9677e2e05daff9721118b459fcb8aba43de51630 100644
--- a/lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S
+++ b/lib/cpus/aarch64/workaround_cve_2017_5715_bpiall.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2017-2018, ARM Limited and Contributors. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
        .globl  workaround_bpiall_vbar0_runtime_exceptions
 
 #define EMIT_BPIALL            0xee070fd5
-#define EMIT_MOV_R0_IMM(v)     0xe3a0000##v
 #define EMIT_SMC               0xe1600070
 
-       .macro  enter_workaround _stub_name
+       .macro  enter_workaround _from_vector
+       /*
+        * Save register state to enable a call to AArch32 S-EL1 and return.
+        * Identify the original calling vector in w2 (== _from_vector).
+        * Use w3-w6 for additional register state preservation while in S-EL1.
+        */
+
        /* Save GP regs */
        stp     x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
        stp     x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
        stp     x26, x27, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X26]
        stp     x28, x29, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X28]
 
-       adr     x4, \_stub_name
+       /* Identify the original exception vector */
+       mov     w2, \_from_vector
+
+       /* Preserve 32-bit system registers in GP registers through the workaround */
+       mrs     x3, esr_el3
+       mrs     x4, spsr_el3
+       mrs     x5, scr_el3
+       mrs     x6, sctlr_el1
 
        /*
-        * Load SPSR_EL3 and VBAR_EL3.  SPSR_EL3 is set up to have
-        * all interrupts masked in preparation to running the workaround
-        * stub in S-EL1.  VBAR_EL3 points to the vector table that
-        * will handle the SMC back from the workaround stub.
+        * Preserve LR and ELR_EL3 registers in the GP regs context.
+        * Temporarily use the CTX_GPREG_SP_EL0 slot to preserve ELR_EL3
+        * through the workaround. This is OK because at this point the
+        * current state for this context's SP_EL0 is in the live system
+        * register, which is unmodified by the workaround.
         */
-       ldp     x0, x1, [x4, #0]
+       mrs     x7, elr_el3
+       stp     x30, x7, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
 
        /*
-        * Load SCTLR_EL1 and ELR_EL3.  SCTLR_EL1 is configured to disable
-        * the MMU in S-EL1.  ELR_EL3 points to the appropriate stub in S-EL1.
+        * Load system registers for entry to S-EL1.
         */
-       ldp     x2, x3, [x4, #16]
 
-       mrs     x4, scr_el3
-       mrs     x5, spsr_el3
-       mrs     x6, elr_el3
-       mrs     x7, sctlr_el1
-       mrs     x8, esr_el3
+       /* Mask all interrupts and set AArch32 Supervisor mode */
+       movz    w8, SPSR_MODE32(MODE32_svc, SPSR_T_ARM, SPSR_E_LITTLE, SPSR_AIF_MASK)
+
+       /* Switch EL3 exception vectors while the workaround is executing. */
+       adr     x9, workaround_bpiall_vbar1_runtime_exceptions
+
+       /* Setup SCTLR_EL1 with MMU off and I$ on */
+       ldr     x10, stub_sel1_sctlr
 
-       /* Preserve system registers in the workaround context */
-       stp     x4, x5, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD0]
-       stp     x6, x7, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD2]
-       stp     x8, x30, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD4]
+       /* Land at the S-EL1 workaround stub */
+       adr     x11, aarch32_stub
 
        /*
         * Setting SCR_EL3 to all zeroes means that the NS, RW
         * and SMD bits are configured as expected.
         */
        msr     scr_el3, xzr
-
-       /*
-        * Reload system registers with the crafted values
-        * in preparation for entry in S-EL1.
-        */
-       msr     spsr_el3, x0
-       msr     vbar_el3, x1
-       msr     sctlr_el1, x2
-       msr     elr_el3, x3
+       msr     spsr_el3, x8
+       msr     vbar_el3, x9
+       msr     sctlr_el1, x10
+       msr     elr_el3, x11
 
        eret
        .endm
@@ -91,76 +99,31 @@ vector_base workaround_bpiall_vbar0_runtime_exceptions
         */
 vector_entry workaround_bpiall_vbar0_sync_exception_sp_el0
        b       sync_exception_sp_el0
+       nop     /* to force 8 byte alignment for the following stub */
+
        /*
         * Since each vector table entry is 128 bytes, we can store the
         * stub context in the unused space to minimize memory footprint.
         */
-aarch32_stub_smc:
+stub_sel1_sctlr:
+       .quad   SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT
+
+aarch32_stub:
        .word   EMIT_BPIALL
-       .word   EMIT_MOV_R0_IMM(1)
        .word   EMIT_SMC
-aarch32_stub_ctx_smc:
-       /* Mask all interrupts and set AArch32 Supervisor mode */
-       .quad   (SPSR_AIF_MASK << SPSR_AIF_SHIFT | \
-                SPSR_M_AARCH32 << SPSR_M_SHIFT | \
-                MODE32_svc << MODE32_SHIFT)
 
-       /*
-        * VBAR_EL3 points to vbar1 which is the vector table
-        * used while the workaround is executing.
-        */
-       .quad   workaround_bpiall_vbar1_runtime_exceptions
-
-       /* Setup SCTLR_EL1 with MMU off and I$ on */
-       .quad   SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT
-
-       /* ELR_EL3 is setup to point to the sync exception stub in AArch32 */
-       .quad   aarch32_stub_smc
        check_vector_size workaround_bpiall_vbar0_sync_exception_sp_el0
 
 vector_entry workaround_bpiall_vbar0_irq_sp_el0
        b       irq_sp_el0
-aarch32_stub_irq:
-       .word   EMIT_BPIALL
-       .word   EMIT_MOV_R0_IMM(2)
-       .word   EMIT_SMC
-aarch32_stub_ctx_irq:
-       .quad   (SPSR_AIF_MASK << SPSR_AIF_SHIFT | \
-                SPSR_M_AARCH32 << SPSR_M_SHIFT | \
-                MODE32_svc << MODE32_SHIFT)
-       .quad   workaround_bpiall_vbar1_runtime_exceptions
-       .quad   SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT
-       .quad   aarch32_stub_irq
        check_vector_size workaround_bpiall_vbar0_irq_sp_el0
 
 vector_entry workaround_bpiall_vbar0_fiq_sp_el0
        b       fiq_sp_el0
-aarch32_stub_fiq:
-       .word   EMIT_BPIALL
-       .word   EMIT_MOV_R0_IMM(4)
-       .word   EMIT_SMC
-aarch32_stub_ctx_fiq:
-       .quad   (SPSR_AIF_MASK << SPSR_AIF_SHIFT | \
-                SPSR_M_AARCH32 << SPSR_M_SHIFT | \
-                MODE32_svc << MODE32_SHIFT)
-       .quad   workaround_bpiall_vbar1_runtime_exceptions
-       .quad   SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT
-       .quad   aarch32_stub_fiq
        check_vector_size workaround_bpiall_vbar0_fiq_sp_el0
 
 vector_entry workaround_bpiall_vbar0_serror_sp_el0
        b       serror_sp_el0
-aarch32_stub_serror:
-       .word   EMIT_BPIALL
-       .word   EMIT_MOV_R0_IMM(8)
-       .word   EMIT_SMC
-aarch32_stub_ctx_serror:
-       .quad   (SPSR_AIF_MASK << SPSR_AIF_SHIFT | \
-                SPSR_M_AARCH32 << SPSR_M_SHIFT | \
-                MODE32_svc << MODE32_SHIFT)
-       .quad   workaround_bpiall_vbar1_runtime_exceptions
-       .quad   SCTLR_AARCH32_EL1_RES1 | SCTLR_I_BIT
-       .quad   aarch32_stub_serror
        check_vector_size workaround_bpiall_vbar0_serror_sp_el0
 
        /* ---------------------------------------------------------------------
@@ -188,19 +151,19 @@ vector_entry workaround_bpiall_vbar0_serror_sp_elx
         * ---------------------------------------------------------------------
         */
 vector_entry workaround_bpiall_vbar0_sync_exception_aarch64
-       enter_workaround aarch32_stub_ctx_smc
+       enter_workaround 1
        check_vector_size workaround_bpiall_vbar0_sync_exception_aarch64
 
 vector_entry workaround_bpiall_vbar0_irq_aarch64
-       enter_workaround aarch32_stub_ctx_irq
+       enter_workaround 2
        check_vector_size workaround_bpiall_vbar0_irq_aarch64
 
 vector_entry workaround_bpiall_vbar0_fiq_aarch64
-       enter_workaround aarch32_stub_ctx_fiq
+       enter_workaround 4
        check_vector_size workaround_bpiall_vbar0_fiq_aarch64
 
 vector_entry workaround_bpiall_vbar0_serror_aarch64
-       enter_workaround aarch32_stub_ctx_serror
+       enter_workaround 8
        check_vector_size workaround_bpiall_vbar0_serror_aarch64
 
        /* ---------------------------------------------------------------------
@@ -208,19 +171,19 @@ vector_entry workaround_bpiall_vbar0_serror_aarch64
         * ---------------------------------------------------------------------
         */
 vector_entry workaround_bpiall_vbar0_sync_exception_aarch32
-       enter_workaround aarch32_stub_ctx_smc
+       enter_workaround 1
        check_vector_size workaround_bpiall_vbar0_sync_exception_aarch32
 
 vector_entry workaround_bpiall_vbar0_irq_aarch32
-       enter_workaround aarch32_stub_ctx_irq
+       enter_workaround 2
        check_vector_size workaround_bpiall_vbar0_irq_aarch32
 
 vector_entry workaround_bpiall_vbar0_fiq_aarch32
-       enter_workaround aarch32_stub_ctx_fiq
+       enter_workaround 4
        check_vector_size workaround_bpiall_vbar0_fiq_aarch32
 
 vector_entry workaround_bpiall_vbar0_serror_aarch32
-       enter_workaround aarch32_stub_ctx_serror
+       enter_workaround 8
        check_vector_size workaround_bpiall_vbar0_serror_aarch32
 
        /* ---------------------------------------------------------------------
@@ -297,31 +260,33 @@ vector_entry workaround_bpiall_vbar1_serror_aarch64
         * ---------------------------------------------------------------------
         */
 vector_entry workaround_bpiall_vbar1_sync_exception_aarch32
-       /* Restore register state from the workaround context */
-       ldp     x2, x3, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD0]
-       ldp     x4, x5, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD2]
-       ldp     x6, x30, [sp, #CTX_CVE_2017_5715_OFFSET + CTX_CVE_2017_5715_QUAD4]
+       /*
+        * w2 indicates which S-EL1 stub was run and thus which original vector was used.
+        * w3-w6 contain the saved system register state (esr_el3 in w3).
+        * Restore the LR and ELR_EL3 register state from the GP regs context.
+        */
+       ldp     x30, x7, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
 
        /* Apply the restored system register state */
-       msr     scr_el3, x2
-       msr     spsr_el3, x3
-       msr     elr_el3, x4
-       msr     sctlr_el1, x5
-       msr     esr_el3, x6
+       msr     esr_el3, x3
+       msr     spsr_el3, x4
+       msr     scr_el3, x5
+       msr     sctlr_el1, x6
+       msr     elr_el3, x7
 
        /*
         * Workaround is complete, so swap VBAR_EL3 to point
         * to workaround entry table in preparation for subsequent
         * Sync/IRQ/FIQ/SError exceptions.
         */
-       adr     x2, workaround_bpiall_vbar0_runtime_exceptions
-       msr     vbar_el3, x2
+       adr     x0, workaround_bpiall_vbar0_runtime_exceptions
+       msr     vbar_el3, x0
 
        /*
-        * Restore all GP regs except x0 and x1.  The value in x0
+        * Restore all GP regs except x2 and x3 (esr).  The value in x2
         * indicates the type of the original exception.
         */
-       ldp     x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
+       ldp     x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
        ldp     x4, x5, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X4]
        ldp     x6, x7, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X6]
        ldp     x8, x9, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X8]
@@ -336,37 +301,38 @@ vector_entry workaround_bpiall_vbar1_sync_exception_aarch32
        ldp     x26, x27, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X26]
        ldp     x28, x29, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X28]
 
-       /*
-        * Each of these handlers will first restore x0 and x1 from
-        * the context and the branch to the common implementation for
-        * each of the exception types.
-        */
-       tbnz    x0, #1, workaround_bpiall_vbar1_irq
-       tbnz    x0, #2, workaround_bpiall_vbar1_fiq
-       tbnz    x0, #3, workaround_bpiall_vbar1_serror
-
-       /* Fallthrough case for Sync exception */
-       ldp     x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
+       /* Fast path Sync exceptions.  Static predictor will fall through. */
+       tbz     w2, #0, workaround_not_sync
+       ldp     x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
        b       sync_exception_aarch64
        check_vector_size workaround_bpiall_vbar1_sync_exception_aarch32
 
 vector_entry workaround_bpiall_vbar1_irq_aarch32
        b       report_unhandled_interrupt
-workaround_bpiall_vbar1_irq:
-       ldp     x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
+
+       /*
+        * Post-workaround fan-out for non-sync exceptions
+        */
+workaround_not_sync:
+       tbnz    w2, #3, workaround_bpiall_vbar1_serror
+       tbnz    w2, #2, workaround_bpiall_vbar1_fiq
+       /* IRQ */
+       ldp     x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
        b       irq_aarch64
+
+workaround_bpiall_vbar1_fiq:
+       ldp     x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
+       b       fiq_aarch64
+
+workaround_bpiall_vbar1_serror:
+       ldp     x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
+       b       serror_aarch64
        check_vector_size workaround_bpiall_vbar1_irq_aarch32
 
 vector_entry workaround_bpiall_vbar1_fiq_aarch32
        b       report_unhandled_interrupt
-workaround_bpiall_vbar1_fiq:
-       ldp     x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
-       b       fiq_aarch64
        check_vector_size workaround_bpiall_vbar1_fiq_aarch32
 
 vector_entry workaround_bpiall_vbar1_serror_aarch32
        b       report_unhandled_exception
-workaround_bpiall_vbar1_serror:
-       ldp     x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
-       b       serror_aarch64
        check_vector_size workaround_bpiall_vbar1_serror_aarch32