arm: build arch memset/memcpy in Thumb2 mode

author Stefan Agner <stefan@agner.ch>
Thu, 18 Dec 2014 17:10:33 +0000 (18:10 +0100)

committer Tom Rini <trini@ti.com>
Fri, 9 Jan 2015 15:20:22 +0000 (10:20 -0500)
diff --git a/arch/arm/config.mk b/arch/arm/config.mk

index c339e6dc8cfbd683b6dfcef59935048e9282920c..0667984b697d62845cb013ba3376cfc5ca01bbf3 100644 (file)
--- a/arch/arm/config.mk
+++ b/arch/arm/config.mk
@@ -26,7 +26,9 @@ PLATFORM_CPPFLAGS += -D__ARM__
  
  # Choose between ARM/Thumb instruction sets
  ifeq ($(CONFIG_SYS_THUMB_BUILD),y)
-PF_CPPFLAGS_ARM := $(call cc-option, -mthumb -mthumb-interwork,\
+AFLAGS_IMPLICIT_IT     := $(call as-option,-Wa$(comma)-mimplicit-it=always)
+PF_CPPFLAGS_ARM                := $(AFLAGS_IMPLICIT_IT) \
+                       $(call cc-option, -mthumb -mthumb-interwork,\
                         $(call cc-option,-marm,)\
                         $(call cc-option,-mno-thumb-interwork,)\
                 )
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h

index 5e4789b1452079fee12ff63c923ecdf9bcf17115..11b80fb190ce03165ac505bf4412d485a30b4840 100644 (file)
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -14,12 +14,14 @@
   *  assembler source.
   */
  
+#include <config.h>
+
  /*
   * Endian independent macros for shifting bytes within registers.
   */
  #ifndef __ARMEB__
-#define pull           lsr
-#define push           lsl
+#define lspull         lsr
+#define lspush         lsl
  #define get_byte_0     lsl #0
  #define get_byte_1     lsr #8
  #define get_byte_2     lsr #16
@@ -29,8 +31,8 @@
  #define put_byte_2     lsl #16
  #define put_byte_3     lsl #24
  #else
-#define pull           lsl
-#define push           lsr
+#define lspull         lsl
+#define lspush         lsr
  #define get_byte_0     lsr #24
  #define get_byte_1     lsr #16
  #define get_byte_2     lsr #8
@@ -54,7 +56,28 @@
  #define PLD(code...)
  #endif
  
+       .irp    c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
+       .macro  ret\c, reg
+#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__)
+       mov\c   pc, \reg
+#else
+       .ifeqs  "\reg", "lr"
+       bx\c    \reg
+       .else
+       mov\c   pc, \reg
+       .endif
+#endif
+       .endm
+       .endr
+
  /*
- * Cache alligned
+ * Cache aligned, used for optimized memcpy/memset.
+ * In the kernel this is only enabled for Feroceon CPUs.
+ * We disable it for Thumb builds in particular, since those
+ * instructions are not written in a Thumb-compatible way.
   */
+#ifdef CONFIG_SYS_THUMB_BUILD
+#define CALGN(code...)
+#else
  #define CALGN(code...) code
+#endif
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S

index f655256b5d1514f949b880c322cb225a73cd538b..eeaf0035297efb010dbf14e3848e619f5f2b6d50 100644 (file)
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -10,9 +10,14 @@
   *  published by the Free Software Foundation.
   */
  
+#include <linux/linkage.h>
  #include <asm/assembler.h>
  
+#ifdef CONFIG_SYS_THUMB_BUILD
+#define W(instr)       instr.w
+#else
  #define W(instr)       instr
+#endif
  
  #define LDR1W_SHIFT    0
  #define STR1W_SHIFT    0
@@ -30,7 +35,7 @@
         .endm
  
         .macro ldr1b ptr reg cond=al abort
-       ldr\cond\()b \reg, [\ptr], #1
+       ldrb\cond\() \reg, [\ptr], #1
         .endm
  
         .macro str1w ptr reg abort
@@ -42,7 +47,7 @@
         .endm
  
         .macro str1b ptr reg cond=al abort
-       str\cond\()b \reg, [\ptr], #1
+       strb\cond\() \reg, [\ptr], #1
         .endm
  
         .macro enter reg1 reg2
@@ -56,10 +61,12 @@
         .text
  
  /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
-
-.globl memcpy
-memcpy:
-
+       .syntax unified
+#ifdef CONFIG_SYS_THUMB_BUILD
+       .thumb
+       .thumb_func
+#endif
+ENTRY(memcpy)
                 cmp     r0, r1
                 moveq   pc, lr
  
@@ -79,7 +86,7 @@ memcpy:
  
         CALGN(  ands    ip, r0, #31             )
         CALGN(  rsb     r3, ip, #32             )
-       CALGN(  sbcnes  r4, r3, r2              )  @ C is always set here
+       CALGN(  sbcsne  r4, r3, r2              )  @ C is always set here
         CALGN(  bcs     2f                      )
         CALGN(  adr     r4, 6f                  )
         CALGN(  subs    r2, r2, r3              )  @ C gets set
@@ -178,7 +185,7 @@ memcpy:
  
         CALGN(  ands    ip, r0, #31             )
         CALGN(  rsb     ip, ip, #32             )
-       CALGN(  sbcnes  r4, ip, r2              )  @ C is always set here
+       CALGN(  sbcsne  r4, ip, r2              )  @ C is always set here
         CALGN(  subcc   r2, r2, ip              )
         CALGN(  bcc     15f                     )
  
@@ -193,24 +200,24 @@ memcpy:
  
  12:    PLD(    pld     [r1, #124]              )
  13:            ldr4w   r1, r4, r5, r6, r7, abort=19f
-               mov     r3, lr, pull #\pull
+               mov     r3, lr, lspull #\pull
                 subs    r2, r2, #32
                 ldr4w   r1, r8, r9, ip, lr, abort=19f
-               orr     r3, r3, r4, push #\push
-               mov     r4, r4, pull #\pull
-               orr     r4, r4, r5, push #\push
-               mov     r5, r5, pull #\pull
-               orr     r5, r5, r6, push #\push
-               mov     r6, r6, pull #\pull
-               orr     r6, r6, r7, push #\push
-               mov     r7, r7, pull #\pull
-               orr     r7, r7, r8, push #\push
-               mov     r8, r8, pull #\pull
-               orr     r8, r8, r9, push #\push
-               mov     r9, r9, pull #\pull
-               orr     r9, r9, ip, push #\push
-               mov     ip, ip, pull #\pull
-               orr     ip, ip, lr, push #\push
+               orr     r3, r3, r4, lspush #\push
+               mov     r4, r4, lspull #\pull
+               orr     r4, r4, r5, lspush #\push
+               mov     r5, r5, lspull #\pull
+               orr     r5, r5, r6, lspush #\push
+               mov     r6, r6, lspull #\pull
+               orr     r6, r6, r7, lspush #\push
+               mov     r7, r7, lspull #\pull
+               orr     r7, r7, r8, lspush #\push
+               mov     r8, r8, lspull #\pull
+               orr     r8, r8, r9, lspush #\push
+               mov     r9, r9, lspull #\pull
+               orr     r9, r9, ip, lspush #\push
+               mov     ip, ip, lspull #\pull
+               orr     ip, ip, lr, lspush #\push
                 str8w   r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
                 bge     12b
         PLD(    cmn     r2, #96                 )
@@ -221,10 +228,10 @@ memcpy:
  14:            ands    ip, r2, #28
                 beq     16f
  
-15:            mov     r3, lr, pull #\pull
+15:            mov     r3, lr, lspull #\pull
                 ldr1w   r1, lr, abort=21f
                 subs    ip, ip, #4
-               orr     r3, r3, lr, push #\push
+               orr     r3, r3, lr, lspush #\push
                 str1w   r0, r3, abort=21f
                 bgt     15b
         CALGN(  cmp     r2, #0                  )
@@ -241,3 +248,24 @@ memcpy:
  17:            forward_copy_shift      pull=16 push=16
  
  18:            forward_copy_shift      pull=24 push=8
+
+
+/*
+ * Abort preamble and completion macros.
+ * If a fixup handler is required then those macros must surround it.
+ * It is assumed that the fixup code will handle the private part of
+ * the exit macro.
+ */
+
+       .macro  copy_abort_preamble
+19:    ldmfd   sp!, {r5 - r9}
+       b       21f
+20:    ldmfd   sp!, {r5 - r8}
+21:
+       .endm
+
+       .macro  copy_abort_end
+       ldmfd   sp!, {r4, pc}
+       .endm
+
+ENDPROC(memcpy)
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S

index 0cdf89535ae76c18aa33bf799e917024af3289ec..7208f20dda4bc16b7a226a753ab5d4ad17e5eaec 100644 (file)
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -9,32 +9,25 @@
   *
   *  ASM optimised string functions
   */
+#include <linux/linkage.h>
  #include <asm/assembler.h>
  
         .text
         .align  5
-       .word   0
  
-1:     subs    r2, r2, #4              @ 1 do we have enough
-       blt     5f                      @ 1 bytes to align with?
-       cmp     r3, #2                  @ 1
-       strltb  r1, [r0], #1            @ 1
-       strleb  r1, [r0], #1            @ 1
-       strb    r1, [r0], #1            @ 1
-       add     r2, r2, r3              @ 1 (r2 = r2 - (4 - r3))
-/*
- * The pointer is now aligned and the length is adjusted.  Try doing the
- * memset again.
- */
-
-.globl memset
-memset:
+       .syntax unified
+#ifdef CONFIG_SYS_THUMB_BUILD
+       .thumb
+       .thumb_func
+#endif
+ENTRY(memset)
         ands    r3, r0, #3              @ 1 unaligned?
-       bne     1b                      @ 1
+       mov     ip, r0                  @ preserve r0 as return value
+       bne     6f                      @ 1
  /*
- * we know that the pointer in r0 is aligned to a word boundary.
+ * we know that the pointer in ip is aligned to a word boundary.
   */
-       orr     r1, r1, r1, lsl #8
+1:     orr     r1, r1, r1, lsl #8
         orr     r1, r1, r1, lsl #16
         mov     r3, r1
         cmp     r2, #16
@@ -43,29 +36,28 @@ memset:
  #if ! CALGN(1)+0
  
  /*
- * We need an extra register for this loop - save the return address and
- * use the LR
+ * We need 2 extra registers for this loop - use r8 and the LR
   */
-       str     lr, [sp, #-4]!
-       mov     ip, r1
+       stmfd   sp!, {r8, lr}
+       mov     r8, r1
         mov     lr, r1
  
  2:     subs    r2, r2, #64
-       stmgeia r0!, {r1, r3, ip, lr}   @ 64 bytes at a time.
-       stmgeia r0!, {r1, r3, ip, lr}
-       stmgeia r0!, {r1, r3, ip, lr}
-       stmgeia r0!, {r1, r3, ip, lr}
+       stmiage ip!, {r1, r3, r8, lr}   @ 64 bytes at a time.
+       stmiage ip!, {r1, r3, r8, lr}
+       stmiage ip!, {r1, r3, r8, lr}
+       stmiage ip!, {r1, r3, r8, lr}
         bgt     2b
-       ldmeqfd sp!, {pc}               @ Now <64 bytes to go.
+       ldmfdeq sp!, {r8, pc}           @ Now <64 bytes to go.
  /*
   * No need to correct the count; we're only testing bits from now on
   */
         tst     r2, #32
-       stmneia r0!, {r1, r3, ip, lr}
-       stmneia r0!, {r1, r3, ip, lr}
+       stmiane ip!, {r1, r3, r8, lr}
+       stmiane ip!, {r1, r3, r8, lr}
         tst     r2, #16
-       stmneia r0!, {r1, r3, ip, lr}
-       ldr     lr, [sp], #4
+       stmiane ip!, {r1, r3, r8, lr}
+       ldmfd   sp!, {r8, lr}
  
  #else
  
@@ -74,53 +66,63 @@ memset:
   * whole cache lines at once.
   */
  
-       stmfd   sp!, {r4-r7, lr}
+       stmfd   sp!, {r4-r8, lr}
         mov     r4, r1
         mov     r5, r1
         mov     r6, r1
         mov     r7, r1
-       mov     ip, r1
+       mov     r8, r1
         mov     lr, r1
  
         cmp     r2, #96
-       tstgt   r0, #31
+       tstgt   ip, #31
         ble     3f
  
-       and     ip, r0, #31
-       rsb     ip, ip, #32
-       sub     r2, r2, ip
-       movs    ip, ip, lsl #(32 - 4)
-       stmcsia r0!, {r4, r5, r6, r7}
-       stmmiia r0!, {r4, r5}
-       tst     ip, #(1 << 30)
-       mov     ip, r1
-       strne   r1, [r0], #4
+       and     r8, ip, #31
+       rsb     r8, r8, #32
+       sub     r2, r2, r8
+       movs    r8, r8, lsl #(32 - 4)
+       stmiacs ip!, {r4, r5, r6, r7}
+       stmiami ip!, {r4, r5}
+       tst     r8, #(1 << 30)
+       mov     r8, r1
+       strne   r1, [ip], #4
  
  3:     subs    r2, r2, #64
-       stmgeia r0!, {r1, r3-r7, ip, lr}
-       stmgeia r0!, {r1, r3-r7, ip, lr}
+       stmiage ip!, {r1, r3-r8, lr}
+       stmiage ip!, {r1, r3-r8, lr}
         bgt     3b
-       ldmeqfd sp!, {r4-r7, pc}
+       ldmfdeq sp!, {r4-r8, pc}
  
         tst     r2, #32
-       stmneia r0!, {r1, r3-r7, ip, lr}
+       stmiane ip!, {r1, r3-r8, lr}
         tst     r2, #16
-       stmneia r0!, {r4-r7}
-       ldmfd   sp!, {r4-r7, lr}
+       stmiane ip!, {r4-r7}
+       ldmfd   sp!, {r4-r8, lr}
  
  #endif
  
  4:     tst     r2, #8
-       stmneia r0!, {r1, r3}
+       stmiane ip!, {r1, r3}
         tst     r2, #4
-       strne   r1, [r0], #4
+       strne   r1, [ip], #4
  /*
   * When we get here, we've got less than 4 bytes to zero.  We
   * may have an unaligned pointer as well.
   */
  5:     tst     r2, #2
-       strneb  r1, [r0], #1
-       strneb  r1, [r0], #1
+       strbne  r1, [ip], #1
+       strbne  r1, [ip], #1
         tst     r2, #1
-       strneb  r1, [r0], #1
-       mov     pc, lr
+       strbne  r1, [ip], #1
+       ret     lr
+
+6:     subs    r2, r2, #4              @ 1 do we have enough
+       blt     5b                      @ 1 bytes to align with?
+       cmp     r3, #2                  @ 1
+       strblt  r1, [ip], #1            @ 1
+       strble  r1, [ip], #1            @ 1
+       strb    r1, [ip], #1            @ 1
+       add     r2, r2, r3              @ 1 (r2 = r2 - (4 - r3))
+       b       1b
+ENDPROC(memset)
author	Stefan Agner <stefan@agner.ch>
	Thu, 18 Dec 2014 17:10:33 +0000 (18:10 +0100)
committer	Tom Rini <trini@ti.com>
	Fri, 9 Jan 2015 15:20:22 +0000 (10:20 -0500)
arch/arm/config.mk		patch \| blob \| history
arch/arm/include/asm/assembler.h		patch \| blob \| history
arch/arm/lib/memcpy.S		patch \| blob \| history
arch/arm/lib/memset.S		patch \| blob \| history