alpha: fix alignment problem in csum_ipv6_magic()

author Ivan Kokshaysky <ink@jurassic.park.msu.ru>

Sun, 24 Jun 2007 00:16:35 +0000 (17:16 -0700)

committer Linus Torvalds <torvalds@woody.linux-foundation.org>

Sun, 24 Jun 2007 15:59:11 +0000 (08:59 -0700)
author Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Sun, 24 Jun 2007 00:16:35 +0000 (17:16 -0700)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Sun, 24 Jun 2007 15:59:11 +0000 (08:59 -0700)
diff --git a/arch/alpha/lib/csum_ipv6_magic.S b/arch/alpha/lib/csum_ipv6_magic.S

index e09748dbf2ede7a8fa73223eb1e46d5d1904b65e..2c2acb96deb682de173c0a0b3983f92977d3ef8f 100644 (file)
--- a/arch/alpha/lib/csum_ipv6_magic.S
+++ b/arch/alpha/lib/csum_ipv6_magic.S
@@ -7,6 +7,9 @@
   *                                __u32 len,
   *                                unsigned short proto,
   *                                unsigned int csum);
+ *
+ * Misalignment handling (which costs 16 instructions / 8 cycles)
+ * added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
   */
  
         .globl csum_ipv6_magic
@@ -16,37 +19,57 @@
  csum_ipv6_magic:
         .prologue 0
  
-       ldq     $0,0($16)       # e0    : load src & dst addr words
+       ldq_u   $0,0($16)       # e0    : load src & dst addr words
         zapnot  $20,15,$20      # .. e1 : zero extend incoming csum
         extqh   $18,1,$4        # e0    : byte swap len & proto while we wait
-       ldq     $1,8($16)       # .. e1 :
+       ldq_u   $21,7($16)      # .. e1 : handle misalignment
  
         extbl   $18,1,$5        # e0    :
-       ldq     $2,0($17)       # .. e1 :
+       ldq_u   $1,8($16)       # .. e1 :
         extbl   $18,2,$6        # e0    :
-       ldq     $3,8($17)       # .. e1 :
+       ldq_u   $22,15($16)     # .. e1 :
  
         extbl   $18,3,$18       # e0    :
+       ldq_u   $2,0($17)       # .. e1 :
         sra     $4,32,$4        # e0    :
+       ldq_u   $23,7($17)      # .. e1 :
+
+       extql   $0,$16,$0       # e0    :
+       ldq_u   $3,8($17)       # .. e1 :
+       extqh   $21,$16,$21     # e0    :
+       ldq_u   $24,15($17)     # .. e1 :
+
         sll     $5,16,$5        # e0    :
+       or      $0,$21,$0       # .. e1 : 1st src word complete
+       extql   $1,$16,$1       # e0    :
         addq    $20,$0,$20      # .. e1 : begin summing the words
  
-       sll     $6,8,$6         # e0    :
+       extqh   $22,$16,$22     # e0    :
         cmpult  $20,$0,$0       # .. e1 :
-       extwh   $19,7,$7        # e0    :
-       or      $4,$18,$18      # .. e1 :
+       sll     $6,8,$6         # e0    :
+       or      $1,$22,$1       # .. e1 : 2nd src word complete
  
-       extbl   $19,1,$19       # e0    :
+       extql   $2,$17,$2       # e0    :
+       or      $4,$18,$18      # .. e1 :
+       extqh   $23,$17,$23     # e0    :
         or      $5,$6,$5        # .. e1 :
-       or      $18,$5,$18      # e0    : len complete
-       or      $19,$7,$19      # .. e1 :
  
-       sll     $19,48,$19      # e0    :
+       extql   $3,$17,$3       # e0    :
+       or      $2,$23,$2       # .. e1 : 1st dst word complete
+       extqh   $24,$17,$24     # e0    :
+       or      $18,$5,$18      # .. e1 : len complete
+
+       extwh   $19,7,$7        # e0    :
+       or      $3,$24,$3       # .. e1 : 2nd dst word complete
+       extbl   $19,1,$19       # e0    :
         addq    $20,$1,$20      # .. e1 :
-       sra     $19,32,$19      # e0    : proto complete
+
+       or      $19,$7,$19      # e0    :
         cmpult  $20,$1,$1       # .. e1 :
+       sll     $19,48,$19      # e0    :
+       nop                     # .. e0 :
  
-       nop                     # e0    :
+       sra     $19,32,$19      # e0    : proto complete
         addq    $20,$2,$20      # .. e1 :
         cmpult  $20,$2,$2       # e0    :
         addq    $20,$3,$20      # .. e1 :
@@ -84,7 +107,7 @@ csum_ipv6_magic:
         extwl   $0,2,$1         # e0    : fold 17-bit value
         zapnot  $0,3,$0         # .. e1 :
         addq    $0,$1,$0        # e0    :
-       not     $0,$0           # e1    : and complement.
+       not     $0,$0           # .. e1 : and complement.
  
         zapnot  $0,3,$0         # e0    :
         ret                     # .. e1 :
diff --git a/arch/alpha/lib/ev6-csum_ipv6_magic.S b/arch/alpha/lib/ev6-csum_ipv6_magic.S

index de1948a691182f9c56e4945954dd79878b24f836..fc0bc399f872db671313f07e78ac8d20f5ff6637 100644 (file)
--- a/arch/alpha/lib/ev6-csum_ipv6_magic.S
+++ b/arch/alpha/lib/ev6-csum_ipv6_magic.S
@@ -46,6 +46,10 @@
   * add the 3 low ushorts together, generating a uint
   * a final add of the 2 lower ushorts
   * truncating the result.
+ *
+ * Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
+ * The cost is 16 instructions (~8 cycles), including two extra loads which
+ * may cause additional delay in rare cases (load-load replay traps).
   */
  
         .globl csum_ipv6_magic
@@ -55,25 +59,45 @@
  csum_ipv6_magic:
         .prologue 0
  
-       ldq     $0,0($16)       # L : Latency: 3
+       ldq_u   $0,0($16)       # L : Latency: 3
         inslh   $18,7,$4        # U : 0000000000AABBCC
-       ldq     $1,8($16)       # L : Latency: 3
+       ldq_u   $1,8($16)       # L : Latency: 3
         sll     $19,8,$7        # U : U L U L : 0x00000000 00aabb00
  
+       and     $16,7,$6        # E : src misalignment
+       ldq_u   $5,15($16)      # L : Latency: 3
         zapnot  $20,15,$20      # U : zero extend incoming csum
-       ldq     $2,0($17)       # L : Latency: 3
-       sll     $19,24,$19      # U : U L L U : 0x000000aa bb000000
+       ldq_u   $2,0($17)       # L : U L U L : Latency: 3
+
+       extql   $0,$6,$0        # U :
+       extqh   $1,$6,$22       # U :
+       ldq_u   $3,8($17)       # L : Latency: 3
+       sll     $19,24,$19      # U : U U L U : 0x000000aa bb000000
+
+       cmoveq  $6,$31,$22      # E : src aligned?
+       ldq_u   $23,15($17)     # L : Latency: 3
         inswl   $18,3,$18       # U : 000000CCDD000000
+       addl    $19,$7,$19      # E : U L U L : <sign bits>bbaabb00
  
-       ldq     $3,8($17)       # L : Latency: 3
-       bis     $18,$4,$18      # E : 000000CCDDAABBCC
-       addl    $19,$7,$19      # E : <sign bits>bbaabb00
-       nop                     # E : U L U L
+       or      $0,$22,$0       # E : 1st src word complete
+       extql   $1,$6,$1        # U :
+       or      $18,$4,$18      # E : 000000CCDDAABBCC
+       extqh   $5,$6,$5        # U : L U L U
  
+       and     $17,7,$6        # E : dst misalignment
+       extql   $2,$6,$2        # U :
+       or      $1,$5,$1        # E : 2nd src word complete
+       extqh   $3,$6,$22       # U : L U L U :
+
+       cmoveq  $6,$31,$22      # E : dst aligned?
+       extql   $3,$6,$3        # U :
         addq    $20,$0,$20      # E : begin summing the words
+       extqh   $23,$6,$23      # U : L U L U :
+
         srl     $18,16,$4       # U : 0000000000CCDDAA
+       or      $2,$22,$2       # E : 1st dst word complete
         zap     $19,0x3,$19     # U : <sign bits>bbaa0000
-       nop                     # E : L U U L
+       or      $3,$23,$3       # E : U L U L : 2nd dst word complete
  
         cmpult  $20,$0,$0       # E :
         addq    $20,$1,$20      # E :
author	Ivan Kokshaysky <ink@jurassic.park.msu.ru>
	Sun, 24 Jun 2007 00:16:35 +0000 (17:16 -0700)
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>
	Sun, 24 Jun 2007 15:59:11 +0000 (08:59 -0700)
arch/alpha/lib/csum_ipv6_magic.S		patch \| blob \| history
arch/alpha/lib/ev6-csum_ipv6_magic.S		patch \| blob \| history