i386: prepare shared crypto/aes-i586-asm.S
author Thomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:11:42 +0000 (11:11 +0200)
committer Thomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:11:42 +0000 (11:11 +0200)
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/i386/crypto/Makefile
arch/i386/crypto/aes-i586-asm.S [deleted file]
arch/i386/crypto/aes-i586-asm_32.S [new file with mode: 0644]

index 57fcb333c9c1187b77af07e78de54f9358bb8107..cd1038a22dd3be1e136639ddc35ab50824085bfb 100644 (file)
@@ -7,6 +7,6 @@
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 
-aes-i586-y := aes-i586-asm.o aes.o
+aes-i586-y := aes-i586-asm_32.o aes.o
 twofish-i586-y := twofish-i586-asm.o twofish_32.o
 
diff --git a/arch/i386/crypto/aes-i586-asm.S b/arch/i386/crypto/aes-i586-asm.S
deleted file mode 100644 (file)
index f942f0c..0000000
+++ /dev/null
@@ -1,373 +0,0 @@
-// -------------------------------------------------------------------------
-// Copyright (c) 2001, Dr Brian Gladman <                 >, Worcester, UK.
-// All rights reserved.
-//
-// LICENSE TERMS
-//
-// The free distribution and use of this software in both source and binary 
-// form is allowed (with or without changes) provided that:
-//
-//   1. distributions of this source code include the above copyright 
-//      notice, this list of conditions and the following disclaimer//
-//
-//   2. distributions in binary form include the above copyright
-//      notice, this list of conditions and the following disclaimer
-//      in the documentation and/or other associated materials//
-//
-//   3. the copyright holder's name is not used to endorse products 
-//      built using this software without specific written permission.
-//
-//
-// ALTERNATIVELY, provided that this notice is retained in full, this product
-// may be distributed under the terms of the GNU General Public License (GPL),
-// in which case the provisions of the GPL apply INSTEAD OF those given above.
-//
-// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
-// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
-
-// DISCLAIMER
-//
-// This software is provided 'as is' with no explicit or implied warranties
-// in respect of its properties including, but not limited to, correctness 
-// and fitness for purpose.
-// -------------------------------------------------------------------------
-// Issue Date: 29/07/2002
-
-.file "aes-i586-asm.S"
-.text
-
-#include <asm/asm-offsets.h>
-
-#define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words)
-
-/* offsets to parameters with one register pushed onto stack */
-#define tfm 8
-#define out_blk 12
-#define in_blk 16
-
-/* offsets in crypto_tfm structure */
-#define ekey (crypto_tfm_ctx_offset + 0)
-#define nrnd (crypto_tfm_ctx_offset + 256)
-#define dkey (crypto_tfm_ctx_offset + 260)
-
-// register mapping for encrypt and decrypt subroutines
-
-#define r0  eax
-#define r1  ebx
-#define r2  ecx
-#define r3  edx
-#define r4  esi
-#define r5  edi
-
-#define eaxl  al
-#define eaxh  ah
-#define ebxl  bl
-#define ebxh  bh
-#define ecxl  cl
-#define ecxh  ch
-#define edxl  dl
-#define edxh  dh
-
-#define _h(reg) reg##h
-#define h(reg) _h(reg)
-
-#define _l(reg) reg##l
-#define l(reg) _l(reg)
-
-// This macro takes a 32-bit word representing a column and uses
-// each of its four bytes to index into four tables of 256 32-bit
-// words to obtain values that are then xored into the appropriate
-// output registers r0, r1, r4 or r5.  
-
-// Parameters:
-// table table base address
-//   %1  out_state[0]
-//   %2  out_state[1]
-//   %3  out_state[2]
-//   %4  out_state[3]
-//   idx input register for the round (destroyed)
-//   tmp scratch register for the round
-// sched key schedule
-
-#define do_col(table, a1,a2,a3,a4, idx, tmp)   \
-       movzx   %l(idx),%tmp;                   \
-       xor     table(,%tmp,4),%a1;             \
-       movzx   %h(idx),%tmp;                   \
-       shr     $16,%idx;                       \
-       xor     table+tlen(,%tmp,4),%a2;        \
-       movzx   %l(idx),%tmp;                   \
-       movzx   %h(idx),%idx;                   \
-       xor     table+2*tlen(,%tmp,4),%a3;      \
-       xor     table+3*tlen(,%idx,4),%a4;
-
-// initialise output registers from the key schedule
-// NB1: original value of a3 is in idx on exit
-// NB2: original values of a1,a2,a4 aren't used
-#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
-       mov     0 sched,%a1;                    \
-       movzx   %l(idx),%tmp;                   \
-       mov     12 sched,%a2;                   \
-       xor     table(,%tmp,4),%a1;             \
-       mov     4 sched,%a4;                    \
-       movzx   %h(idx),%tmp;                   \
-       shr     $16,%idx;                       \
-       xor     table+tlen(,%tmp,4),%a2;        \
-       movzx   %l(idx),%tmp;                   \
-       movzx   %h(idx),%idx;                   \
-       xor     table+3*tlen(,%idx,4),%a4;      \
-       mov     %a3,%idx;                       \
-       mov     8 sched,%a3;                    \
-       xor     table+2*tlen(,%tmp,4),%a3;
-
-// initialise output registers from the key schedule
-// NB1: original value of a3 is in idx on exit
-// NB2: original values of a1,a2,a4 aren't used
-#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
-       mov     0 sched,%a1;                    \
-       movzx   %l(idx),%tmp;                   \
-       mov     4 sched,%a2;                    \
-       xor     table(,%tmp,4),%a1;             \
-       mov     12 sched,%a4;                   \
-       movzx   %h(idx),%tmp;                   \
-       shr     $16,%idx;                       \
-       xor     table+tlen(,%tmp,4),%a2;        \
-       movzx   %l(idx),%tmp;                   \
-       movzx   %h(idx),%idx;                   \
-       xor     table+3*tlen(,%idx,4),%a4;      \
-       mov     %a3,%idx;                       \
-       mov     8 sched,%a3;                    \
-       xor     table+2*tlen(,%tmp,4),%a3;
-
-
-// original Gladman had conditional saves to MMX regs.
-#define save(a1, a2)           \
-       mov     %a2,4*a1(%esp)
-
-#define restore(a1, a2)                \
-       mov     4*a2(%esp),%a1
-
-// These macros perform a forward encryption cycle. They are entered with
-// the first previous round column values in r0,r1,r4,r5 and
-// exit with the final values in the same registers, using stack
-// for temporary storage.
-
-// round column values
-// on entry: r0,r1,r4,r5
-// on exit:  r2,r1,r4,r5
-#define fwd_rnd1(arg, table)                                           \
-       save   (0,r1);                                                  \
-       save   (1,r5);                                                  \
-                                                                       \
-       /* compute new column values */                                 \
-       do_fcol(table, r2,r5,r4,r1, r0,r3, arg);        /* idx=r0 */    \
-       do_col (table, r4,r1,r2,r5, r0,r3);             /* idx=r4 */    \
-       restore(r0,0);                                                  \
-       do_col (table, r1,r2,r5,r4, r0,r3);             /* idx=r1 */    \
-       restore(r0,1);                                                  \
-       do_col (table, r5,r4,r1,r2, r0,r3);             /* idx=r5 */
-
-// round column values
-// on entry: r2,r1,r4,r5
-// on exit:  r0,r1,r4,r5
-#define fwd_rnd2(arg, table)                                           \
-       save   (0,r1);                                                  \
-       save   (1,r5);                                                  \
-                                                                       \
-       /* compute new column values */                                 \
-       do_fcol(table, r0,r5,r4,r1, r2,r3, arg);        /* idx=r2 */    \
-       do_col (table, r4,r1,r0,r5, r2,r3);             /* idx=r4 */    \
-       restore(r2,0);                                                  \
-       do_col (table, r1,r0,r5,r4, r2,r3);             /* idx=r1 */    \
-       restore(r2,1);                                                  \
-       do_col (table, r5,r4,r1,r0, r2,r3);             /* idx=r5 */
-
-// These macros performs an inverse encryption cycle. They are entered with
-// the first previous round column values in r0,r1,r4,r5 and
-// exit with the final values in the same registers, using stack
-// for temporary storage
-
-// round column values
-// on entry: r0,r1,r4,r5
-// on exit:  r2,r1,r4,r5
-#define inv_rnd1(arg, table)                                           \
-       save    (0,r1);                                                 \
-       save    (1,r5);                                                 \
-                                                                       \
-       /* compute new column values */                                 \
-       do_icol(table, r2,r1,r4,r5, r0,r3, arg);        /* idx=r0 */    \
-       do_col (table, r4,r5,r2,r1, r0,r3);             /* idx=r4 */    \
-       restore(r0,0);                                                  \
-       do_col (table, r1,r4,r5,r2, r0,r3);             /* idx=r1 */    \
-       restore(r0,1);                                                  \
-       do_col (table, r5,r2,r1,r4, r0,r3);             /* idx=r5 */
-
-// round column values
-// on entry: r2,r1,r4,r5
-// on exit:  r0,r1,r4,r5
-#define inv_rnd2(arg, table)                                           \
-       save    (0,r1);                                                 \
-       save    (1,r5);                                                 \
-                                                                       \
-       /* compute new column values */                                 \
-       do_icol(table, r0,r1,r4,r5, r2,r3, arg);        /* idx=r2 */    \
-       do_col (table, r4,r5,r0,r1, r2,r3);             /* idx=r4 */    \
-       restore(r2,0);                                                  \
-       do_col (table, r1,r4,r5,r0, r2,r3);             /* idx=r1 */    \
-       restore(r2,1);                                                  \
-       do_col (table, r5,r0,r1,r4, r2,r3);             /* idx=r5 */
-
-// AES (Rijndael) Encryption Subroutine
-/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
-
-.global  aes_enc_blk
-
-.extern  ft_tab
-.extern  fl_tab
-
-.align 4
-
-aes_enc_blk:
-       push    %ebp
-       mov     tfm(%esp),%ebp
-
-// CAUTION: the order and the values used in these assigns 
-// rely on the register mappings
-
-1:     push    %ebx
-       mov     in_blk+4(%esp),%r2
-       push    %esi
-       mov     nrnd(%ebp),%r3   // number of rounds
-       push    %edi
-#if ekey != 0
-       lea     ekey(%ebp),%ebp  // key pointer
-#endif
-
-// input four columns and xor in first round key
-
-       mov     (%r2),%r0
-       mov     4(%r2),%r1
-       mov     8(%r2),%r4
-       mov     12(%r2),%r5
-       xor     (%ebp),%r0
-       xor     4(%ebp),%r1
-       xor     8(%ebp),%r4
-       xor     12(%ebp),%r5
-
-       sub     $8,%esp         // space for register saves on stack
-       add     $16,%ebp        // increment to next round key
-       cmp     $12,%r3
-       jb      4f              // 10 rounds for 128-bit key
-       lea     32(%ebp),%ebp
-       je      3f              // 12 rounds for 192-bit key
-       lea     32(%ebp),%ebp
-
-2:     fwd_rnd1( -64(%ebp) ,ft_tab)    // 14 rounds for 256-bit key
-       fwd_rnd2( -48(%ebp) ,ft_tab)
-3:     fwd_rnd1( -32(%ebp) ,ft_tab)    // 12 rounds for 192-bit key
-       fwd_rnd2( -16(%ebp) ,ft_tab)
-4:     fwd_rnd1(    (%ebp) ,ft_tab)    // 10 rounds for 128-bit key
-       fwd_rnd2( +16(%ebp) ,ft_tab)
-       fwd_rnd1( +32(%ebp) ,ft_tab)
-       fwd_rnd2( +48(%ebp) ,ft_tab)
-       fwd_rnd1( +64(%ebp) ,ft_tab)
-       fwd_rnd2( +80(%ebp) ,ft_tab)
-       fwd_rnd1( +96(%ebp) ,ft_tab)
-       fwd_rnd2(+112(%ebp) ,ft_tab)
-       fwd_rnd1(+128(%ebp) ,ft_tab)
-       fwd_rnd2(+144(%ebp) ,fl_tab)    // last round uses a different table
-
-// move final values to the output array.  CAUTION: the 
-// order of these assigns rely on the register mappings
-
-       add     $8,%esp
-       mov     out_blk+12(%esp),%ebp
-       mov     %r5,12(%ebp)
-       pop     %edi
-       mov     %r4,8(%ebp)
-       pop     %esi
-       mov     %r1,4(%ebp)
-       pop     %ebx
-       mov     %r0,(%ebp)
-       pop     %ebp
-       mov     $1,%eax
-       ret
-
-// AES (Rijndael) Decryption Subroutine
-/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
-
-.global  aes_dec_blk
-
-.extern  it_tab
-.extern  il_tab
-
-.align 4
-
-aes_dec_blk:
-       push    %ebp
-       mov     tfm(%esp),%ebp
-
-// CAUTION: the order and the values used in these assigns 
-// rely on the register mappings
-
-1:     push    %ebx
-       mov     in_blk+4(%esp),%r2
-       push    %esi
-       mov     nrnd(%ebp),%r3   // number of rounds
-       push    %edi
-#if dkey != 0
-       lea     dkey(%ebp),%ebp  // key pointer
-#endif
-       mov     %r3,%r0
-       shl     $4,%r0
-       add     %r0,%ebp
-       
-// input four columns and xor in first round key
-
-       mov     (%r2),%r0
-       mov     4(%r2),%r1
-       mov     8(%r2),%r4
-       mov     12(%r2),%r5
-       xor     (%ebp),%r0
-       xor     4(%ebp),%r1
-       xor     8(%ebp),%r4
-       xor     12(%ebp),%r5
-
-       sub     $8,%esp         // space for register saves on stack
-       sub     $16,%ebp        // increment to next round key
-       cmp     $12,%r3
-       jb      4f              // 10 rounds for 128-bit key
-       lea     -32(%ebp),%ebp
-       je      3f              // 12 rounds for 192-bit key
-       lea     -32(%ebp),%ebp
-
-2:     inv_rnd1( +64(%ebp), it_tab)    // 14 rounds for 256-bit key
-       inv_rnd2( +48(%ebp), it_tab)
-3:     inv_rnd1( +32(%ebp), it_tab)    // 12 rounds for 192-bit key
-       inv_rnd2( +16(%ebp), it_tab)
-4:     inv_rnd1(    (%ebp), it_tab)    // 10 rounds for 128-bit key
-       inv_rnd2( -16(%ebp), it_tab)
-       inv_rnd1( -32(%ebp), it_tab)
-       inv_rnd2( -48(%ebp), it_tab)
-       inv_rnd1( -64(%ebp), it_tab)
-       inv_rnd2( -80(%ebp), it_tab)
-       inv_rnd1( -96(%ebp), it_tab)
-       inv_rnd2(-112(%ebp), it_tab)
-       inv_rnd1(-128(%ebp), it_tab)
-       inv_rnd2(-144(%ebp), il_tab)    // last round uses a different table
-
-// move final values to the output array.  CAUTION: the 
-// order of these assigns rely on the register mappings
-
-       add     $8,%esp
-       mov     out_blk+12(%esp),%ebp
-       mov     %r5,12(%ebp)
-       pop     %edi
-       mov     %r4,8(%ebp)
-       pop     %esi
-       mov     %r1,4(%ebp)
-       pop     %ebx
-       mov     %r0,(%ebp)
-       pop     %ebp
-       mov     $1,%eax
-       ret
-
diff --git a/arch/i386/crypto/aes-i586-asm_32.S b/arch/i386/crypto/aes-i586-asm_32.S
new file mode 100644 (file)
index 0000000..f942f0c
--- /dev/null
@@ -0,0 +1,373 @@
+// -------------------------------------------------------------------------
+// Copyright (c) 2001, Dr Brian Gladman <                 >, Worcester, UK.
+// All rights reserved.
+//
+// LICENSE TERMS
+//
+// The free distribution and use of this software in both source and binary 
+// form is allowed (with or without changes) provided that:
+//
+//   1. distributions of this source code include the above copyright 
+//      notice, this list of conditions and the following disclaimer;
+//
+//   2. distributions in binary form include the above copyright
+//      notice, this list of conditions and the following disclaimer
+//      in the documentation and/or other associated materials;
+//
+//   3. the copyright holder's name is not used to endorse products 
+//      built using this software without specific written permission.
+//
+//
+// ALTERNATIVELY, provided that this notice is retained in full, this product
+// may be distributed under the terms of the GNU General Public License (GPL),
+// in which case the provisions of the GPL apply INSTEAD OF those given above.
+//
+// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
+// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+
+// DISCLAIMER
+//
+// This software is provided 'as is' with no explicit or implied warranties
+// in respect of its properties including, but not limited to, correctness 
+// and fitness for purpose.
+// -------------------------------------------------------------------------
+// Issue Date: 29/07/2002
+
+.file "aes-i586-asm.S"
+.text
+
+#include <asm/asm-offsets.h>
+
+#define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words)
+
+/* offsets to parameters with one register pushed onto stack */
+#define tfm 8
+#define out_blk 12
+#define in_blk 16
+
+/* offsets in crypto_tfm structure */
+#define ekey (crypto_tfm_ctx_offset + 0)
+#define nrnd (crypto_tfm_ctx_offset + 256)
+#define dkey (crypto_tfm_ctx_offset + 260)
+
+// register mapping for encrypt and decrypt subroutines
+
+#define r0  eax
+#define r1  ebx
+#define r2  ecx
+#define r3  edx
+#define r4  esi
+#define r5  edi
+
+#define eaxl  al
+#define eaxh  ah
+#define ebxl  bl
+#define ebxh  bh
+#define ecxl  cl
+#define ecxh  ch
+#define edxl  dl
+#define edxh  dh
+
+#define _h(reg) reg##h
+#define h(reg) _h(reg)
+
+#define _l(reg) reg##l
+#define l(reg) _l(reg)
+
+// This macro takes a 32-bit word representing a column and uses
+// each of its four bytes to index into four tables of 256 32-bit
+// words to obtain values that are then xored into the appropriate
+// output registers r0, r1, r4 or r5.  
+
+// Parameters:
+// table table base address
+//   a1  out_state[0]
+//   a2  out_state[1]
+//   a3  out_state[2]
+//   a4  out_state[3]
+//   idx input register for the round (destroyed)
+//   tmp scratch register for the round
+// sched key schedule (do_fcol and do_icol only)
+
+#define do_col(table, a1,a2,a3,a4, idx, tmp)   \
+       movzx   %l(idx),%tmp;                   \
+       xor     table(,%tmp,4),%a1;             \
+       movzx   %h(idx),%tmp;                   \
+       shr     $16,%idx;                       \
+       xor     table+tlen(,%tmp,4),%a2;        \
+       movzx   %l(idx),%tmp;                   \
+       movzx   %h(idx),%idx;                   \
+       xor     table+2*tlen(,%tmp,4),%a3;      \
+       xor     table+3*tlen(,%idx,4),%a4;
+
+// initialise output registers from the key schedule
+// NB1: original value of a3 is in idx on exit
+// NB2: original values of a1,a2,a4 aren't used
+#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
+       mov     0 sched,%a1;                    \
+       movzx   %l(idx),%tmp;                   \
+       mov     12 sched,%a2;                   \
+       xor     table(,%tmp,4),%a1;             \
+       mov     4 sched,%a4;                    \
+       movzx   %h(idx),%tmp;                   \
+       shr     $16,%idx;                       \
+       xor     table+tlen(,%tmp,4),%a2;        \
+       movzx   %l(idx),%tmp;                   \
+       movzx   %h(idx),%idx;                   \
+       xor     table+3*tlen(,%idx,4),%a4;      \
+       mov     %a3,%idx;                       \
+       mov     8 sched,%a3;                    \
+       xor     table+2*tlen(,%tmp,4),%a3;
+
+// initialise output registers from the key schedule
+// NB1: original value of a3 is in idx on exit
+// NB2: original values of a1,a2,a4 aren't used
+#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
+       mov     0 sched,%a1;                    \
+       movzx   %l(idx),%tmp;                   \
+       mov     4 sched,%a2;                    \
+       xor     table(,%tmp,4),%a1;             \
+       mov     12 sched,%a4;                   \
+       movzx   %h(idx),%tmp;                   \
+       shr     $16,%idx;                       \
+       xor     table+tlen(,%tmp,4),%a2;        \
+       movzx   %l(idx),%tmp;                   \
+       movzx   %h(idx),%idx;                   \
+       xor     table+3*tlen(,%idx,4),%a4;      \
+       mov     %a3,%idx;                       \
+       mov     8 sched,%a3;                    \
+       xor     table+2*tlen(,%tmp,4),%a3;
+
+
+// original Gladman had conditional saves to MMX regs.
+#define save(a1, a2)           \
+       mov     %a2,4*a1(%esp)
+
+#define restore(a1, a2)                \
+       mov     4*a2(%esp),%a1
+
+// These macros perform a forward encryption cycle. Used as a pair
+// (fwd_rnd1 then fwd_rnd2) they are entered with the previous round's column
+// values in r0,r1,r4,r5 and exit with the new values in the same registers;
+// between the two the first value lives in r2. The stack is temporary storage.
+
+// round column values
+// on entry: r0,r1,r4,r5
+// on exit:  r2,r1,r4,r5
+#define fwd_rnd1(arg, table)                                           \
+       save   (0,r1);                                                  \
+       save   (1,r5);                                                  \
+                                                                       \
+       /* compute new column values */                                 \
+       do_fcol(table, r2,r5,r4,r1, r0,r3, arg);        /* idx=r0 */    \
+       do_col (table, r4,r1,r2,r5, r0,r3);             /* idx=r4 */    \
+       restore(r0,0);                                                  \
+       do_col (table, r1,r2,r5,r4, r0,r3);             /* idx=r1 */    \
+       restore(r0,1);                                                  \
+       do_col (table, r5,r4,r1,r2, r0,r3);             /* idx=r5 */
+
+// round column values
+// on entry: r2,r1,r4,r5
+// on exit:  r0,r1,r4,r5
+#define fwd_rnd2(arg, table)                                           \
+       save   (0,r1);                                                  \
+       save   (1,r5);                                                  \
+                                                                       \
+       /* compute new column values */                                 \
+       do_fcol(table, r0,r5,r4,r1, r2,r3, arg);        /* idx=r2 */    \
+       do_col (table, r4,r1,r0,r5, r2,r3);             /* idx=r4 */    \
+       restore(r2,0);                                                  \
+       do_col (table, r1,r0,r5,r4, r2,r3);             /* idx=r1 */    \
+       restore(r2,1);                                                  \
+       do_col (table, r5,r4,r1,r0, r2,r3);             /* idx=r5 */
+
+// These macros perform an inverse encryption cycle. Used as a pair
+// (inv_rnd1 then inv_rnd2) they are entered with the previous round's column
+// values in r0,r1,r4,r5 and exit with the new values in the same registers;
+// between the two the first value lives in r2. The stack is temporary storage.
+
+// round column values
+// on entry: r0,r1,r4,r5
+// on exit:  r2,r1,r4,r5
+#define inv_rnd1(arg, table)                                           \
+       save    (0,r1);                                                 \
+       save    (1,r5);                                                 \
+                                                                       \
+       /* compute new column values */                                 \
+       do_icol(table, r2,r1,r4,r5, r0,r3, arg);        /* idx=r0 */    \
+       do_col (table, r4,r5,r2,r1, r0,r3);             /* idx=r4 */    \
+       restore(r0,0);                                                  \
+       do_col (table, r1,r4,r5,r2, r0,r3);             /* idx=r1 */    \
+       restore(r0,1);                                                  \
+       do_col (table, r5,r2,r1,r4, r0,r3);             /* idx=r5 */
+
+// round column values
+// on entry: r2,r1,r4,r5
+// on exit:  r0,r1,r4,r5
+#define inv_rnd2(arg, table)                                           \
+       save    (0,r1);                                                 \
+       save    (1,r5);                                                 \
+                                                                       \
+       /* compute new column values */                                 \
+       do_icol(table, r0,r1,r4,r5, r2,r3, arg);        /* idx=r2 */    \
+       do_col (table, r4,r5,r0,r1, r2,r3);             /* idx=r4 */    \
+       restore(r2,0);                                                  \
+       do_col (table, r1,r4,r5,r0, r2,r3);             /* idx=r1 */    \
+       restore(r2,1);                                                  \
+       do_col (table, r5,r0,r1,r4, r2,r3);             /* idx=r5 */
+
+// AES (Rijndael) Encryption Subroutine
+/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
+
+.global  aes_enc_blk
+
+.extern  ft_tab
+.extern  fl_tab
+
+.align 4
+
+aes_enc_blk:
+       push    %ebp
+       mov     tfm(%esp),%ebp
+
+// CAUTION: the order and the values used in these assigns 
+// rely on the register mappings
+
+1:     push    %ebx
+       mov     in_blk+4(%esp),%r2
+       push    %esi
+       mov     nrnd(%ebp),%r3   // number of rounds
+       push    %edi
+#if ekey != 0
+       lea     ekey(%ebp),%ebp  // key pointer
+#endif
+
+// input four columns and xor in first round key
+
+       mov     (%r2),%r0
+       mov     4(%r2),%r1
+       mov     8(%r2),%r4
+       mov     12(%r2),%r5
+       xor     (%ebp),%r0
+       xor     4(%ebp),%r1
+       xor     8(%ebp),%r4
+       xor     12(%ebp),%r5
+
+       sub     $8,%esp         // space for register saves on stack
+       add     $16,%ebp        // increment to next round key
+       cmp     $12,%r3
+       jb      4f              // 10 rounds for 128-bit key
+       lea     32(%ebp),%ebp
+       je      3f              // 12 rounds for 192-bit key
+       lea     32(%ebp),%ebp
+
+2:     fwd_rnd1( -64(%ebp) ,ft_tab)    // 14 rounds for 256-bit key
+       fwd_rnd2( -48(%ebp) ,ft_tab)
+3:     fwd_rnd1( -32(%ebp) ,ft_tab)    // 12 rounds for 192-bit key
+       fwd_rnd2( -16(%ebp) ,ft_tab)
+4:     fwd_rnd1(    (%ebp) ,ft_tab)    // 10 rounds for 128-bit key
+       fwd_rnd2( +16(%ebp) ,ft_tab)
+       fwd_rnd1( +32(%ebp) ,ft_tab)
+       fwd_rnd2( +48(%ebp) ,ft_tab)
+       fwd_rnd1( +64(%ebp) ,ft_tab)
+       fwd_rnd2( +80(%ebp) ,ft_tab)
+       fwd_rnd1( +96(%ebp) ,ft_tab)
+       fwd_rnd2(+112(%ebp) ,ft_tab)
+       fwd_rnd1(+128(%ebp) ,ft_tab)
+       fwd_rnd2(+144(%ebp) ,fl_tab)    // last round uses a different table
+
+// move final values to the output array.  CAUTION: the 
+// order of these assigns rely on the register mappings
+
+       add     $8,%esp
+       mov     out_blk+12(%esp),%ebp
+       mov     %r5,12(%ebp)
+       pop     %edi
+       mov     %r4,8(%ebp)
+       pop     %esi
+       mov     %r1,4(%ebp)
+       pop     %ebx
+       mov     %r0,(%ebp)
+       pop     %ebp
+       mov     $1,%eax
+       ret
+
+// AES (Rijndael) Decryption Subroutine
+/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
+
+.global  aes_dec_blk
+
+.extern  it_tab
+.extern  il_tab
+
+.align 4
+
+aes_dec_blk:
+       push    %ebp
+       mov     tfm(%esp),%ebp
+
+// CAUTION: the order and the values used in these assigns 
+// rely on the register mappings
+
+1:     push    %ebx
+       mov     in_blk+4(%esp),%r2
+       push    %esi
+       mov     nrnd(%ebp),%r3   // number of rounds
+       push    %edi
+#if dkey != 0
+       lea     dkey(%ebp),%ebp  // key pointer
+#endif
+       mov     %r3,%r0
+       shl     $4,%r0
+       add     %r0,%ebp
+       
+// input four columns and xor in first round key
+
+       mov     (%r2),%r0
+       mov     4(%r2),%r1
+       mov     8(%r2),%r4
+       mov     12(%r2),%r5
+       xor     (%ebp),%r0
+       xor     4(%ebp),%r1
+       xor     8(%ebp),%r4
+       xor     12(%ebp),%r5
+
+       sub     $8,%esp         // space for register saves on stack
+       sub     $16,%ebp        // step back to next round key
+       cmp     $12,%r3
+       jb      4f              // 10 rounds for 128-bit key
+       lea     -32(%ebp),%ebp
+       je      3f              // 12 rounds for 192-bit key
+       lea     -32(%ebp),%ebp
+
+2:     inv_rnd1( +64(%ebp), it_tab)    // 14 rounds for 256-bit key
+       inv_rnd2( +48(%ebp), it_tab)
+3:     inv_rnd1( +32(%ebp), it_tab)    // 12 rounds for 192-bit key
+       inv_rnd2( +16(%ebp), it_tab)
+4:     inv_rnd1(    (%ebp), it_tab)    // 10 rounds for 128-bit key
+       inv_rnd2( -16(%ebp), it_tab)
+       inv_rnd1( -32(%ebp), it_tab)
+       inv_rnd2( -48(%ebp), it_tab)
+       inv_rnd1( -64(%ebp), it_tab)
+       inv_rnd2( -80(%ebp), it_tab)
+       inv_rnd1( -96(%ebp), it_tab)
+       inv_rnd2(-112(%ebp), it_tab)
+       inv_rnd1(-128(%ebp), it_tab)
+       inv_rnd2(-144(%ebp), il_tab)    // last round uses a different table
+
+// move final values to the output array.  CAUTION: the 
+// order of these assigns rely on the register mappings
+
+       add     $8,%esp
+       mov     out_blk+12(%esp),%ebp
+       mov     %r5,12(%ebp)
+       pop     %edi
+       mov     %r4,8(%ebp)
+       pop     %esi
+       mov     %r1,4(%ebp)
+       pop     %ebx
+       mov     %r0,(%ebp)
+       pop     %ebp
+       mov     $1,%eax
+       ret
+