x86_64: move lib

author Thomas Gleixner <tglx@linutronix.de>

Thu, 11 Oct 2007 09:17:08 +0000 (11:17 +0200)

committer Thomas Gleixner <tglx@linutronix.de>

Thu, 11 Oct 2007 09:17:08 +0000 (11:17 +0200)
author Thomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:17:08 +0000 (11:17 +0200)
committer Thomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:17:08 +0000 (11:17 +0200)
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile

index 2d7d724a2a6a9465ae0cadf2628e3490e4328ca7..329da276c6f1839d8bdb75ab4448c0d61a6af2f0 100644 (file)
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -1,5 +1,5 @@
  ifeq ($(CONFIG_X86_32),y)
  include ${srctree}/arch/x86/lib/Makefile_32
  else
-include ${srctree}/arch/x86_64/lib/Makefile_64
+include ${srctree}/arch/x86/lib/Makefile_64
  endif
diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64

new file mode 100644 (file)

index 0000000..bbabad3
--- /dev/null
+++ b/arch/x86/lib/Makefile_64
@@ -0,0 +1,13 @@
+#
+# Makefile for x86_64-specific library files.
+#
+
+CFLAGS_csum-partial_64.o := -funroll-loops
+
+obj-y := io_64.o iomap_copy_64.o
+obj-$(CONFIG_SMP)      += msr-on-cpu.o
+
+lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
+       usercopy_64.o getuser_64.o putuser_64.o  \
+       thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c

new file mode 100644 (file)

index 0000000..95b6d96
--- /dev/null
+++ b/arch/x86/lib/bitops_64.c
@@ -0,0 +1,175 @@
+#include <linux/bitops.h>
+
+#undef find_first_zero_bit
+#undef find_next_zero_bit
+#undef find_first_bit
+#undef find_next_bit
+
+static inline long
+__find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+       long d0, d1, d2;
+       long res;
+
+       /*
+        * We must test the size in words, not in bits, because
+        * otherwise incoming sizes in the range -63..-1 will not run
+        * any scasq instructions, and then the flags used by the je
+        * instruction will have whatever random value was in place
+        * before.  Nobody should call us like that, but
+        * find_next_zero_bit() does when offset and size are at the
+        * same word and it fails to find a zero itself.
+        */
+       size += 63;
+       size >>= 6;
+       if (!size)
+               return 0;
+       asm volatile(
+               "  repe; scasq\n"
+               "  je 1f\n"
+               "  xorq -8(%%rdi),%%rax\n"
+               "  subq $8,%%rdi\n"
+               "  bsfq %%rax,%%rdx\n"
+               "1:  subq %[addr],%%rdi\n"
+               "  shlq $3,%%rdi\n"
+               "  addq %%rdi,%%rdx"
+               :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
+               :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
+                [addr] "S" (addr) : "memory");
+       /*
+        * Any register would do for [addr] above, but GCC tends to
+        * prefer rbx over rsi, even though rsi is readily available
+        * and doesn't have to be saved.
+        */
+       return res;
+}
+
+/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+       return __find_first_zero_bit (addr, size);
+}
+
+/**
+ * find_next_zero_bit - find the next zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_zero_bit (const unsigned long * addr, long size, long offset)
+{
+       const unsigned long * p = addr + (offset >> 6);
+       unsigned long set = 0;
+       unsigned long res, bit = offset&63;
+
+       if (bit) {
+               /*
+                * Look for zero in first word
+                */
+               asm("bsfq %1,%0\n\t"
+                   "cmoveq %2,%0"
+                   : "=r" (set)
+                   : "r" (~(*p >> bit)), "r"(64L));
+               if (set < (64 - bit))
+                       return set + offset;
+               set = 64 - bit;
+               p++;
+       }
+       /*
+        * No zero yet, search remaining full words for a zero
+        */
+       res = __find_first_zero_bit (p, size - 64 * (p - addr));
+
+       return (offset + set + res);
+}
+
+static inline long
+__find_first_bit(const unsigned long * addr, unsigned long size)
+{
+       long d0, d1;
+       long res;
+
+       /*
+        * We must test the size in words, not in bits, because
+        * otherwise incoming sizes in the range -63..-1 will not run
+        * any scasq instructions, and then the flags used by the jz
+        * instruction will have whatever random value was in place
+        * before.  Nobody should call us like that, but
+        * find_next_bit() does when offset and size are at the same
+        * word and it fails to find a one itself.
+        */
+       size += 63;
+       size >>= 6;
+       if (!size)
+               return 0;
+       asm volatile(
+               "   repe; scasq\n"
+               "   jz 1f\n"
+               "   subq $8,%%rdi\n"
+               "   bsfq (%%rdi),%%rax\n"
+               "1: subq %[addr],%%rdi\n"
+               "   shlq $3,%%rdi\n"
+               "   addq %%rdi,%%rax"
+               :"=a" (res), "=&c" (d0), "=&D" (d1)
+               :"0" (0ULL), "1" (size), "2" (addr),
+                [addr] "r" (addr) : "memory");
+       return res;
+}
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_bit(const unsigned long * addr, unsigned long size)
+{
+       return __find_first_bit(addr,size);
+}
+
+/**
+ * find_next_bit - find the next set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_bit(const unsigned long * addr, long size, long offset)
+{
+       const unsigned long * p = addr + (offset >> 6);
+       unsigned long set = 0, bit = offset & 63, res;
+
+       if (bit) {
+               /*
+                * Look for nonzero in the first 64 bits:
+                */
+               asm("bsfq %1,%0\n\t"
+                   "cmoveq %2,%0\n\t"
+                   : "=r" (set)
+                   : "r" (*p >> bit), "r" (64L));
+               if (set < (64 - bit))
+                       return set + offset;
+               set = 64 - bit;
+               p++;
+       }
+       /*
+        * No set bit yet, search remaining full words for a bit
+        */
+       res = __find_first_bit (p, size - 64 * (p - addr));
+       return (offset + set + res);
+}
+
+#include <linux/module.h>
+
+EXPORT_SYMBOL(find_next_bit);
+EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c

new file mode 100644 (file)

index 0000000..2467660
--- /dev/null
+++ b/arch/x86/lib/bitstr_64.c
@@ -0,0 +1,28 @@
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+/* Find a run of len zero bits in bitmap below nbits, searching from start; returns its first bit or -1 */
+unsigned long
+find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
+{
+       unsigned long n, end, i;
+
+ again:
+       n = find_next_zero_bit(bitmap, nbits, start);
+       if (n >= nbits)         /* no zero bit left below nbits */
+               return -1;
+
+       /* could test bitsliced, but it's hardly worth it */
+       end = n+len;            /* one past the last bit of the candidate run */
+       if (end > nbits)        /* a run ending exactly at nbits still fits */
+               return -1;
+       for (i = n+1; i < end; i++) {
+               if (test_bit(i, bitmap)) {
+                       start = i+1;    /* restart the search after the set bit */
+                       goto again;
+               }
+       }
+       return n;
+}
+
+EXPORT_SYMBOL(find_next_zero_string);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S

new file mode 100644 (file)

index 0000000..9a10a78
--- /dev/null
+++ b/arch/x86/lib/clear_page_64.S
@@ -0,0 +1,59 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Zero a page.        
+ * rdi page
+ */                    
+       ALIGN
+clear_page_c:                  /* string-store variant; rdi = page to zero */
+       CFI_STARTPROC
+       movl $4096/8,%ecx       /* 512 quadwords = one 4096 byte page */
+       xorl %eax,%eax          /* value to store: zero */
+       rep stosq
+       ret
+       CFI_ENDPROC
+ENDPROC(clear_page_c)
+
+ENTRY(clear_page)
+       CFI_STARTPROC
+       xorl   %eax,%eax
+       movl   $4096/64,%ecx
+       .p2align 4
+.Lloop:
+       decl    %ecx
+#define PUT(x) movq %rax,x*8(%rdi)
+       movq %rax,(%rdi)
+       PUT(1)
+       PUT(2)
+       PUT(3)
+       PUT(4)
+       PUT(5)
+       PUT(6)
+       PUT(7)
+       leaq    64(%rdi),%rdi
+       jnz     .Lloop
+       nop
+       ret
+       CFI_ENDPROC
+.Lclear_page_end:
+ENDPROC(clear_page)
+
+       /* Some CPUs run faster using the string instructions.
+          It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+       .section .altinstr_replacement,"ax"
+1:     .byte 0xeb                                      /* jmp <disp8> */
+       .byte (clear_page_c - clear_page) - (2f - 1b)   /* offset */
+2:
+       .previous
+       .section .altinstructions,"a"
+       .align 8
+       .quad clear_page
+       .quad 1b
+       .byte X86_FEATURE_REP_GOOD
+       .byte .Lclear_page_end - clear_page
+       .byte 2b - 1b
+       .previous
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S

new file mode 100644 (file)

index 0000000..727a5d4
--- /dev/null
+++ b/arch/x86/lib/copy_page_64.S
@@ -0,0 +1,119 @@
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+       ALIGN
+copy_page_c:
+       CFI_STARTPROC
+       movl $4096/8,%ecx
+       rep movsq
+       ret
+       CFI_ENDPROC
+ENDPROC(copy_page_c)
+
+/* Don't use streaming store because it's better when the target
+   ends up in cache. */
+           
+/* Could vary the prefetch distance based on SMP/UP */
+
+ENTRY(copy_page)
+       CFI_STARTPROC
+       subq    $3*8,%rsp
+       CFI_ADJUST_CFA_OFFSET 3*8
+       movq    %rbx,(%rsp)
+       CFI_REL_OFFSET rbx, 0
+       movq    %r12,1*8(%rsp)
+       CFI_REL_OFFSET r12, 1*8
+       movq    %r13,2*8(%rsp)
+       CFI_REL_OFFSET r13, 2*8
+
+       movl    $(4096/64)-5,%ecx
+       .p2align 4
+.Loop64:
+       dec     %rcx
+
+       movq        (%rsi), %rax
+       movq      8 (%rsi), %rbx
+       movq     16 (%rsi), %rdx
+       movq     24 (%rsi), %r8
+       movq     32 (%rsi), %r9
+       movq     40 (%rsi), %r10
+       movq     48 (%rsi), %r11
+       movq     56 (%rsi), %r12
+
+       prefetcht0 5*64(%rsi)
+
+       movq     %rax,    (%rdi)
+       movq     %rbx,  8 (%rdi)
+       movq     %rdx, 16 (%rdi)
+       movq     %r8,  24 (%rdi)
+       movq     %r9,  32 (%rdi)
+       movq     %r10, 40 (%rdi)
+       movq     %r11, 48 (%rdi)
+       movq     %r12, 56 (%rdi)
+
+       leaq    64 (%rsi), %rsi
+       leaq    64 (%rdi), %rdi
+
+       jnz     .Loop64
+
+       movl    $5,%ecx
+       .p2align 4
+.Loop2:
+       decl   %ecx
+
+       movq        (%rsi), %rax
+       movq      8 (%rsi), %rbx
+       movq     16 (%rsi), %rdx
+       movq     24 (%rsi), %r8
+       movq     32 (%rsi), %r9
+       movq     40 (%rsi), %r10
+       movq     48 (%rsi), %r11
+       movq     56 (%rsi), %r12
+
+       movq     %rax,    (%rdi)
+       movq     %rbx,  8 (%rdi)
+       movq     %rdx, 16 (%rdi)
+       movq     %r8,  24 (%rdi)
+       movq     %r9,  32 (%rdi)
+       movq     %r10, 40 (%rdi)
+       movq     %r11, 48 (%rdi)
+       movq     %r12, 56 (%rdi)
+
+       leaq    64(%rdi),%rdi
+       leaq    64(%rsi),%rsi
+
+       jnz     .Loop2
+
+       movq    (%rsp),%rbx
+       CFI_RESTORE rbx
+       movq    1*8(%rsp),%r12
+       CFI_RESTORE r12
+       movq    2*8(%rsp),%r13
+       CFI_RESTORE r13
+       addq    $3*8,%rsp
+       CFI_ADJUST_CFA_OFFSET -3*8
+       ret
+.Lcopy_page_end:
+       CFI_ENDPROC
+ENDPROC(copy_page)
+
+       /* Some CPUs run faster using the string copy instructions.
+          It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+       .section .altinstr_replacement,"ax"
+1:     .byte 0xeb                                      /* jmp <disp8> */
+       .byte (copy_page_c - copy_page) - (2f - 1b)     /* offset */
+2:
+       .previous
+       .section .altinstructions,"a"
+       .align 8
+       .quad copy_page
+       .quad 1b
+       .byte X86_FEATURE_REP_GOOD
+       .byte .Lcopy_page_end - copy_page
+       .byte 2b - 1b
+       .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S

new file mode 100644 (file)

index 0000000..70bebd3
--- /dev/null
+++ b/arch/x86/lib/copy_user_64.S
@@ -0,0 +1,354 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ * 
+ * Functions to copy from and to user space.           
+ */             
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+       .macro ALTERNATIVE_JUMP feature,orig,alt
+0:
+       .byte 0xe9      /* 32bit jump */
+       .long \orig-1f  /* by default jump to orig */
+1:
+       .section .altinstr_replacement,"ax"
+2:     .byte 0xe9                   /* near jump with 32bit immediate */
+       .long \alt-1b /* offset */   /* or alternatively to alt */
+       .previous
+       .section .altinstructions,"a"
+       .align 8
+       .quad  0b
+       .quad  2b
+       .byte  \feature              /* when feature is set */
+       .byte  5
+       .byte  5
+       .previous
+       .endm
+
+/* Standard copy_to_user with segment limit checking */                
+ENTRY(copy_to_user)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%rax)
+       movq %rdi,%rcx
+       addq %rdx,%rcx          /* rcx = destination + count; carry means wrap-around */
+       jc  bad_to_user
+       cmpq threadinfo_addr_limit(%rax),%rcx
+       jae bad_to_user
+       xorl %ecx,%ecx  /* clear zero flag (ecx): do not zero the destination on a fault */
+       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+       CFI_ENDPROC
+
+ENTRY(copy_user_generic)
+       CFI_STARTPROC
+       movl $1,%ecx    /* set zero flag */
+       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+       CFI_ENDPROC
+
+ENTRY(__copy_from_user_inatomic)
+       CFI_STARTPROC
+       xorl %ecx,%ecx  /* clear zero flag */
+       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+       CFI_ENDPROC
+
+/* Standard copy_from_user with segment limit checking */      
+ENTRY(copy_from_user)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%rax)
+       movq %rsi,%rcx
+       addq %rdx,%rcx
+       jc  bad_from_user
+       cmpq threadinfo_addr_limit(%rax),%rcx
+       jae  bad_from_user
+       movl $1,%ecx    /* set zero flag */
+       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+       CFI_ENDPROC
+ENDPROC(copy_from_user)
+       
+       .section .fixup,"ax"
+       /* must zero dest */
+bad_from_user:
+       CFI_STARTPROC
+       movl %edx,%ecx
+       xorl %eax,%eax
+       rep
+       stosb
+bad_to_user:
+       movl    %edx,%eax
+       ret
+       CFI_ENDPROC
+END(bad_from_user)
+       .previous
+       
+               
+/*
+ * copy_user_generic_unrolled - memory copy with exception handling.
+ * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
+ *     
+ * Input:      
+ * rdi destination
+ * rsi source
+ * rdx count
+ * ecx zero flag -- if true zero destination on error
+ *
+ * Output:             
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_generic_unrolled)
+       CFI_STARTPROC
+       pushq %rbx
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rbx, 0
+       pushq %rcx
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rcx, 0
+       xorl %eax,%eax          /*zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+       /* check for bad alignment of destination */
+       movl %edi,%ecx
+       andl $7,%ecx
+       jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+       movq %rdx,%rcx
+
+       movl $64,%ebx
+       shrq $6,%rdx
+       decq %rdx
+       js   .Lhandle_tail
+
+       .p2align 4
+.Lloop:
+.Ls1:  movq (%rsi),%r11
+.Ls2:  movq 1*8(%rsi),%r8
+.Ls3:  movq 2*8(%rsi),%r9
+.Ls4:  movq 3*8(%rsi),%r10
+.Ld1:  movq %r11,(%rdi)
+.Ld2:  movq %r8,1*8(%rdi)
+.Ld3:  movq %r9,2*8(%rdi)
+.Ld4:  movq %r10,3*8(%rdi)
+
+.Ls5:  movq 4*8(%rsi),%r11
+.Ls6:  movq 5*8(%rsi),%r8
+.Ls7:  movq 6*8(%rsi),%r9
+.Ls8:  movq 7*8(%rsi),%r10
+.Ld5:  movq %r11,4*8(%rdi)
+.Ld6:  movq %r8,5*8(%rdi)
+.Ld7:  movq %r9,6*8(%rdi)
+.Ld8:  movq %r10,7*8(%rdi)
+
+       decq %rdx
+
+       leaq 64(%rsi),%rsi
+       leaq 64(%rdi),%rdi
+
+       jns  .Lloop
+
+       .p2align 4
+.Lhandle_tail:
+       movl %ecx,%edx
+       andl $63,%ecx
+       shrl $3,%ecx
+       jz   .Lhandle_7
+       movl $8,%ebx
+       .p2align 4
+.Lloop_8:
+.Ls9:  movq (%rsi),%r8
+.Ld9:  movq %r8,(%rdi)
+       decl %ecx
+       leaq 8(%rdi),%rdi
+       leaq 8(%rsi),%rsi
+       jnz .Lloop_8
+
+.Lhandle_7:
+       movl %edx,%ecx
+       andl $7,%ecx
+       jz   .Lende
+       .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+       incq %rdi
+       incq %rsi
+       decl %ecx
+       jnz .Lloop_1
+
+       CFI_REMEMBER_STATE
+.Lende:
+       popq %rcx
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE rcx
+       popq %rbx
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE rbx
+       ret
+       CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+       /* align destination */
+       .p2align 4
+.Lbad_alignment:
+       movl $8,%r9d
+       subl %ecx,%r9d
+       movl %r9d,%ecx
+       cmpq %r9,%rdx
+       jz   .Lhandle_7
+       js   .Lhandle_7
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+       incq %rsi
+       incq %rdi
+       decl %ecx
+       jnz .Lalign_1
+       subq %r9,%rdx
+       jmp .Lafter_bad_alignment
+#endif
+
+       /* table sorted by exception address */
+       .section __ex_table,"a"
+       .align 8
+       .quad .Ls1,.Ls1e
+       .quad .Ls2,.Ls2e
+       .quad .Ls3,.Ls3e
+       .quad .Ls4,.Ls4e
+       .quad .Ld1,.Ls1e
+       .quad .Ld2,.Ls2e
+       .quad .Ld3,.Ls3e
+       .quad .Ld4,.Ls4e
+       .quad .Ls5,.Ls5e
+       .quad .Ls6,.Ls6e
+       .quad .Ls7,.Ls7e
+       .quad .Ls8,.Ls8e
+       .quad .Ld5,.Ls5e
+       .quad .Ld6,.Ls6e
+       .quad .Ld7,.Ls7e
+       .quad .Ld8,.Ls8e
+       .quad .Ls9,.Le_quad
+       .quad .Ld9,.Le_quad
+       .quad .Ls10,.Le_byte
+       .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+       .quad .Ls11,.Lzero_rest
+       .quad .Ld11,.Lzero_rest
+#endif
+       .quad .Le5,.Le_zero
+       .previous
+
+       /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+          pessimistic side. this is gross. it would be better to fix the
+          interface. */
+       /* eax: zero, ebx: 64 */
+.Ls1e:         addl $8,%eax
+.Ls2e:         addl $8,%eax
+.Ls3e:         addl $8,%eax
+.Ls4e:         addl $8,%eax
+.Ls5e:         addl $8,%eax
+.Ls6e:         addl $8,%eax
+.Ls7e:         addl $8,%eax
+.Ls8e:         addl $8,%eax
+       addq %rbx,%rdi  /* +64 */
+       subq %rax,%rdi  /* correct destination with computed offset */
+
+       shlq $6,%rdx    /* loop counter * 64 (stride length) */
+       addq %rax,%rdx  /* add offset to loopcnt */
+       andl $63,%ecx   /* remaining bytes */
+       addq %rcx,%rdx  /* add them */
+       jmp .Lzero_rest
+
+       /* exception on quad word loop in tail handling */
+       /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+       shll $3,%ecx
+       andl $7,%edx
+       addl %ecx,%edx
+       /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+       cmpl $0,(%rsp)
+       jz   .Le_zero
+       movq %rdx,%rcx
+.Le_byte:
+       xorl %eax,%eax
+.Le5:  rep
+       stosb
+       /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+       movq %rdx,%rax
+       jmp .Lende
+       CFI_ENDPROC
+ENDPROC(copy_user_generic)
+
+
+       /* Some CPUs run faster using the string copy instructions.
+          This is also a lot simpler. Use them when possible.
+          Patch in jmps to this code instead of copying it fully
+          to avoid unwanted aliasing in the exception tables. */
+
+ /* rdi        destination
+  * rsi source
+  * rdx count
+  * ecx zero flag
+  *
+  * Output:
+  * eax uncopied bytes or 0 if successful.
+  *
+  * Only 4GB of copy is supported. This shouldn't be a problem
+  * because the kernel normally only writes from/to page sized chunks
+  * even if user space passed a longer buffer.
+  * And more would be dangerous because both Intel and AMD have
+  * errata with rep movsq > 4GB. If someone feels the need to fix
+  * this please consider this.
+  */
+ENTRY(copy_user_generic_string)
+       CFI_STARTPROC
+       movl %ecx,%r8d          /* save zero flag */
+       movl %edx,%ecx          /* only the low 32 bits of the count are used */
+       shrl $3,%ecx            /* ecx = number of whole quadwords */
+       andl $7,%edx            /* edx = trailing bytes after the quadwords */
+       jz   10f
+1:     rep
+       movsq
+       movl %edx,%ecx
+2:     rep
+       movsb
+9:     movl %ecx,%eax
+       ret
+
+       /* multiple of 8 byte */
+10:    rep
+       movsq
+       xor %eax,%eax
+       ret
+
+       /* exception handling */
+3:      lea (%rdx,%rcx,8),%rax /* exception on quad loop */
+       jmp 6f
+5:     movl %ecx,%eax          /* exception on byte loop */
+       /* eax: left over bytes */
+6:     testl %r8d,%r8d         /* zero flag set? */
+       jz 7f
+       movl %eax,%ecx          /* initialize x86 loop counter */
+       push %rax               /* keep the left over count across the zeroing */
+       xorl %eax,%eax
+8:     rep
+       stosb                   /* zero the rest */
+11:    pop %rax
+7:     ret
+       CFI_ENDPROC
+END(copy_user_generic_string)
+
+       .section __ex_table,"a"
+       .quad 1b,3b
+       .quad 2b,5b
+       .quad 8b,11b
+       .quad 10b,3b
+       .previous
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S

new file mode 100644 (file)

index 0000000..4620efb
--- /dev/null
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag       when 1 zero on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+       CFI_STARTPROC
+       pushq %rbx
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rbx, 0
+       pushq %rcx              /* save zero flag */
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rcx, 0
+
+       xorl %eax,%eax          /* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+       /* check for bad alignment of destination */
+       movl %edi,%ecx
+       andl $7,%ecx
+       jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+       movq %rdx,%rcx
+
+       movl $64,%ebx
+       shrq $6,%rdx
+       decq %rdx
+       js   .Lhandle_tail
+
+       .p2align 4
+.Lloop:
+.Ls1:  movq (%rsi),%r11
+.Ls2:  movq 1*8(%rsi),%r8
+.Ls3:  movq 2*8(%rsi),%r9
+.Ls4:  movq 3*8(%rsi),%r10
+.Ld1:  movnti %r11,(%rdi)
+.Ld2:  movnti %r8,1*8(%rdi)
+.Ld3:  movnti %r9,2*8(%rdi)
+.Ld4:  movnti %r10,3*8(%rdi)
+
+.Ls5:  movq 4*8(%rsi),%r11
+.Ls6:  movq 5*8(%rsi),%r8
+.Ls7:  movq 6*8(%rsi),%r9
+.Ls8:  movq 7*8(%rsi),%r10
+.Ld5:  movnti %r11,4*8(%rdi)
+.Ld6:  movnti %r8,5*8(%rdi)
+.Ld7:  movnti %r9,6*8(%rdi)
+.Ld8:  movnti %r10,7*8(%rdi)
+
+       dec  %rdx
+
+       leaq 64(%rsi),%rsi
+       leaq 64(%rdi),%rdi
+
+       jns  .Lloop
+
+       .p2align 4
+.Lhandle_tail:
+       movl %ecx,%edx
+       andl $63,%ecx
+       shrl $3,%ecx
+       jz   .Lhandle_7
+       movl $8,%ebx
+       .p2align 4
+.Lloop_8:
+.Ls9:  movq (%rsi),%r8
+.Ld9:  movnti %r8,(%rdi)
+       decl %ecx
+       leaq 8(%rdi),%rdi
+       leaq 8(%rsi),%rsi
+       jnz .Lloop_8
+
+.Lhandle_7:
+       movl %edx,%ecx
+       andl $7,%ecx
+       jz   .Lende
+       .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+       incq %rdi
+       incq %rsi
+       decl %ecx
+       jnz .Lloop_1
+
+       CFI_REMEMBER_STATE
+.Lende:
+       popq %rcx
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE %rcx
+       popq %rbx
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE rbx
+       ret
+       CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+       /* align destination */
+       .p2align 4
+.Lbad_alignment:
+       movl $8,%r9d
+       subl %ecx,%r9d
+       movl %r9d,%ecx
+       cmpq %r9,%rdx
+       jz   .Lhandle_7
+       js   .Lhandle_7
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+       incq %rsi
+       incq %rdi
+       decl %ecx
+       jnz .Lalign_1
+       subq %r9,%rdx
+       jmp .Lafter_bad_alignment
+#endif
+
+       /* table sorted by exception address */
+       .section __ex_table,"a"
+       .align 8
+       .quad .Ls1,.Ls1e
+       .quad .Ls2,.Ls2e
+       .quad .Ls3,.Ls3e
+       .quad .Ls4,.Ls4e
+       .quad .Ld1,.Ls1e
+       .quad .Ld2,.Ls2e
+       .quad .Ld3,.Ls3e
+       .quad .Ld4,.Ls4e
+       .quad .Ls5,.Ls5e
+       .quad .Ls6,.Ls6e
+       .quad .Ls7,.Ls7e
+       .quad .Ls8,.Ls8e
+       .quad .Ld5,.Ls5e
+       .quad .Ld6,.Ls6e
+       .quad .Ld7,.Ls7e
+       .quad .Ld8,.Ls8e
+       .quad .Ls9,.Le_quad
+       .quad .Ld9,.Le_quad
+       .quad .Ls10,.Le_byte
+       .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+       .quad .Ls11,.Lzero_rest
+       .quad .Ld11,.Lzero_rest
+#endif
+       .quad .Le5,.Le_zero
+       .previous
+
+       /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+          pessimistic side. this is gross. it would be better to fix the
+          interface. */
+       /* eax: zero, ebx: 64 */
+.Ls1e:         addl $8,%eax
+.Ls2e:         addl $8,%eax
+.Ls3e:         addl $8,%eax
+.Ls4e:         addl $8,%eax
+.Ls5e:         addl $8,%eax
+.Ls6e:         addl $8,%eax
+.Ls7e:         addl $8,%eax
+.Ls8e:         addl $8,%eax
+       addq %rbx,%rdi  /* +64 */
+       subq %rax,%rdi  /* correct destination with computed offset */
+
+       shlq $6,%rdx    /* loop counter * 64 (stride length) */
+       addq %rax,%rdx  /* add offset to loopcnt */
+       andl $63,%ecx   /* remaining bytes */
+       addq %rcx,%rdx  /* add them */
+       jmp .Lzero_rest
+
+       /* exception on quad word loop in tail handling */
+       /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+       shll $3,%ecx
+       andl $7,%edx
+       addl %ecx,%edx
+       /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+       cmpl $0,(%rsp)  /* zero flag set? */
+       jz   .Le_zero
+       movq %rdx,%rcx
+.Le_byte:
+       xorl %eax,%eax
+.Le5:  rep
+       stosb
+       /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+       movq %rdx,%rax
+       jmp .Lende
+       CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S

new file mode 100644 (file)

index 0000000..f0dba36
--- /dev/null
+++ b/arch/x86/lib/csum-copy_64.S
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ *     
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details. No warranty for anything given at all.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/errno.h>
+
+/*
+ * Checksum copy with exception handling.
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 
+ * destination is zeroed.
+ * 
+ * Input
+ * rdi  source
+ * rsi  destination
+ * edx  len (32bit)
+ * ecx  sum (32bit) 
+ * r8   src_err_ptr (int)
+ * r9   dst_err_ptr (int)
+ *
+ * Output
+ * eax  32bit sum (folded from the 64bit accumulator). undefined in case of exception.
+ * 
+ * Wrappers need to take care of valid exception sum and zeroing.               
+ * They also should align source or destination to 8 bytes.
+ */
+
+       .macro source
+10:
+       .section __ex_table,"a"
+       .align 8
+       .quad 10b,.Lbad_source
+       .previous
+       .endm
+               
+       .macro dest
+20:
+       .section __ex_table,"a"
+       .align 8
+       .quad 20b,.Lbad_dest
+       .previous
+       .endm
+                       
+       .macro ignore L=.Lignore
+30:
+       .section __ex_table,"a"
+       .align 8
+       .quad 30b,\L
+       .previous
+       .endm
+       
+                               
+ENTRY(csum_partial_copy_generic)
+       CFI_STARTPROC
+       cmpl     $3*64,%edx
+       jle      .Lignore
+
+.Lignore:              
+       subq  $7*8,%rsp
+       CFI_ADJUST_CFA_OFFSET 7*8
+       movq  %rbx,2*8(%rsp)
+       CFI_REL_OFFSET rbx, 2*8
+       movq  %r12,3*8(%rsp)
+       CFI_REL_OFFSET r12, 3*8
+       movq  %r14,4*8(%rsp)
+       CFI_REL_OFFSET r14, 4*8
+       movq  %r13,5*8(%rsp)
+       CFI_REL_OFFSET r13, 5*8
+       movq  %rbp,6*8(%rsp)
+       CFI_REL_OFFSET rbp, 6*8
+
+       movq  %r8,(%rsp)
+       movq  %r9,1*8(%rsp)
+       
+       movl  %ecx,%eax
+       movl  %edx,%ecx
+
+       xorl  %r9d,%r9d
+       movq  %rcx,%r12
+
+       shrq  $6,%r12
+       jz    .Lhandle_tail       /* < 64 */
+
+       clc
+       
+       /* main loop. copy and checksum in 64 byte blocks */
+       /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
+       /* r11: temp3, rdx: temp4, r12 loopcnt */
+       /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
+       .p2align 4
+.Lloop:
+       source
+       movq  (%rdi),%rbx
+       source
+       movq  8(%rdi),%r8
+       source
+       movq  16(%rdi),%r11
+       source
+       movq  24(%rdi),%rdx
+
+       source
+       movq  32(%rdi),%r10
+       source
+       movq  40(%rdi),%rbp
+       source
+       movq  48(%rdi),%r14
+       source
+       movq  56(%rdi),%r13
+               
+       ignore 2f
+       prefetcht0 5*64(%rdi)
+2:                                                     
+       adcq  %rbx,%rax
+       adcq  %r8,%rax
+       adcq  %r11,%rax
+       adcq  %rdx,%rax
+       adcq  %r10,%rax
+       adcq  %rbp,%rax
+       adcq  %r14,%rax
+       adcq  %r13,%rax
+
+       decl %r12d
+       
+       dest
+       movq %rbx,(%rsi)
+       dest
+       movq %r8,8(%rsi)
+       dest
+       movq %r11,16(%rsi)
+       dest
+       movq %rdx,24(%rsi)
+
+       dest
+       movq %r10,32(%rsi)
+       dest
+       movq %rbp,40(%rsi)
+       dest
+       movq %r14,48(%rsi)
+       dest
+       movq %r13,56(%rsi)
+       
+3:
+       
+       leaq 64(%rdi),%rdi
+       leaq 64(%rsi),%rsi
+
+       jnz   .Lloop
+
+       adcq  %r9,%rax
+
+       /* do last up to 56 bytes */
+.Lhandle_tail:
+       /* ecx: count */
+       movl %ecx,%r10d
+       andl $63,%ecx
+       shrl $3,%ecx
+       jz       .Lfold
+       clc
+       .p2align 4
+.Lloop_8:      
+       source
+       movq (%rdi),%rbx
+       adcq %rbx,%rax
+       decl %ecx
+       dest
+       movq %rbx,(%rsi)
+       leaq 8(%rsi),%rsi /* preserve carry */
+       leaq 8(%rdi),%rdi
+       jnz     .Lloop_8
+       adcq %r9,%rax   /* add in carry */
+
+.Lfold:
+       /* reduce checksum to 32bits */
+       movl %eax,%ebx
+       shrq $32,%rax
+       addl %ebx,%eax
+       adcl %r9d,%eax
+
+       /* do last up to 6 bytes */
+.Lhandle_7:
+       movl %r10d,%ecx
+       andl $7,%ecx
+       shrl $1,%ecx
+       jz   .Lhandle_1
+       movl $2,%edx
+       xorl %ebx,%ebx
+       clc  
+       .p2align 4
+.Lloop_1:      
+       source
+       movw (%rdi),%bx
+       adcl %ebx,%eax
+       decl %ecx
+       dest
+       movw %bx,(%rsi)
+       leaq 2(%rdi),%rdi
+       leaq 2(%rsi),%rsi
+       jnz .Lloop_1
+       adcl %r9d,%eax  /* add in carry */
+       
+       /* handle last odd byte */
+.Lhandle_1:
+       testl $1,%r10d
+       jz    .Lende
+       xorl  %ebx,%ebx
+       source
+       movb (%rdi),%bl
+       dest
+       movb %bl,(%rsi)
+       addl %ebx,%eax
+       adcl %r9d,%eax          /* carry */
+                       
+       CFI_REMEMBER_STATE
+.Lende:
+       movq 2*8(%rsp),%rbx
+       CFI_RESTORE rbx
+       movq 3*8(%rsp),%r12
+       CFI_RESTORE r12
+       movq 4*8(%rsp),%r14
+       CFI_RESTORE r14
+       movq 5*8(%rsp),%r13
+       CFI_RESTORE r13
+       movq 6*8(%rsp),%rbp
+       CFI_RESTORE rbp
+       addq $7*8,%rsp
+       CFI_ADJUST_CFA_OFFSET -7*8
+       ret
+       CFI_RESTORE_STATE
+
+       /* Exception handlers. Very simple, zeroing is done in the wrappers */
+.Lbad_source:
+       movq (%rsp),%rax
+       testq %rax,%rax
+       jz   .Lende
+       movl $-EFAULT,(%rax)
+       jmp  .Lende
+       
+.Lbad_dest:
+       movq 8(%rsp),%rax
+       testq %rax,%rax
+       jz   .Lende     
+       movl $-EFAULT,(%rax)
+       jmp .Lende
+       CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c

new file mode 100644 (file)

index 0000000..bc503f5
--- /dev/null
+++ b/arch/x86/lib/csum-partial_64.c
@@ -0,0 +1,150 @@
+/*
+ * arch/x86/lib/csum-partial_64.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed.
+ */
+ 
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <asm/checksum.h>
+
+static inline unsigned short from32to16(unsigned a) /* fold a 32-bit sum into 16 bits */
+{
+       unsigned short b = a >> 16; /* start from the upper 16 bits of a */
+       asm("addw %w2,%w0\n\t" /* b += low 16 bits of a (16-bit add) */
+           "adcw $0,%w0\n" /* add the carry of that 16-bit add back into b */
+           : "=r" (b)
+           : "0" (b), "r" (a));
+       return b;
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 32bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+ * 
+ * Things tried and found to not make it faster:
+ * Manual Prefetching
+ * Unrolling to an 128 bytes inner loop.
+ * Using interleaving with more registers to break the carry chains.
+ */
+static unsigned do_csum(const unsigned char *buff, unsigned len)
+{
+       unsigned odd, count;
+       unsigned long result = 0;
+
+       if (unlikely(len == 0))
+               return result; /* empty buffer: the sum is 0 */
+       odd = 1 & (unsigned long) buff; /* 1 when buff starts on an odd address */
+       if (unlikely(odd)) {
+               result = *buff << 8; /* leading byte goes in bits 8..15; bytes are swapped back at the end */
+               len--;
+               buff++;
+       }
+       count = len >> 1;               /* nr of 16-bit words.. */
+       if (count) {
+               if (2 & (unsigned long) buff) { /* take one 16-bit word so buff becomes 4-byte aligned */
+                       result += *(unsigned short *)buff;
+                       count--;
+                       len -= 2;
+                       buff += 2;
+               }
+               count >>= 1;            /* nr of 32-bit words.. */
+               if (count) {
+                       unsigned long zero;
+                       unsigned count64;
+                       if (4 & (unsigned long) buff) { /* take one 32-bit word so buff becomes 8-byte aligned */
+                               result += *(unsigned int *) buff;
+                               count--;
+                               len -= 4;
+                               buff += 4;
+                       }
+                       count >>= 1;    /* nr of 64-bit words.. */
+
+                       /* main loop using 64-byte blocks */
+                       zero = 0;
+                       count64 = count >> 3; /* eight quadwords per 64-byte block */
+                       while (count64) { 
+                               asm("addq 0*8(%[src]),%[res]\n\t" /* first quadword starts the carry chain */
+                                   "adcq 1*8(%[src]),%[res]\n\t"
+                                   "adcq 2*8(%[src]),%[res]\n\t"
+                                   "adcq 3*8(%[src]),%[res]\n\t"
+                                   "adcq 4*8(%[src]),%[res]\n\t"
+                                   "adcq 5*8(%[src]),%[res]\n\t"
+                                   "adcq 6*8(%[src]),%[res]\n\t"
+                                   "adcq 7*8(%[src]),%[res]\n\t"
+                                   "adcq %[zero],%[res]" /* zero register: adds only the last carry */
+                                   : [res] "=r" (result)
+                                   : [src] "r" (buff), [zero] "r" (zero),
+                                   "[res]" (result));
+                               buff += 64;
+                               count64--;
+                       }
+
+                       /* last up to 7 8-byte blocks */
+                       count %= 8; 
+                       while (count) { 
+                               asm("addq %1,%0\n\t" /* result += next quadword */
+                                   "adcq %2,%0\n" /* plus the carry out of that add */
+                                           : "=r" (result)
+                                   : "m" (*(unsigned long *)buff), 
+                                   "r" (zero),  "0" (result));
+                               --count; 
+                                       buff += 8;
+                       }
+                       result = add32_with_carry(result>>32,
+                                                 result&0xffffffff); /* combine the two 32-bit halves; helper is defined outside this file */
+
+                       if (len & 4) { /* trailing 32-bit word */
+                               result += *(unsigned int *) buff;
+                               buff += 4;
+                       }
+               }
+               if (len & 2) { /* trailing 16-bit word */
+                       result += *(unsigned short *) buff;
+                       buff += 2;
+               }
+       }
+       if (len & 1) /* trailing single byte */
+               result += *buff;
+       result = add32_with_carry(result>>32, result & 0xffffffff); /* final combine of the 64-bit accumulator halves */
+       if (unlikely(odd)) { /* odd start: fold to 16 bits, then swap the two bytes */
+               result = from32to16(result);
+               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+       }
+       return result;
+}
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 64-bit boundary
+ */
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+       unsigned block_sum = do_csum(buff, len);        /* checksum of this buffer alone */
+       return (__force __wsum)add32_with_carry(block_sum, (__force u32)sum);
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+__sum16 ip_compute_csum(const void *buff, int len)
+{
+       return csum_fold(csum_partial(buff,len,0)); /* csum_partial() with initial sum 0, result passed through csum_fold() */
+}
+EXPORT_SYMBOL(ip_compute_csum);
+
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c

new file mode 100644 (file)

index 0000000..fd42a4a
--- /dev/null
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -0,0 +1,135 @@
+/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v.2
+ * 
+ * Wrappers of assembly checksum functions for x86-64.
+ */
+
+#include <asm/checksum.h>
+#include <linux/module.h>
+
+/**
+ * csum_partial_copy_from_user - Copy and checksum from user space.
+ * @src: source address (user space)
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: cleared on entry; set to -EFAULT for a bad source address.
+ *
+ * Returns a 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */
+__wsum
+csum_partial_copy_from_user(const void __user *src, void *dst,
+                           int len, __wsum isum, int *errp)
+{ 
+       might_sleep();
+       *errp = 0;
+       if (likely(access_ok(VERIFY_READ,src, len))) { 
+               /* Why test 6 and not 7? Fixing up an odd (byte-aligned)
+                  address would need considerable complications to
+                  correct the checksum, which is defined as a 16bit
+                  accumulator. The alignment fix-up is primarily for
+                  performance compatibility with 32bit, which handles
+                  odd addresses slowly too. */
+               if (unlikely((unsigned long)src & 6)) {                 
+                       while (((unsigned long)src & 6) && len >= 2) { /* one 16-bit word per pass */
+                               __u16 val16;                    
+                               *errp = __get_user(val16, (const __u16 __user *)src);
+                               if (*errp) /* NOTE(review): this exit does not reach the memset of dst below */
+                                       return isum;
+                               *(__u16 *)dst = val16;
+                               isum = (__force __wsum)add32_with_carry(
+                                               (__force unsigned)isum, val16);
+                               src += 2; 
+                               dst += 2; 
+                               len -= 2;
+                       }
+               }
+               isum = csum_partial_copy_generic((__force const void *)src,
+                                       dst, len, isum, errp, NULL); /* errp goes in the 5th argument, NULL in the 6th */
+               if (likely(*errp == 0)) 
+                       return isum;
+       } 
+       *errp = -EFAULT; /* reached when access_ok fails or the generic copy left *errp non-zero */
+       memset(dst,0,len); /* zero len bytes starting at the current dst */
+       return isum;            
+} 
+
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+
+/**
+ * csum_partial_copy_to_user - Copy and checksum to user space.
+ * @src: source address
+ * @dst: destination address (user space)
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for a bad destination address, 0 otherwise.
+ *
+ * Returns a 32bit unfolded checksum of the buffer (0 if access_ok fails).
+ * src and dst are best aligned to 64bits.
+ */
+__wsum
+csum_partial_copy_to_user(const void *src, void __user *dst,
+                         int len, __wsum isum, int *errp)
+{ 
+       might_sleep();
+       if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
+               *errp = -EFAULT;
+               return 0; /* note: 0, not isum, on this path */
+       }
+
+       if (unlikely((unsigned long)dst & 6)) { /* same "6, not 7" alignment test as the from_user variant */
+               while (((unsigned long)dst & 6) && len >= 2) { /* one 16-bit word per pass */
+                       __u16 val16 = *(__u16 *)src;
+                       isum = (__force __wsum)add32_with_carry(
+                                       (__force unsigned)isum, val16); /* summed before the store is attempted */
+                       *errp = __put_user(val16, (__u16 __user *)dst);
+                       if (*errp) /* store failed: return with *errp as set by __put_user */
+                               return isum;
+                       src += 2; 
+                       dst += 2; 
+                       len -= 2;
+               }
+       }
+
+       *errp = 0;
+       return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); /* NULL in the 5th argument, errp in the 6th */
+} 
+
+EXPORT_SYMBOL(csum_partial_copy_to_user);
+
+/**
+ * csum_partial_copy_nocheck - Copy and checksum.
+ * @src: source address
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @sum: initial sum that is added into the result (32bit unfolded)
+ *
+ * Returns a 32bit unfolded checksum of the buffer.
+ */
+__wsum
+csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
+{ 
+       return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); /* both error pointers are NULL */
+} 
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+                       const struct in6_addr *daddr,
+                       __u32 len, unsigned short proto, __wsum sum)
+{
+       __u64 rest, sum64; /* rest: len/proto/sum terms; sum64: rest plus both addresses */
+     
+       rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
+               (__force __u64)sum; /* added in 64 bits, so no carry is lost here */
+       asm("  addq (%[saddr]),%[sum]\n" /* bytes 0..7 of saddr */
+           "  adcq 8(%[saddr]),%[sum]\n" /* bytes 8..15 of saddr, with carry */
+           "  adcq (%[daddr]),%[sum]\n" /* bytes 0..7 of daddr, with carry */
+           "  adcq 8(%[daddr]),%[sum]\n" /* bytes 8..15 of daddr, with carry */
+           "  adcq $0,%[sum]\n" /* add the last carry back in */
+           : [sum] "=r" (sum64) 
+           : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
+       return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); /* 64 -> 32 bits, then csum_fold() */
+}
+
+EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c

new file mode 100644 (file)

index 0000000..2dbebd3
--- /dev/null
+++ b/arch/x86/lib/delay_64.c
@@ -0,0 +1,57 @@
+/*
+ *     Precise Delay Loops for x86-64
+ *
+ *     Copyright (C) 1993 Linus Torvalds
+ *     Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ *     The __delay function must _NOT_ be inlined as its execution time
+ *     depends wildly on alignment on many x86 processors. 
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <asm/delay.h>
+#include <asm/msr.h>
+
+#ifdef CONFIG_SMP
+#include <asm/smp.h>
+#endif
+
+int read_current_timer(unsigned long *timer_value)
+{
+       rdtscll(*timer_value); /* store the rdtscll() reading through timer_value */
+       return 0; /* this implementation always returns 0 */
+}
+
+void __delay(unsigned long loops) /* busy-wait until the TSC has advanced by at least `loops` */
+{
+       unsigned long bclock, now; /* full-width TSC samples, same width as loops */
+       
+       rdtscll(bclock); /* starting timestamp */
+       do
+       {
+               rep_nop(); /* spin-wait hint between samples */
+               rdtscll(now);
+       }
+       while((now-bclock) < loops); /* 64-bit difference: also terminates for loops >= 2^32 */
+}
+EXPORT_SYMBOL(__delay);
+
+inline void __const_udelay(unsigned long xloops) /* callers in this file pass a delay scaled by 2**32 per second */
+{
+       __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1); /* scale by this CPU's loops per second, drop the 2**32 factor, +1 */
+}
+EXPORT_SYMBOL(__const_udelay);
+
+void __udelay(unsigned long usecs) /* delay for usecs microseconds via __const_udelay() */
+{
+       __const_udelay(usecs * 0x000010c7);  /* 2**32 / 1000000 (rounded up) */
+}
+EXPORT_SYMBOL(__udelay);
+
+void __ndelay(unsigned long nsecs) /* delay for nsecs nanoseconds via __const_udelay() */
+{
+       __const_udelay(nsecs * 0x00005);  /* 2**32 / 1000000000 (rounded up) */
+}
+EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S

new file mode 100644 (file)

index 0000000..5448876
--- /dev/null
+++ b/arch/x86/lib/getuser_64.S
@@ -0,0 +1,109 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __get_user_X
+ *
+ * Inputs:     %rcx contains the address.
+ *             The register is modified, but all changes are undone
+ *             before returning because the C code doesn't know about it.
+ *
+ * Outputs:    %rax is error code (0 or -EFAULT)
+ *             %rdx contains zero-extended value
+ * 
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+
+       .text
+ENTRY(__get_user_1)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%r8) /* r8 is the base for the addr_limit lookup below */
+       cmpq threadinfo_addr_limit(%r8),%rcx
+       jae bad_get_user /* address >= addr_limit: reject */
+1:     movzb (%rcx),%edx /* zero-extended byte load; faults at 1: are routed to bad_get_user via __ex_table */
+       xorl %eax,%eax /* error code 0 */
+       ret
+       CFI_ENDPROC
+ENDPROC(__get_user_1)
+
+ENTRY(__get_user_2)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%r8)
+       addq $1,%rcx /* rcx = address of the last byte of the 2-byte access */
+       jc 20f /* address wrapped around */
+       cmpq threadinfo_addr_limit(%r8),%rcx
+       jae 20f /* last byte >= addr_limit: reject */
+       decq   %rcx /* undo the +1 before the load */
+2:     movzwl (%rcx),%edx /* zero-extended 16-bit load; faults at 2: go to bad_get_user */
+       xorl %eax,%eax /* error code 0 */
+       ret
+20:    decq    %rcx /* restore the caller's rcx on the failure path too */
+       jmp     bad_get_user
+       CFI_ENDPROC
+ENDPROC(__get_user_2)
+
+ENTRY(__get_user_4)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%r8)
+       addq $3,%rcx /* rcx = address of the last byte of the 4-byte access */
+       jc 30f /* address wrapped around */
+       cmpq threadinfo_addr_limit(%r8),%rcx
+       jae 30f /* last byte >= addr_limit: reject */
+       subq $3,%rcx /* undo the +3 before the load */
+3:     movl (%rcx),%edx /* 32-bit load (writing edx clears the upper half of rdx); faults at 3: go to bad_get_user */
+       xorl %eax,%eax /* error code 0 */
+       ret
+30:    subq $3,%rcx /* restore the caller's rcx on the failure path too */
+       jmp bad_get_user
+       CFI_ENDPROC
+ENDPROC(__get_user_4)
+
+ENTRY(__get_user_8)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%r8)
+       addq $7,%rcx /* rcx = address of the last byte of the 8-byte access */
+       jc 40f /* address wrapped around */
+       cmpq threadinfo_addr_limit(%r8),%rcx
+       jae     40f /* last byte >= addr_limit: reject */
+       subq    $7,%rcx /* undo the +7 before the load */
+4:     movq (%rcx),%rdx /* 64-bit load; faults at 4: go to bad_get_user */
+       xorl %eax,%eax /* error code 0 */
+       ret
+40:    subq $7,%rcx /* restore the caller's rcx on the failure path too */
+       jmp bad_get_user
+       CFI_ENDPROC
+ENDPROC(__get_user_8)
+
+bad_get_user: /* shared failure exit for all __get_user_X variants */
+       CFI_STARTPROC
+       xorl %edx,%edx /* returned value is 0 on failure */
+       movq $(-EFAULT),%rax /* error code */
+       ret
+       CFI_ENDPROC
+END(bad_get_user)
+
+.section __ex_table,"a" /* pairs of (faulting instruction, fixup target) */
+       .quad 1b,bad_get_user /* load in __get_user_1 */
+       .quad 2b,bad_get_user /* load in __get_user_2 */
+       .quad 3b,bad_get_user /* load in __get_user_4 */
+       .quad 4b,bad_get_user /* load in __get_user_8 */
+.previous
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c

new file mode 100644 (file)

index 0000000..87b4a4e
--- /dev/null
+++ b/arch/x86/lib/io_64.c
@@ -0,0 +1,23 @@
+#include <linux/string.h>
+#include <asm/io.h>
+#include <linux/module.h>
+
+void __memcpy_toio(unsigned long dst,const void*src,unsigned len) /* copy len bytes from src to the address held in dst */
+{
+       __inline_memcpy((void *) dst,src,len); /* dst arrives as an integer and is cast to a pointer */
+}
+EXPORT_SYMBOL(__memcpy_toio);
+
+void __memcpy_fromio(void *dst,unsigned long src,unsigned len) /* copy len bytes from the address held in src to dst */
+{
+       __inline_memcpy(dst,(const void *) src,len); /* src arrives as an integer and is cast to a pointer */
+}
+EXPORT_SYMBOL(__memcpy_fromio);
+
+void memset_io(volatile void __iomem *a, int b, size_t c) /* fill c bytes at a with byte value b */
+{
+       /* XXX: memset can mangle the IO patterns quite a bit.
+          Perhaps it would be better to use a dumb one. */
+       memset((void *)a,b,c); /* the cast drops the volatile and __iomem qualifiers */
+}
+EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S

new file mode 100644 (file)

index 0000000..05a95e7
--- /dev/null
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2006 PathScale, Inc.  All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * override generic version in lib/iomap_copy.c
+ */
+ENTRY(__iowrite32_copy)
+       CFI_STARTPROC
+       movl %edx,%ecx /* low 32 bits of rdx become the repeat count */
+       rep movsd /* copy ecx 32-bit words from (%rsi) to (%rdi) */
+       ret
+       CFI_ENDPROC
+ENDPROC(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S

new file mode 100644 (file)

index 0000000..c22981f
--- /dev/null
+++ b/arch/x86/lib/memcpy_64.S
@@ -0,0 +1,131 @@
+/* Copyright 2002 Andi Kleen */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:      
+ * rdi destination
+ * rsi source
+ * rdx count
+ * 
+ * Output:
+ * rax original destination
+ */    
+
+       ALIGN
+memcpy_c: /* string-instruction variant; the altinstructions entry in this file redirects memcpy here */
+       CFI_STARTPROC
+       movq %rdi,%rax /* return value: original destination */
+       movl %edx,%ecx
+       shrl $3,%ecx /* ecx = number of whole quadwords (low 32 bits of count only) */
+       andl $7,%edx /* edx = leftover bytes */
+       rep movsq
+       movl %edx,%ecx
+       rep movsb /* copy the 0..7 leftover bytes */
+       ret
+       CFI_ENDPROC
+ENDPROC(memcpy_c)
+
+ENTRY(__memcpy)
+ENTRY(memcpy)
+       CFI_STARTPROC
+       pushq %rbx /* rbx is saved here and restored at .Lende; nothing in between uses it */
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rbx, 0
+       movq %rdi,%rax /* return value: original destination */
+
+       movl %edx,%ecx
+       shrl $6,%ecx /* ecx = number of whole 64-byte blocks (low 32 bits of count only) */
+       jz .Lhandle_tail
+
+       .p2align 4
+.Lloop_64:
+       decl %ecx /* sets ZF for the jnz at the bottom; the movq/leaq in between leave flags untouched */
+
+       movq (%rsi),%r11
+       movq 8(%rsi),%r8
+
+       movq %r11,(%rdi)
+       movq %r8,1*8(%rdi)
+
+       movq 2*8(%rsi),%r9
+       movq 3*8(%rsi),%r10
+
+       movq %r9,2*8(%rdi)
+       movq %r10,3*8(%rdi)
+
+       movq 4*8(%rsi),%r11
+       movq 5*8(%rsi),%r8
+
+       movq %r11,4*8(%rdi)
+       movq %r8,5*8(%rdi)
+
+       movq 6*8(%rsi),%r9
+       movq 7*8(%rsi),%r10
+
+       movq %r9,6*8(%rdi)
+       movq %r10,7*8(%rdi)
+
+       leaq 64(%rsi),%rsi /* lea rather than add: advances the pointer without changing flags */
+       leaq 64(%rdi),%rdi
+       jnz  .Lloop_64
+
+.Lhandle_tail:
+       movl %edx,%ecx
+       andl $63,%ecx /* bytes left after the 64-byte blocks */
+       shrl $3,%ecx /* ... expressed as whole quadwords */
+       jz   .Lhandle_7
+       .p2align 4
+.Lloop_8:
+       decl %ecx /* ZF from here is consumed by the jnz below */
+       movq (%rsi),%r8
+       movq %r8,(%rdi)
+       leaq 8(%rdi),%rdi
+       leaq 8(%rsi),%rsi
+       jnz  .Lloop_8
+
+.Lhandle_7:
+       movl %edx,%ecx
+       andl $7,%ecx /* 0..7 trailing bytes */
+       jz .Lende
+       .p2align 4
+.Lloop_1:
+       movb (%rsi),%r8b
+       movb %r8b,(%rdi)
+       incq %rdi
+       incq %rsi
+       decl %ecx
+       jnz .Lloop_1
+
+.Lende:
+       popq %rbx
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE rbx
+       ret
+.Lfinal: /* end-of-function label; the altinstructions entry for memcpy does not reference it */
+       CFI_ENDPROC
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+
+       /* Some CPUs run faster using the string copy instructions.
+          It is also a lot simpler. Use this when possible */
+
+       .section .altinstr_replacement,"ax"
+1:     .byte 0xeb                              /* jmp <disp8> */
+       .byte (memcpy_c - memcpy) - (2f - 1b)   /* offset: target minus the end of this 2-byte jmp once placed at memcpy */
+2:
+       .previous
+       .section .altinstructions,"a"
+       .align 8
+       .quad memcpy /* code to be patched */
+       .quad 1b /* replacement bytes */
+       .byte X86_FEATURE_REP_GOOD /* feature selecting the replacement */
+       /* Replace only the beginning: memcpy is used to apply alternatives, so
+        * it is silly to overwrite itself with nops - reboot is the only outcome... */
+       .byte 2b - 1b /* presumably the patched-region length: just the 2-byte jmp */
+       .byte 2b - 1b /* presumably the replacement length */
+       .previous
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c

new file mode 100644 (file)

index 0000000..751ebae
--- /dev/null
+++ b/arch/x86/lib/memmove_64.c
@@ -0,0 +1,21 @@
+/* Normally compiler builtins are used, but sometimes the compiler calls out
+   of line code. Based on asm-i386/string.h.
+ */
+#define _STRING_C
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef memmove
+void *memmove(void * dest,const void *src,size_t count)
+{
+       char *to, *from;
+
+       /* dest below src: hand the whole copy to memcpy */
+       if (dest < src)
+               return memcpy(dest, src, count);
+       /* otherwise copy downwards from the end, last byte first */
+       for (to = (char *)dest + count, from = (char *)src + count; count; count--)
+               *--to = *--from;
+       return dest;
+}
+EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S

new file mode 100644 (file)

index 0000000..2c59481
--- /dev/null
+++ b/arch/x86/lib/memset_64.S
@@ -0,0 +1,133 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *     
+ * rdi   destination
+ * rsi   value (char) 
+ * rdx   count (bytes) 
+ * 
+ * rax   original destination
+ */    
+       ALIGN
+memset_c: /* string-instruction variant; the altinstructions entry in this file redirects memset here */
+       CFI_STARTPROC
+       movq %rdi,%r9 /* keep the original destination for the return value */
+       movl %edx,%r8d
+       andl $7,%r8d /* r8d = leftover bytes after the quadword fill */
+       movl %edx,%ecx
+       shrl $3,%ecx /* ecx = number of whole quadwords (low 32 bits of count only) */
+       /* expand byte value  */
+       movzbl %sil,%esi
+       movabs $0x0101010101010101,%rax
+       mulq %rsi               /* with rax, clobbers rdx */
+       rep stosq /* store the replicated byte pattern ecx times */
+       movl %r8d,%ecx
+       rep stosb /* store the 0..7 leftover bytes */
+       movq %r9,%rax /* return value: original destination */
+       ret
+       CFI_ENDPROC
+ENDPROC(memset_c)
+
+ENTRY(memset)
+ENTRY(__memset)
+       CFI_STARTPROC
+       movq %rdi,%r10 /* keep the original destination for the return value */
+       movq %rdx,%r11 /* keep the count: the mul below clobbers rdx */
+
+       /* expand byte value  */
+       movzbl %sil,%ecx
+       movabs $0x0101010101010101,%rax
+       mul    %rcx             /* with rax, clobbers rdx */
+
+       /* align dst */
+       movl  %edi,%r9d
+       andl  $7,%r9d /* r9d = misalignment of the destination (0..7) */
+       jnz  .Lbad_alignment
+       CFI_REMEMBER_STATE
+.Lafter_bad_alignment:
+
+       movl %r11d,%ecx
+       shrl $6,%ecx /* ecx = number of whole 64-byte blocks */
+       jz       .Lhandle_tail
+
+       .p2align 4
+.Lloop_64:
+       decl   %ecx /* sets ZF for the jnz below; the movq/leaq in between leave flags untouched */
+       movq  %rax,(%rdi)
+       movq  %rax,8(%rdi)
+       movq  %rax,16(%rdi)
+       movq  %rax,24(%rdi)
+       movq  %rax,32(%rdi)
+       movq  %rax,40(%rdi)
+       movq  %rax,48(%rdi)
+       movq  %rax,56(%rdi)
+       leaq  64(%rdi),%rdi
+       jnz    .Lloop_64
+
+       /* Handle tail in loops. The loops should be faster than hard
+          to predict jump tables. */
+       .p2align 4
+.Lhandle_tail:
+       movl    %r11d,%ecx
+       andl    $63&(~7),%ecx /* bytes left in whole quadwords: bits 3..5 of the count */
+       jz              .Lhandle_7
+       shrl    $3,%ecx /* convert that byte count to a quadword count */
+       .p2align 4
+.Lloop_8:
+       decl   %ecx
+       movq  %rax,(%rdi)
+       leaq  8(%rdi),%rdi
+       jnz    .Lloop_8
+
+.Lhandle_7:
+       movl    %r11d,%ecx
+       andl    $7,%ecx /* 0..7 trailing bytes */
+       jz      .Lende
+       .p2align 4
+.Lloop_1:
+       decl    %ecx
+       movb    %al,(%rdi)
+       leaq    1(%rdi),%rdi
+       jnz     .Lloop_1
+
+.Lende:
+       movq    %r10,%rax /* return value: original destination */
+       ret
+
+       CFI_RESTORE_STATE
+.Lbad_alignment:
+       cmpq $7,%r11
+       jbe     .Lhandle_7 /* 7 bytes or fewer: just use the byte loop */
+       movq %rax,(%rdi)        /* unaligned store */
+       movq $8,%r8
+       subq %r9,%r8 /* r8 = 8 - misalignment = bytes up to the next 8-byte boundary */
+       addq %r8,%rdi /* advance to the boundary; those bytes were covered by the store above */
+       subq %r8,%r11 /* and take them off the remaining count */
+       jmp .Lafter_bad_alignment
+.Lfinal: /* end of memset; referenced by the altinstructions entry in this file */
+       CFI_ENDPROC
+ENDPROC(memset)
+ENDPROC(__memset)
+
+       /* Some CPUs run faster using the string instructions.
+          It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+       .section .altinstr_replacement,"ax"
+1:     .byte 0xeb                              /* jmp <disp8> */
+       .byte (memset_c - memset) - (2f - 1b)   /* offset: target minus the end of this 2-byte jmp once placed at memset */
+2:
+       .previous
+       .section .altinstructions,"a"
+       .align 8
+       .quad memset /* code to be patched */
+       .quad 1b /* replacement bytes */
+       .byte X86_FEATURE_REP_GOOD /* feature selecting the replacement */
+       .byte .Lfinal - memset /* presumably the patched-region length: here the whole function */
+       .byte 2b - 1b /* presumably the replacement length: the 2-byte jmp */
+       .previous
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S

new file mode 100644 (file)

index 0000000..4989f5a
--- /dev/null
+++ b/arch/x86/lib/putuser_64.S
@@ -0,0 +1,106 @@
+/*
+ * __put_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __put_user_X
+ *
+ * Inputs:     %rcx contains the address
+ *             %rdx contains new value
+ *
+ * Outputs:    %rax is error code (0 or -EFAULT)
+ *
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+
+       .text
+ENTRY(__put_user_1)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%r8) /* r8 is the base for the addr_limit lookup below */
+       cmpq threadinfo_addr_limit(%r8),%rcx
+       jae bad_put_user /* address >= addr_limit: reject */
+1:     movb %dl,(%rcx) /* byte store; faults at 1: are routed to bad_put_user via __ex_table */
+       xorl %eax,%eax /* error code 0 */
+       ret
+       CFI_ENDPROC
+ENDPROC(__put_user_1)
+
+ENTRY(__put_user_2)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%r8)
+       addq $1,%rcx /* rcx = address of the last byte of the 2-byte access */
+       jc 20f /* address wrapped around */
+       cmpq threadinfo_addr_limit(%r8),%rcx
+       jae 20f /* last byte >= addr_limit: reject */
+       decq %rcx /* undo the +1 before the store */
+2:     movw %dx,(%rcx) /* 16-bit store; faults at 2: go to bad_put_user */
+       xorl %eax,%eax /* error code 0 */
+       ret
+20:    decq %rcx /* restore the caller's rcx on the failure path too */
+       jmp bad_put_user
+       CFI_ENDPROC
+ENDPROC(__put_user_2)
+
+ENTRY(__put_user_4)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%r8)
+       addq $3,%rcx /* rcx = address of the last byte of the 4-byte access */
+       jc 30f /* address wrapped around */
+       cmpq threadinfo_addr_limit(%r8),%rcx
+       jae 30f /* last byte >= addr_limit: reject */
+       subq $3,%rcx /* undo the +3 before the store */
+3:     movl %edx,(%rcx) /* 32-bit store; faults at 3: go to bad_put_user */
+       xorl %eax,%eax /* error code 0 */
+       ret
+30:    subq $3,%rcx /* restore the caller's rcx on the failure path too */
+       jmp bad_put_user
+       CFI_ENDPROC
+ENDPROC(__put_user_4)
+
+ENTRY(__put_user_8)
+       CFI_STARTPROC
+       GET_THREAD_INFO(%r8)
+       addq $7,%rcx /* rcx = address of the last byte of the 8-byte access */
+       jc 40f /* address wrapped around */
+       cmpq threadinfo_addr_limit(%r8),%rcx
+       jae 40f /* last byte >= addr_limit: reject */
+       subq $7,%rcx /* undo the +7 before the store */
+4:     movq %rdx,(%rcx) /* 64-bit store; faults at 4: go to bad_put_user */
+       xorl %eax,%eax /* error code 0 */
+       ret
+40:    subq $7,%rcx /* restore the caller's rcx on the failure path too */
+       jmp bad_put_user
+       CFI_ENDPROC
+ENDPROC(__put_user_8)
+
+bad_put_user: /* shared failure exit for all __put_user_X variants */
+       CFI_STARTPROC
+       movq $(-EFAULT),%rax /* error code */
+       ret
+       CFI_ENDPROC
+END(bad_put_user)
+
+.section __ex_table,"a" /* pairs of (faulting instruction, fixup target) */
+       .quad 1b,bad_put_user /* store in __put_user_1 */
+       .quad 2b,bad_put_user /* store in __put_user_2 */
+       .quad 3b,bad_put_user /* store in __put_user_4 */
+       .quad 4b,bad_put_user /* store in __put_user_8 */
+.previous
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S

new file mode 100644 (file)

index 0000000..0cde1f8
--- /dev/null
+++ b/arch/x86/lib/rwlock_64.S
@@ -0,0 +1,38 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.i>
+#include <asm/dwarf2.h>
+
+/* rdi:        pointer to rwlock_t */
+ENTRY(__write_lock_failed)
+       CFI_STARTPROC
+       LOCK_PREFIX
+       addl $RW_LOCK_BIAS,(%rdi) /* add the bias back; presumably undoes the caller's failed subtract - confirm against the fast path */
+1:     rep
+       nop /* "rep; nop": spin-wait hint */
+       cmpl $RW_LOCK_BIAS,(%rdi)
+       jne 1b /* spin until the lock word equals RW_LOCK_BIAS */
+       LOCK_PREFIX
+       subl $RW_LOCK_BIAS,(%rdi) /* try to take the lock again */
+       jnz  __write_lock_failed /* result not zero: start over */
+       ret
+       CFI_ENDPROC
+END(__write_lock_failed)
+
+/* rdi:        pointer to rwlock_t */
+ENTRY(__read_lock_failed)
+       CFI_STARTPROC
+       LOCK_PREFIX
+       incl (%rdi) /* add one back; presumably undoes the caller's failed decrement - confirm against the fast path */
+1:     rep
+       nop /* "rep; nop": spin-wait hint */
+       cmpl $1,(%rdi)
+       js 1b /* spin while (lock word - 1) is negative */
+       LOCK_PREFIX
+       decl (%rdi) /* try to take a read reference again */
+       js __read_lock_failed /* result negative: start over */
+       ret
+       CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S

new file mode 100644 (file)

index 0000000..55e586d
--- /dev/null
+++ b/arch/x86/lib/thunk_64.S
@@ -0,0 +1,67 @@
+/*
+ * Save registers before calling assembly functions. This avoids
+ * disturbance of register allocation in some inline assembly constructs.
+ * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+
+       #include <linux/linkage.h>
+       #include <asm/dwarf2.h>
+       #include <asm/calling.h>                        
+       #include <asm/rwlock.h>
+               
+       /* rdi: arg1 ... normal C conventions. rax is saved/restored. */        
+       .macro thunk name,func /* emit global \name: SAVE_ARGS, call \func, leave through restore */
+       .globl \name
+\name: 
+       CFI_STARTPROC
+       SAVE_ARGS
+       call \func
+       jmp  restore /* common exit defined later in this file (RESTORE_ARGS; ret) */
+       CFI_ENDPROC
+       .endm
+
+       /* rdi: arg1 ... normal C conventions. rax is passed from C. */         
+       .macro thunk_retrax name,func /* like thunk, but leaves through restore_norax */
+       .globl \name
+\name: 
+       CFI_STARTPROC
+       SAVE_ARGS
+       call \func
+       jmp  restore_norax /* exit using "RESTORE_ARGS 1"; presumably keeps \func's rax - confirm in asm/calling.h */
+       CFI_ENDPROC
+       .endm
+       
+
+       .section .sched.text /* every thunk below is emitted into this section */
+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM /* rwsem slow-path thunks only exist with this config option */
+       thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
+       thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
+       thunk rwsem_wake_thunk,rwsem_wake
+       thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
+#endif 
+       
+       thunk __down_failed,__down
+       thunk_retrax __down_failed_interruptible,__down_interruptible
+       thunk_retrax __down_failed_trylock,__down_trylock
+       thunk __up_wakeup,__up
+
+#ifdef CONFIG_TRACE_IRQFLAGS /* irq-tracing thunks only exist with this config option */
+       thunk trace_hardirqs_on_thunk,trace_hardirqs_on
+       thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+#endif
+       
+       /* SAVE_ARGS below is used only for the .cfi directives it contains. */
+       CFI_STARTPROC
+       SAVE_ARGS /* per the comment above: here only for the .cfi directives it contains */
+restore: /* jump target of the thunk macro */
+       RESTORE_ARGS
+       ret     
+       CFI_ENDPROC
+       
+       CFI_STARTPROC
+       SAVE_ARGS /* sits before the label, as in the restore stub above; presumably also only for its .cfi directives */
+restore_norax: /* jump target of the thunk_retrax macro */
+       RESTORE_ARGS 1 /* argument 1 presumably tells the macro to skip restoring rax - confirm in asm/calling.h */
+       ret
+       CFI_ENDPROC
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c

new file mode 100644 (file)

index 0000000..893d43f
--- /dev/null
+++ b/arch/x86/lib/usercopy_64.c
@@ -0,0 +1,166 @@
+/* 
+ * User address space access functions.
+ *
+ * Copyright 1997 Andi Kleen <ak@muc.de>
+ * Copyright 1997 Linus Torvalds
+ * Copyright 2002 Andi Kleen <ak@suse.de>
+ */
+#include <linux/module.h>
+#include <asm/uaccess.h>
+
+/*
+ * Copy a null terminated string from userspace.
+ *
+ * __do_strncpy_from_user(dst, src, count, res) calls might_sleep() and then
+ * copies bytes from src to dst with lodsb/stosb until a 0 byte has been
+ * copied or count bytes have been copied, whichever happens first.
+ *
+ * res is set to:
+ *  - the number of bytes that precede the 0 byte, when one was found (the
+ *    0 byte itself is stored in dst but not counted);
+ *  - count, when no 0 byte was found within count bytes (dst is then not
+ *    terminated) -- this includes count == 0, where nothing is copied;
+ *  - -EFAULT, when the load at label 0 faults: the __ex_table entry sends
+ *    execution to the fixup code at label 3.
+ *
+ * Bytes of dst after the copied string are not touched (no zero padding).
+ * count is an output operand of the asm and is modified.
+ */
+
+#define __do_strncpy_from_user(dst,src,count,res)                         \
+do {                                                                      \
+       long __d0, __d1, __d2;                                             \
+       might_sleep();                                                     \
+       __asm__ __volatile__(                                              \
+               "       testq %1,%1\n"                                     \
+               "       jz 2f\n"                                           \
+               "0:     lodsb\n"                                           \
+               "       stosb\n"                                           \
+               "       testb %%al,%%al\n"                                 \
+               "       jz 1f\n"                                           \
+               "       decq %1\n"                                         \
+               "       jnz 0b\n"                                          \
+               "1:     subq %1,%0\n"                                      \
+               "2:\n"                                                     \
+               ".section .fixup,\"ax\"\n"                                 \
+               "3:     movq %5,%0\n"                                      \
+               "       jmp 2b\n"                                          \
+               ".previous\n"                                              \
+               ".section __ex_table,\"a\"\n"                              \
+               "       .align 8\n"                                        \
+               "       .quad 0b,3b\n"                                     \
+               ".previous"                                                \
+               : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),      \
+                 "=&D" (__d2)                                             \
+               : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
+               : "memory");                                               \
+} while (0)
+
+/*
+ * Unchecked string copy from user space: no access_ok() test is made here.
+ * Hands back exactly what __do_strncpy_from_user() computes.
+ */
+long __strncpy_from_user(char *dst, const char __user *src, long count)
+{
+       long copied;
+
+       __do_strncpy_from_user(dst, src, count, copied);
+       return copied;
+}
+EXPORT_SYMBOL(__strncpy_from_user);
+
+/*
+ * Checked string copy from user space.  Only the first byte of src is
+ * validated with access_ok(); if that check fails -EFAULT is returned,
+ * otherwise the result of __strncpy_from_user() is passed through.
+ */
+long strncpy_from_user(char *dst, const char __user *src, long count)
+{
+       if (!access_ok(VERIFY_READ, src, 1))
+               return -EFAULT;
+
+       return __strncpy_from_user(dst, src, count);
+}
+EXPORT_SYMBOL(strncpy_from_user);
+
+/*
+ * Zero Userspace
+ *
+ * __clear_user() calls might_sleep() and then stores zeroes to size bytes
+ * starting at addr: first size / 8 quadwords (label 0), then the remaining
+ * size & 7 bytes one at a time (label 1).  No access_ok() test is made
+ * here.
+ *
+ * The return value is what is left in rcx when the asm finishes, i.e. the
+ * number of bytes that were not cleared (0 when everything was written):
+ *  - a fault in the quadword loop runs the fixup at label 3, which
+ *    computes (size & 7) + 8 * <quadwords still to do>;
+ *  - a fault in the byte loop resumes at label 2 with the count of bytes
+ *    still to do in rcx.
+ *
+ * NOTE(review): both loops decrement the counter with a 32-bit decl %ecx,
+ * so a quadword count of 2^32 or more would not be handled as written.
+ */
+
+unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+       long __d0;
+       might_sleep();
+       /* no memory constraint because it doesn't change any memory gcc knows
+          about */
+       asm volatile(
+               "       testq  %[size8],%[size8]\n"
+               "       jz     4f\n"
+               "0:     movq %[zero],(%[dst])\n"
+               "       addq   %[eight],%[dst]\n"
+               "       decl %%ecx ; jnz   0b\n"
+               "4:     movq  %[size1],%%rcx\n"
+               "       testl %%ecx,%%ecx\n"
+               "       jz     2f\n"
+               "1:     movb   %b[zero],(%[dst])\n"
+               "       incq   %[dst]\n"
+               "       decl %%ecx ; jnz  1b\n"
+               "2:\n"
+               ".section .fixup,\"ax\"\n"
+               "3:     lea 0(%[size1],%[size8],8),%[size8]\n"
+               "       jmp 2b\n"
+               ".previous\n"
+               ".section __ex_table,\"a\"\n"
+               "       .align 8\n"
+               "       .quad 0b,3b\n"
+               "       .quad 1b,2b\n"
+               ".previous"
+               : [size8] "=c"(size), [dst] "=&D" (__d0)
+               : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
+                 [zero] "r" (0UL), [eight] "r" (8UL));
+       return size;
+}
+EXPORT_SYMBOL(__clear_user);
+
+/*
+ * Checked variant of __clear_user(): the whole range [to, to + n) must pass
+ * access_ok() for writing.  Returns n (nothing cleared) when the check
+ * fails, otherwise the result of __clear_user().
+ */
+unsigned long clear_user(void __user *to, unsigned long n)
+{
+       if (!access_ok(VERIFY_WRITE, to, n))
+               return n;
+
+       return __clear_user(to, n);
+}
+EXPORT_SYMBOL(clear_user);
+
+/*
+ * Measure a user-space string, counting the terminating 0.
+ *
+ * Yields 0 if reading a byte faults, and n+1 (i.e. more than n) when no
+ * terminator is found among the first n+1 bytes.
+ */
+
+long __strnlen_user(const char __user *s, long n)
+{
+       long len;
+       char ch;
+
+       /* len is the number of non-zero bytes seen so far */
+       for (len = 0; len <= n; len++, s++) {
+               if (__get_user(ch, s))
+                       return 0;
+               if (ch == '\0')
+                       return len + 1;
+       }
+       return n + 1;
+}
+EXPORT_SYMBOL(__strnlen_user);
+
+/*
+ * Checked variant of __strnlen_user().
+ *
+ * Returns 0 if the first byte of s fails access_ok() or if reading the
+ * string faults; otherwise the string size including the ending 0, or a
+ * value greater than n if it is too long.
+ *
+ * n is only an upper bound on how far to scan, not the size of an object
+ * that must exist, so only the first byte is range checked -- the same
+ * check strncpy_from_user() makes.  Checking the full n bytes would reject
+ * a short, valid string whenever s + n runs past the user address limit.
+ * NOTE(review): like strncpy_from_user(), this relies on the byte-by-byte
+ * __get_user() scan faulting before it can leave the user address range.
+ */
+long strnlen_user(const char __user *s, long n)
+{
+       if (!access_ok(VERIFY_READ, s, 1))
+               return 0;
+       return __strnlen_user(s, n);
+}
+EXPORT_SYMBOL(strnlen_user);
+
+/*
+ * Unbounded length of a user-space string, counting the ending 0.
+ * Each byte is fetched with get_user(); a failed fetch yields 0.
+ */
+long strlen_user(const char __user *s)
+{
+       long len;
+       char ch;
+
+       /* len already includes the byte currently being examined */
+       for (len = 1; ; len++, s++) {
+               if (get_user(ch, s))
+                       return 0;
+               if (ch == '\0')
+                       return len;
+       }
+}
+EXPORT_SYMBOL(strlen_user);
+
+/*
+ * Copy len bytes from one user-space buffer to another.
+ *
+ * Returns len (nothing copied) unless the destination range passes
+ * access_ok() for writing and the source range passes it for reading;
+ * otherwise returns whatever copy_user_generic() returns -- presumably the
+ * number of bytes left uncopied, 0 on success.
+ */
+unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
+{
+       if (!access_ok(VERIFY_WRITE, to, len))
+               return len;
+       if (!access_ok(VERIFY_READ, from, len))
+               return len;
+
+       return copy_user_generic((__force void *)to, (__force void *)from, len);
+}
+EXPORT_SYMBOL(copy_in_user);
+
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile

index 265484c1723498c0651df7b8a43cebee98fa31bc..ae48f179d719f1fafb8211c259704f3ed3d353a3 100644 (file)
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -73,7 +73,7 @@ AFLAGS += -m64
  
  head-y := arch/x86_64/kernel/head_64.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task_64.o
  
-libs-y                                         += arch/x86_64/lib/
+libs-y                                         += arch/x86/lib/
  core-y                                 += arch/x86_64/kernel/ \
                                            arch/x86_64/mm/ \
                                            arch/x86/crypto/ \
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile

deleted file mode 100644 (file)

index 2d7d724..0000000
--- a/arch/x86_64/lib/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/lib/Makefile_32
-else
-include ${srctree}/arch/x86_64/lib/Makefile_64
-endif
diff --git a/arch/x86_64/lib/Makefile_64 b/arch/x86_64/lib/Makefile_64

deleted file mode 100644 (file)

index bbabad3..0000000
--- a/arch/x86_64/lib/Makefile_64
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# Makefile for x86_64-specific library files.
-#
-
-CFLAGS_csum-partial_64.o := -funroll-loops
-
-obj-y := io_64.o iomap_copy_64.o
-obj-$(CONFIG_SMP)      += msr-on-cpu.o
-
-lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
-       usercopy_64.o getuser_64.o putuser_64.o  \
-       thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
-lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86_64/lib/bitops_64.c b/arch/x86_64/lib/bitops_64.c

deleted file mode 100644 (file)

index 95b6d96..0000000
--- a/arch/x86_64/lib/bitops_64.c
+++ /dev/null
@@ -1,175 +0,0 @@
-#include <linux/bitops.h>
-
-#undef find_first_zero_bit
-#undef find_next_zero_bit
-#undef find_first_bit
-#undef find_next_bit
-
-static inline long
-__find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
-       long d0, d1, d2;
-       long res;
-
-       /*
-        * We must test the size in words, not in bits, because
-        * otherwise incoming sizes in the range -63..-1 will not run
-        * any scasq instructions, and then the flags used by the je
-        * instruction will have whatever random value was in place
-        * before.  Nobody should call us like that, but
-        * find_next_zero_bit() does when offset and size are at the
-        * same word and it fails to find a zero itself.
-        */
-       size += 63;
-       size >>= 6;
-       if (!size)
-               return 0;
-       asm volatile(
-               "  repe; scasq\n"
-               "  je 1f\n"
-               "  xorq -8(%%rdi),%%rax\n"
-               "  subq $8,%%rdi\n"
-               "  bsfq %%rax,%%rdx\n"
-               "1:  subq %[addr],%%rdi\n"
-               "  shlq $3,%%rdi\n"
-               "  addq %%rdi,%%rdx"
-               :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
-               :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
-                [addr] "S" (addr) : "memory");
-       /*
-        * Any register would do for [addr] above, but GCC tends to
-        * prefer rbx over rsi, even though rsi is readily available
-        * and doesn't have to be saved.
-        */
-       return res;
-}
-
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-long find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
-       return __find_first_zero_bit (addr, size);
-}
-
-/**
- * find_next_zero_bit - find the first zero bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-long find_next_zero_bit (const unsigned long * addr, long size, long offset)
-{
-       const unsigned long * p = addr + (offset >> 6);
-       unsigned long set = 0;
-       unsigned long res, bit = offset&63;
-
-       if (bit) {
-               /*
-                * Look for zero in first word
-                */
-               asm("bsfq %1,%0\n\t"
-                   "cmoveq %2,%0"
-                   : "=r" (set)
-                   : "r" (~(*p >> bit)), "r"(64L));
-               if (set < (64 - bit))
-                       return set + offset;
-               set = 64 - bit;
-               p++;
-       }
-       /*
-        * No zero yet, search remaining full words for a zero
-        */
-       res = __find_first_zero_bit (p, size - 64 * (p - addr));
-
-       return (offset + set + res);
-}
-
-static inline long
-__find_first_bit(const unsigned long * addr, unsigned long size)
-{
-       long d0, d1;
-       long res;
-
-       /*
-        * We must test the size in words, not in bits, because
-        * otherwise incoming sizes in the range -63..-1 will not run
-        * any scasq instructions, and then the flags used by the jz
-        * instruction will have whatever random value was in place
-        * before.  Nobody should call us like that, but
-        * find_next_bit() does when offset and size are at the same
-        * word and it fails to find a one itself.
-        */
-       size += 63;
-       size >>= 6;
-       if (!size)
-               return 0;
-       asm volatile(
-               "   repe; scasq\n"
-               "   jz 1f\n"
-               "   subq $8,%%rdi\n"
-               "   bsfq (%%rdi),%%rax\n"
-               "1: subq %[addr],%%rdi\n"
-               "   shlq $3,%%rdi\n"
-               "   addq %%rdi,%%rax"
-               :"=a" (res), "=&c" (d0), "=&D" (d1)
-               :"0" (0ULL), "1" (size), "2" (addr),
-                [addr] "r" (addr) : "memory");
-       return res;
-}
-
-/**
- * find_first_bit - find the first set bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first set bit, not the number of the byte
- * containing a bit.
- */
-long find_first_bit(const unsigned long * addr, unsigned long size)
-{
-       return __find_first_bit(addr,size);
-}
-
-/**
- * find_next_bit - find the first set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-long find_next_bit(const unsigned long * addr, long size, long offset)
-{
-       const unsigned long * p = addr + (offset >> 6);
-       unsigned long set = 0, bit = offset & 63, res;
-
-       if (bit) {
-               /*
-                * Look for nonzero in the first 64 bits:
-                */
-               asm("bsfq %1,%0\n\t"
-                   "cmoveq %2,%0\n\t"
-                   : "=r" (set)
-                   : "r" (*p >> bit), "r" (64L));
-               if (set < (64 - bit))
-                       return set + offset;
-               set = 64 - bit;
-               p++;
-       }
-       /*
-        * No set bit yet, search remaining full words for a bit
-        */
-       res = __find_first_bit (p, size - 64 * (p - addr));
-       return (offset + set + res);
-}
-
-#include <linux/module.h>
-
-EXPORT_SYMBOL(find_next_bit);
-EXPORT_SYMBOL(find_first_bit);
-EXPORT_SYMBOL(find_first_zero_bit);
-EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86_64/lib/bitstr_64.c b/arch/x86_64/lib/bitstr_64.c

deleted file mode 100644 (file)

index 2467660..0000000
--- a/arch/x86_64/lib/bitstr_64.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <linux/module.h>
-#include <linux/bitops.h>
-
-/* Find string of zero bits in a bitmap */ 
-unsigned long 
-find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
-{ 
-       unsigned long n, end, i;        
-
- again:
-       n = find_next_zero_bit(bitmap, nbits, start);
-       if (n == -1) 
-               return -1;
-       
-       /* could test bitsliced, but it's hardly worth it */
-       end = n+len;
-       if (end >= nbits) 
-               return -1; 
-       for (i = n+1; i < end; i++) { 
-               if (test_bit(i, bitmap)) {  
-                       start = i+1; 
-                       goto again; 
-               } 
-       }
-       return n;
-}
-
-EXPORT_SYMBOL(find_next_zero_string);
diff --git a/arch/x86_64/lib/clear_page_64.S b/arch/x86_64/lib/clear_page_64.S

deleted file mode 100644 (file)

index 9a10a78..0000000
--- a/arch/x86_64/lib/clear_page_64.S
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-/*
- * Zero a page.        
- * rdi page
- */                    
-       ALIGN
-clear_page_c:
-       CFI_STARTPROC
-       movl $4096/8,%ecx
-       xorl %eax,%eax
-       rep stosq
-       ret
-       CFI_ENDPROC
-ENDPROC(clear_page)
-
-ENTRY(clear_page)
-       CFI_STARTPROC
-       xorl   %eax,%eax
-       movl   $4096/64,%ecx
-       .p2align 4
-.Lloop:
-       decl    %ecx
-#define PUT(x) movq %rax,x*8(%rdi)
-       movq %rax,(%rdi)
-       PUT(1)
-       PUT(2)
-       PUT(3)
-       PUT(4)
-       PUT(5)
-       PUT(6)
-       PUT(7)
-       leaq    64(%rdi),%rdi
-       jnz     .Lloop
-       nop
-       ret
-       CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
-
-       /* Some CPUs run faster using the string instructions.
-          It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-       .section .altinstr_replacement,"ax"
-1:     .byte 0xeb                                      /* jmp <disp8> */
-       .byte (clear_page_c - clear_page) - (2f - 1b)   /* offset */
-2:
-       .previous
-       .section .altinstructions,"a"
-       .align 8
-       .quad clear_page
-       .quad 1b
-       .byte X86_FEATURE_REP_GOOD
-       .byte .Lclear_page_end - clear_page
-       .byte 2b - 1b
-       .previous
diff --git a/arch/x86_64/lib/copy_page_64.S b/arch/x86_64/lib/copy_page_64.S

deleted file mode 100644 (file)

index 727a5d4..0000000
--- a/arch/x86_64/lib/copy_page_64.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-       ALIGN
-copy_page_c:
-       CFI_STARTPROC
-       movl $4096/8,%ecx
-       rep movsq
-       ret
-       CFI_ENDPROC
-ENDPROC(copy_page_c)
-
-/* Don't use streaming store because it's better when the target
-   ends up in cache. */
-           
-/* Could vary the prefetch distance based on SMP/UP */
-
-ENTRY(copy_page)
-       CFI_STARTPROC
-       subq    $3*8,%rsp
-       CFI_ADJUST_CFA_OFFSET 3*8
-       movq    %rbx,(%rsp)
-       CFI_REL_OFFSET rbx, 0
-       movq    %r12,1*8(%rsp)
-       CFI_REL_OFFSET r12, 1*8
-       movq    %r13,2*8(%rsp)
-       CFI_REL_OFFSET r13, 2*8
-
-       movl    $(4096/64)-5,%ecx
-       .p2align 4
-.Loop64:
-       dec     %rcx
-
-       movq        (%rsi), %rax
-       movq      8 (%rsi), %rbx
-       movq     16 (%rsi), %rdx
-       movq     24 (%rsi), %r8
-       movq     32 (%rsi), %r9
-       movq     40 (%rsi), %r10
-       movq     48 (%rsi), %r11
-       movq     56 (%rsi), %r12
-
-       prefetcht0 5*64(%rsi)
-
-       movq     %rax,    (%rdi)
-       movq     %rbx,  8 (%rdi)
-       movq     %rdx, 16 (%rdi)
-       movq     %r8,  24 (%rdi)
-       movq     %r9,  32 (%rdi)
-       movq     %r10, 40 (%rdi)
-       movq     %r11, 48 (%rdi)
-       movq     %r12, 56 (%rdi)
-
-       leaq    64 (%rsi), %rsi
-       leaq    64 (%rdi), %rdi
-
-       jnz     .Loop64
-
-       movl    $5,%ecx
-       .p2align 4
-.Loop2:
-       decl   %ecx
-
-       movq        (%rsi), %rax
-       movq      8 (%rsi), %rbx
-       movq     16 (%rsi), %rdx
-       movq     24 (%rsi), %r8
-       movq     32 (%rsi), %r9
-       movq     40 (%rsi), %r10
-       movq     48 (%rsi), %r11
-       movq     56 (%rsi), %r12
-
-       movq     %rax,    (%rdi)
-       movq     %rbx,  8 (%rdi)
-       movq     %rdx, 16 (%rdi)
-       movq     %r8,  24 (%rdi)
-       movq     %r9,  32 (%rdi)
-       movq     %r10, 40 (%rdi)
-       movq     %r11, 48 (%rdi)
-       movq     %r12, 56 (%rdi)
-
-       leaq    64(%rdi),%rdi
-       leaq    64(%rsi),%rsi
-
-       jnz     .Loop2
-
-       movq    (%rsp),%rbx
-       CFI_RESTORE rbx
-       movq    1*8(%rsp),%r12
-       CFI_RESTORE r12
-       movq    2*8(%rsp),%r13
-       CFI_RESTORE r13
-       addq    $3*8,%rsp
-       CFI_ADJUST_CFA_OFFSET -3*8
-       ret
-.Lcopy_page_end:
-       CFI_ENDPROC
-ENDPROC(copy_page)
-
-       /* Some CPUs run faster using the string copy instructions.
-          It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-       .section .altinstr_replacement,"ax"
-1:     .byte 0xeb                                      /* jmp <disp8> */
-       .byte (copy_page_c - copy_page) - (2f - 1b)     /* offset */
-2:
-       .previous
-       .section .altinstructions,"a"
-       .align 8
-       .quad copy_page
-       .quad 1b
-       .byte X86_FEATURE_REP_GOOD
-       .byte .Lcopy_page_end - copy_page
-       .byte 2b - 1b
-       .previous
diff --git a/arch/x86_64/lib/copy_user_64.S b/arch/x86_64/lib/copy_user_64.S

deleted file mode 100644 (file)

index 70bebd3..0000000
--- a/arch/x86_64/lib/copy_user_64.S
+++ /dev/null
@@ -1,354 +0,0 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License v2.
- * 
- * Functions to copy from and to user space.           
- */             
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/cpufeature.h>
-
-       .macro ALTERNATIVE_JUMP feature,orig,alt
-0:
-       .byte 0xe9      /* 32bit jump */
-       .long \orig-1f  /* by default jump to orig */
-1:
-       .section .altinstr_replacement,"ax"
-2:     .byte 0xe9                   /* near jump with 32bit immediate */
-       .long \alt-1b /* offset */   /* or alternatively to alt */
-       .previous
-       .section .altinstructions,"a"
-       .align 8
-       .quad  0b
-       .quad  2b
-       .byte  \feature              /* when feature is set */
-       .byte  5
-       .byte  5
-       .previous
-       .endm
-
-/* Standard copy_to_user with segment limit checking */                
-ENTRY(copy_to_user)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%rax)
-       movq %rdi,%rcx
-       addq %rdx,%rcx
-       jc  bad_to_user
-       cmpq threadinfo_addr_limit(%rax),%rcx
-       jae bad_to_user
-       xorl %eax,%eax  /* clear zero flag */
-       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
-       CFI_ENDPROC
-
-ENTRY(copy_user_generic)
-       CFI_STARTPROC
-       movl $1,%ecx    /* set zero flag */
-       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
-       CFI_ENDPROC
-
-ENTRY(__copy_from_user_inatomic)
-       CFI_STARTPROC
-       xorl %ecx,%ecx  /* clear zero flag */
-       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
-       CFI_ENDPROC
-
-/* Standard copy_from_user with segment limit checking */      
-ENTRY(copy_from_user)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%rax)
-       movq %rsi,%rcx
-       addq %rdx,%rcx
-       jc  bad_from_user
-       cmpq threadinfo_addr_limit(%rax),%rcx
-       jae  bad_from_user
-       movl $1,%ecx    /* set zero flag */
-       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
-       CFI_ENDPROC
-ENDPROC(copy_from_user)
-       
-       .section .fixup,"ax"
-       /* must zero dest */
-bad_from_user:
-       CFI_STARTPROC
-       movl %edx,%ecx
-       xorl %eax,%eax
-       rep
-       stosb
-bad_to_user:
-       movl    %edx,%eax
-       ret
-       CFI_ENDPROC
-END(bad_from_user)
-       .previous
-       
-               
-/*
- * copy_user_generic_unrolled - memory copy with exception handling.
- * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
- *     
- * Input:      
- * rdi destination
- * rsi source
- * rdx count
- * ecx zero flag -- if true zero destination on error
- *
- * Output:             
- * eax uncopied bytes or 0 if successful.
- */
-ENTRY(copy_user_generic_unrolled)
-       CFI_STARTPROC
-       pushq %rbx
-       CFI_ADJUST_CFA_OFFSET 8
-       CFI_REL_OFFSET rbx, 0
-       pushq %rcx
-       CFI_ADJUST_CFA_OFFSET 8
-       CFI_REL_OFFSET rcx, 0
-       xorl %eax,%eax          /*zero for the exception handler */
-
-#ifdef FIX_ALIGNMENT
-       /* check for bad alignment of destination */
-       movl %edi,%ecx
-       andl $7,%ecx
-       jnz  .Lbad_alignment
-.Lafter_bad_alignment:
-#endif
-
-       movq %rdx,%rcx
-
-       movl $64,%ebx
-       shrq $6,%rdx
-       decq %rdx
-       js   .Lhandle_tail
-
-       .p2align 4
-.Lloop:
-.Ls1:  movq (%rsi),%r11
-.Ls2:  movq 1*8(%rsi),%r8
-.Ls3:  movq 2*8(%rsi),%r9
-.Ls4:  movq 3*8(%rsi),%r10
-.Ld1:  movq %r11,(%rdi)
-.Ld2:  movq %r8,1*8(%rdi)
-.Ld3:  movq %r9,2*8(%rdi)
-.Ld4:  movq %r10,3*8(%rdi)
-
-.Ls5:  movq 4*8(%rsi),%r11
-.Ls6:  movq 5*8(%rsi),%r8
-.Ls7:  movq 6*8(%rsi),%r9
-.Ls8:  movq 7*8(%rsi),%r10
-.Ld5:  movq %r11,4*8(%rdi)
-.Ld6:  movq %r8,5*8(%rdi)
-.Ld7:  movq %r9,6*8(%rdi)
-.Ld8:  movq %r10,7*8(%rdi)
-
-       decq %rdx
-
-       leaq 64(%rsi),%rsi
-       leaq 64(%rdi),%rdi
-
-       jns  .Lloop
-
-       .p2align 4
-.Lhandle_tail:
-       movl %ecx,%edx
-       andl $63,%ecx
-       shrl $3,%ecx
-       jz   .Lhandle_7
-       movl $8,%ebx
-       .p2align 4
-.Lloop_8:
-.Ls9:  movq (%rsi),%r8
-.Ld9:  movq %r8,(%rdi)
-       decl %ecx
-       leaq 8(%rdi),%rdi
-       leaq 8(%rsi),%rsi
-       jnz .Lloop_8
-
-.Lhandle_7:
-       movl %edx,%ecx
-       andl $7,%ecx
-       jz   .Lende
-       .p2align 4
-.Lloop_1:
-.Ls10: movb (%rsi),%bl
-.Ld10: movb %bl,(%rdi)
-       incq %rdi
-       incq %rsi
-       decl %ecx
-       jnz .Lloop_1
-
-       CFI_REMEMBER_STATE
-.Lende:
-       popq %rcx
-       CFI_ADJUST_CFA_OFFSET -8
-       CFI_RESTORE rcx
-       popq %rbx
-       CFI_ADJUST_CFA_OFFSET -8
-       CFI_RESTORE rbx
-       ret
-       CFI_RESTORE_STATE
-
-#ifdef FIX_ALIGNMENT
-       /* align destination */
-       .p2align 4
-.Lbad_alignment:
-       movl $8,%r9d
-       subl %ecx,%r9d
-       movl %r9d,%ecx
-       cmpq %r9,%rdx
-       jz   .Lhandle_7
-       js   .Lhandle_7
-.Lalign_1:
-.Ls11: movb (%rsi),%bl
-.Ld11: movb %bl,(%rdi)
-       incq %rsi
-       incq %rdi
-       decl %ecx
-       jnz .Lalign_1
-       subq %r9,%rdx
-       jmp .Lafter_bad_alignment
-#endif
-
-       /* table sorted by exception address */
-       .section __ex_table,"a"
-       .align 8
-       .quad .Ls1,.Ls1e
-       .quad .Ls2,.Ls2e
-       .quad .Ls3,.Ls3e
-       .quad .Ls4,.Ls4e
-       .quad .Ld1,.Ls1e
-       .quad .Ld2,.Ls2e
-       .quad .Ld3,.Ls3e
-       .quad .Ld4,.Ls4e
-       .quad .Ls5,.Ls5e
-       .quad .Ls6,.Ls6e
-       .quad .Ls7,.Ls7e
-       .quad .Ls8,.Ls8e
-       .quad .Ld5,.Ls5e
-       .quad .Ld6,.Ls6e
-       .quad .Ld7,.Ls7e
-       .quad .Ld8,.Ls8e
-       .quad .Ls9,.Le_quad
-       .quad .Ld9,.Le_quad
-       .quad .Ls10,.Le_byte
-       .quad .Ld10,.Le_byte
-#ifdef FIX_ALIGNMENT
-       .quad .Ls11,.Lzero_rest
-       .quad .Ld11,.Lzero_rest
-#endif
-       .quad .Le5,.Le_zero
-       .previous
-
-       /* compute 64-offset for main loop. 8 bytes accuracy with error on the
-          pessimistic side. this is gross. it would be better to fix the
-          interface. */
-       /* eax: zero, ebx: 64 */
-.Ls1e:         addl $8,%eax
-.Ls2e:         addl $8,%eax
-.Ls3e:         addl $8,%eax
-.Ls4e:         addl $8,%eax
-.Ls5e:         addl $8,%eax
-.Ls6e:         addl $8,%eax
-.Ls7e:         addl $8,%eax
-.Ls8e:         addl $8,%eax
-       addq %rbx,%rdi  /* +64 */
-       subq %rax,%rdi  /* correct destination with computed offset */
-
-       shlq $6,%rdx    /* loop counter * 64 (stride length) */
-       addq %rax,%rdx  /* add offset to loopcnt */
-       andl $63,%ecx   /* remaining bytes */
-       addq %rcx,%rdx  /* add them */
-       jmp .Lzero_rest
-
-       /* exception on quad word loop in tail handling */
-       /* ecx: loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
-       shll $3,%ecx
-       andl $7,%edx
-       addl %ecx,%edx
-       /* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
-       cmpl $0,(%rsp)
-       jz   .Le_zero
-       movq %rdx,%rcx
-.Le_byte:
-       xorl %eax,%eax
-.Le5:  rep
-       stosb
-       /* when there is another exception while zeroing the rest just return */
-.Le_zero:
-       movq %rdx,%rax
-       jmp .Lende
-       CFI_ENDPROC
-ENDPROC(copy_user_generic)
-
-
-       /* Some CPUs run faster using the string copy instructions.
-          This is also a lot simpler. Use them when possible.
-          Patch in jmps to this code instead of copying it fully
-          to avoid unwanted aliasing in the exception tables. */
-
- /* rdi        destination
-  * rsi source
-  * rdx count
-  * ecx zero flag
-  *
-  * Output:
-  * eax uncopied bytes or 0 if successfull.
-  *
-  * Only 4GB of copy is supported. This shouldn't be a problem
-  * because the kernel normally only writes from/to page sized chunks
-  * even if user space passed a longer buffer.
-  * And more would be dangerous because both Intel and AMD have
-  * errata with rep movsq > 4GB. If someone feels the need to fix
-  * this please consider this.
-  */
-ENTRY(copy_user_generic_string)
-       CFI_STARTPROC
-       movl %ecx,%r8d          /* save zero flag */
-       movl %edx,%ecx
-       shrl $3,%ecx
-       andl $7,%edx    
-       jz   10f
-1:     rep 
-       movsq 
-       movl %edx,%ecx
-2:     rep
-       movsb
-9:     movl %ecx,%eax
-       ret
-
-       /* multiple of 8 byte */
-10:    rep
-       movsq
-       xor %eax,%eax
-       ret
-
-       /* exception handling */
-3:      lea (%rdx,%rcx,8),%rax /* exception on quad loop */
-       jmp 6f
-5:     movl %ecx,%eax          /* exception on byte loop */
-       /* eax: left over bytes */
-6:     testl %r8d,%r8d         /* zero flag set? */
-       jz 7f
-       movl %eax,%ecx          /* initialize x86 loop counter */
-       push %rax
-       xorl %eax,%eax
-8:     rep
-       stosb                   /* zero the rest */
-11:    pop %rax
-7:     ret
-       CFI_ENDPROC
-END(copy_user_generic_c)
-
-       .section __ex_table,"a"
-       .quad 1b,3b
-       .quad 2b,5b
-       .quad 8b,11b
-       .quad 10b,3b
-       .previous
diff --git a/arch/x86_64/lib/copy_user_nocache_64.S b/arch/x86_64/lib/copy_user_nocache_64.S

deleted file mode 100644 (file)

index 4620efb..0000000
--- a/arch/x86_64/lib/copy_user_nocache_64.S
+++ /dev/null
@@ -1,217 +0,0 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License v2.
- *
- * Functions to copy from and to user space.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/cpufeature.h>
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination/source out of cache for more performance.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- * rcx zero flag       when 1 zero on exception
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-ENTRY(__copy_user_nocache)
-       CFI_STARTPROC
-       pushq %rbx
-       CFI_ADJUST_CFA_OFFSET 8
-       CFI_REL_OFFSET rbx, 0
-       pushq %rcx              /* save zero flag */
-       CFI_ADJUST_CFA_OFFSET 8
-       CFI_REL_OFFSET rcx, 0
-
-       xorl %eax,%eax          /* zero for the exception handler */
-
-#ifdef FIX_ALIGNMENT
-       /* check for bad alignment of destination */
-       movl %edi,%ecx
-       andl $7,%ecx
-       jnz  .Lbad_alignment
-.Lafter_bad_alignment:
-#endif
-
-       movq %rdx,%rcx
-
-       movl $64,%ebx
-       shrq $6,%rdx
-       decq %rdx
-       js   .Lhandle_tail
-
-       .p2align 4
-.Lloop:
-.Ls1:  movq (%rsi),%r11
-.Ls2:  movq 1*8(%rsi),%r8
-.Ls3:  movq 2*8(%rsi),%r9
-.Ls4:  movq 3*8(%rsi),%r10
-.Ld1:  movnti %r11,(%rdi)
-.Ld2:  movnti %r8,1*8(%rdi)
-.Ld3:  movnti %r9,2*8(%rdi)
-.Ld4:  movnti %r10,3*8(%rdi)
-
-.Ls5:  movq 4*8(%rsi),%r11
-.Ls6:  movq 5*8(%rsi),%r8
-.Ls7:  movq 6*8(%rsi),%r9
-.Ls8:  movq 7*8(%rsi),%r10
-.Ld5:  movnti %r11,4*8(%rdi)
-.Ld6:  movnti %r8,5*8(%rdi)
-.Ld7:  movnti %r9,6*8(%rdi)
-.Ld8:  movnti %r10,7*8(%rdi)
-
-       dec  %rdx
-
-       leaq 64(%rsi),%rsi
-       leaq 64(%rdi),%rdi
-
-       jns  .Lloop
-
-       .p2align 4
-.Lhandle_tail:
-       movl %ecx,%edx
-       andl $63,%ecx
-       shrl $3,%ecx
-       jz   .Lhandle_7
-       movl $8,%ebx
-       .p2align 4
-.Lloop_8:
-.Ls9:  movq (%rsi),%r8
-.Ld9:  movnti %r8,(%rdi)
-       decl %ecx
-       leaq 8(%rdi),%rdi
-       leaq 8(%rsi),%rsi
-       jnz .Lloop_8
-
-.Lhandle_7:
-       movl %edx,%ecx
-       andl $7,%ecx
-       jz   .Lende
-       .p2align 4
-.Lloop_1:
-.Ls10: movb (%rsi),%bl
-.Ld10: movb %bl,(%rdi)
-       incq %rdi
-       incq %rsi
-       decl %ecx
-       jnz .Lloop_1
-
-       CFI_REMEMBER_STATE
-.Lende:
-       popq %rcx
-       CFI_ADJUST_CFA_OFFSET -8
-       CFI_RESTORE %rcx
-       popq %rbx
-       CFI_ADJUST_CFA_OFFSET -8
-       CFI_RESTORE rbx
-       ret
-       CFI_RESTORE_STATE
-
-#ifdef FIX_ALIGNMENT
-       /* align destination */
-       .p2align 4
-.Lbad_alignment:
-       movl $8,%r9d
-       subl %ecx,%r9d
-       movl %r9d,%ecx
-       cmpq %r9,%rdx
-       jz   .Lhandle_7
-       js   .Lhandle_7
-.Lalign_1:
-.Ls11: movb (%rsi),%bl
-.Ld11: movb %bl,(%rdi)
-       incq %rsi
-       incq %rdi
-       decl %ecx
-       jnz .Lalign_1
-       subq %r9,%rdx
-       jmp .Lafter_bad_alignment
-#endif
-
-       /* table sorted by exception address */
-       .section __ex_table,"a"
-       .align 8
-       .quad .Ls1,.Ls1e
-       .quad .Ls2,.Ls2e
-       .quad .Ls3,.Ls3e
-       .quad .Ls4,.Ls4e
-       .quad .Ld1,.Ls1e
-       .quad .Ld2,.Ls2e
-       .quad .Ld3,.Ls3e
-       .quad .Ld4,.Ls4e
-       .quad .Ls5,.Ls5e
-       .quad .Ls6,.Ls6e
-       .quad .Ls7,.Ls7e
-       .quad .Ls8,.Ls8e
-       .quad .Ld5,.Ls5e
-       .quad .Ld6,.Ls6e
-       .quad .Ld7,.Ls7e
-       .quad .Ld8,.Ls8e
-       .quad .Ls9,.Le_quad
-       .quad .Ld9,.Le_quad
-       .quad .Ls10,.Le_byte
-       .quad .Ld10,.Le_byte
-#ifdef FIX_ALIGNMENT
-       .quad .Ls11,.Lzero_rest
-       .quad .Ld11,.Lzero_rest
-#endif
-       .quad .Le5,.Le_zero
-       .previous
-
-       /* compute 64-offset for main loop. 8 bytes accuracy with error on the
-          pessimistic side. this is gross. it would be better to fix the
-          interface. */
-       /* eax: zero, ebx: 64 */
-.Ls1e:         addl $8,%eax
-.Ls2e:         addl $8,%eax
-.Ls3e:         addl $8,%eax
-.Ls4e:         addl $8,%eax
-.Ls5e:         addl $8,%eax
-.Ls6e:         addl $8,%eax
-.Ls7e:         addl $8,%eax
-.Ls8e:         addl $8,%eax
-       addq %rbx,%rdi  /* +64 */
-       subq %rax,%rdi  /* correct destination with computed offset */
-
-       shlq $6,%rdx    /* loop counter * 64 (stride length) */
-       addq %rax,%rdx  /* add offset to loopcnt */
-       andl $63,%ecx   /* remaining bytes */
-       addq %rcx,%rdx  /* add them */
-       jmp .Lzero_rest
-
-       /* exception on quad word loop in tail handling */
-       /* ecx: loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
-       shll $3,%ecx
-       andl $7,%edx
-       addl %ecx,%edx
-       /* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
-       cmpl $0,(%rsp)  /* zero flag set? */
-       jz   .Le_zero
-       movq %rdx,%rcx
-.Le_byte:
-       xorl %eax,%eax
-.Le5:  rep
-       stosb
-       /* when there is another exception while zeroing the rest just return */
-.Le_zero:
-       movq %rdx,%rax
-       jmp .Lende
-       CFI_ENDPROC
-ENDPROC(__copy_user_nocache)
-
-
diff --git a/arch/x86_64/lib/csum-copy_64.S b/arch/x86_64/lib/csum-copy_64.S

deleted file mode 100644 (file)

index f0dba36..0000000
--- a/arch/x86_64/lib/csum-copy_64.S
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- *     
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive
- * for more details. No warranty for anything given at all.
- */
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/errno.h>
-
-/*
- * Checksum copy with exception handling.
- * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 
- * destination is zeroed.
- * 
- * Input
- * rdi  source
- * rsi  destination
- * edx  len (32bit)
- * ecx  sum (32bit) 
- * r8   src_err_ptr (int)
- * r9   dst_err_ptr (int)
- *
- * Output
- * eax  64bit sum. undefined in case of exception.
- * 
- * Wrappers need to take care of valid exception sum and zeroing.               
- * They also should align source or destination to 8 bytes.
- */
-
-       .macro source
-10:
-       .section __ex_table,"a"
-       .align 8
-       .quad 10b,.Lbad_source
-       .previous
-       .endm
-               
-       .macro dest
-20:
-       .section __ex_table,"a"
-       .align 8
-       .quad 20b,.Lbad_dest
-       .previous
-       .endm
-                       
-       .macro ignore L=.Lignore
-30:
-       .section __ex_table,"a"
-       .align 8
-       .quad 30b,\L
-       .previous
-       .endm
-       
-                               
-ENTRY(csum_partial_copy_generic)
-       CFI_STARTPROC
-       cmpl     $3*64,%edx
-       jle      .Lignore
-
-.Lignore:              
-       subq  $7*8,%rsp
-       CFI_ADJUST_CFA_OFFSET 7*8
-       movq  %rbx,2*8(%rsp)
-       CFI_REL_OFFSET rbx, 2*8
-       movq  %r12,3*8(%rsp)
-       CFI_REL_OFFSET r12, 3*8
-       movq  %r14,4*8(%rsp)
-       CFI_REL_OFFSET r14, 4*8
-       movq  %r13,5*8(%rsp)
-       CFI_REL_OFFSET r13, 5*8
-       movq  %rbp,6*8(%rsp)
-       CFI_REL_OFFSET rbp, 6*8
-
-       movq  %r8,(%rsp)
-       movq  %r9,1*8(%rsp)
-       
-       movl  %ecx,%eax
-       movl  %edx,%ecx
-
-       xorl  %r9d,%r9d
-       movq  %rcx,%r12
-
-       shrq  $6,%r12
-       jz    .Lhandle_tail       /* < 64 */
-
-       clc
-       
-       /* main loop. clear in 64 byte blocks */
-       /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
-       /* r11: temp3, rdx: temp4, r12 loopcnt */
-       /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
-       .p2align 4
-.Lloop:
-       source
-       movq  (%rdi),%rbx
-       source
-       movq  8(%rdi),%r8
-       source
-       movq  16(%rdi),%r11
-       source
-       movq  24(%rdi),%rdx
-
-       source
-       movq  32(%rdi),%r10
-       source
-       movq  40(%rdi),%rbp
-       source
-       movq  48(%rdi),%r14
-       source
-       movq  56(%rdi),%r13
-               
-       ignore 2f
-       prefetcht0 5*64(%rdi)
-2:                                                     
-       adcq  %rbx,%rax
-       adcq  %r8,%rax
-       adcq  %r11,%rax
-       adcq  %rdx,%rax
-       adcq  %r10,%rax
-       adcq  %rbp,%rax
-       adcq  %r14,%rax
-       adcq  %r13,%rax
-
-       decl %r12d
-       
-       dest
-       movq %rbx,(%rsi)
-       dest
-       movq %r8,8(%rsi)
-       dest
-       movq %r11,16(%rsi)
-       dest
-       movq %rdx,24(%rsi)
-
-       dest
-       movq %r10,32(%rsi)
-       dest
-       movq %rbp,40(%rsi)
-       dest
-       movq %r14,48(%rsi)
-       dest
-       movq %r13,56(%rsi)
-       
-3:
-       
-       leaq 64(%rdi),%rdi
-       leaq 64(%rsi),%rsi
-
-       jnz   .Lloop
-
-       adcq  %r9,%rax
-
-       /* do last upto 56 bytes */
-.Lhandle_tail:
-       /* ecx: count */
-       movl %ecx,%r10d
-       andl $63,%ecx
-       shrl $3,%ecx
-       jz       .Lfold
-       clc
-       .p2align 4
-.Lloop_8:      
-       source
-       movq (%rdi),%rbx
-       adcq %rbx,%rax
-       decl %ecx
-       dest
-       movq %rbx,(%rsi)
-       leaq 8(%rsi),%rsi /* preserve carry */
-       leaq 8(%rdi),%rdi
-       jnz     .Lloop_8
-       adcq %r9,%rax   /* add in carry */
-
-.Lfold:
-       /* reduce checksum to 32bits */
-       movl %eax,%ebx
-       shrq $32,%rax
-       addl %ebx,%eax
-       adcl %r9d,%eax
-
-       /* do last upto 6 bytes */      
-.Lhandle_7:
-       movl %r10d,%ecx
-       andl $7,%ecx
-       shrl $1,%ecx
-       jz   .Lhandle_1
-       movl $2,%edx
-       xorl %ebx,%ebx
-       clc  
-       .p2align 4
-.Lloop_1:      
-       source
-       movw (%rdi),%bx
-       adcl %ebx,%eax
-       decl %ecx
-       dest
-       movw %bx,(%rsi)
-       leaq 2(%rdi),%rdi
-       leaq 2(%rsi),%rsi
-       jnz .Lloop_1
-       adcl %r9d,%eax  /* add in carry */
-       
-       /* handle last odd byte */
-.Lhandle_1:
-       testl $1,%r10d
-       jz    .Lende
-       xorl  %ebx,%ebx
-       source
-       movb (%rdi),%bl
-       dest
-       movb %bl,(%rsi)
-       addl %ebx,%eax
-       adcl %r9d,%eax          /* carry */
-                       
-       CFI_REMEMBER_STATE
-.Lende:
-       movq 2*8(%rsp),%rbx
-       CFI_RESTORE rbx
-       movq 3*8(%rsp),%r12
-       CFI_RESTORE r12
-       movq 4*8(%rsp),%r14
-       CFI_RESTORE r14
-       movq 5*8(%rsp),%r13
-       CFI_RESTORE r13
-       movq 6*8(%rsp),%rbp
-       CFI_RESTORE rbp
-       addq $7*8,%rsp
-       CFI_ADJUST_CFA_OFFSET -7*8
-       ret
-       CFI_RESTORE_STATE
-
-       /* Exception handlers. Very simple, zeroing is done in the wrappers */
-.Lbad_source:
-       movq (%rsp),%rax
-       testq %rax,%rax
-       jz   .Lende
-       movl $-EFAULT,(%rax)
-       jmp  .Lende
-       
-.Lbad_dest:
-       movq 8(%rsp),%rax
-       testq %rax,%rax
-       jz   .Lende     
-       movl $-EFAULT,(%rax)
-       jmp .Lende
-       CFI_ENDPROC
-ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86_64/lib/csum-partial_64.c b/arch/x86_64/lib/csum-partial_64.c

deleted file mode 100644 (file)

index bc503f5..0000000
--- a/arch/x86_64/lib/csum-partial_64.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * arch/x86_64/lib/csum-partial.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed.
- */
- 
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <asm/checksum.h>
-
-static inline unsigned short from32to16(unsigned a) 
-{
-       unsigned short b = a >> 16; 
-       asm("addw %w2,%w0\n\t"
-           "adcw $0,%w0\n" 
-           : "=r" (b)
-           : "0" (b), "r" (a));
-       return b;
-}
-
-/*
- * Do a 64-bit checksum on an arbitrary memory area.
- * Returns a 32bit checksum.
- *
- * This isn't as time critical as it used to be because many NICs
- * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
- */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
-{
-       unsigned odd, count;
-       unsigned long result = 0;
-
-       if (unlikely(len == 0))
-               return result; 
-       odd = 1 & (unsigned long) buff;
-       if (unlikely(odd)) {
-               result = *buff << 8;
-               len--;
-               buff++;
-       }
-       count = len >> 1;               /* nr of 16-bit words.. */
-       if (count) {
-               if (2 & (unsigned long) buff) {
-                       result += *(unsigned short *)buff;
-                       count--;
-                       len -= 2;
-                       buff += 2;
-               }
-               count >>= 1;            /* nr of 32-bit words.. */
-               if (count) {
-                       unsigned long zero;
-                       unsigned count64;
-                       if (4 & (unsigned long) buff) {
-                               result += *(unsigned int *) buff;
-                               count--;
-                               len -= 4;
-                               buff += 4;
-                       }
-                       count >>= 1;    /* nr of 64-bit words.. */
-
-                       /* main loop using 64byte blocks */
-                       zero = 0;
-                       count64 = count >> 3;
-                       while (count64) { 
-                               asm("addq 0*8(%[src]),%[res]\n\t"
-                                   "adcq 1*8(%[src]),%[res]\n\t"
-                                   "adcq 2*8(%[src]),%[res]\n\t"
-                                   "adcq 3*8(%[src]),%[res]\n\t"
-                                   "adcq 4*8(%[src]),%[res]\n\t"
-                                   "adcq 5*8(%[src]),%[res]\n\t"
-                                   "adcq 6*8(%[src]),%[res]\n\t"
-                                   "adcq 7*8(%[src]),%[res]\n\t"
-                                   "adcq %[zero],%[res]"
-                                   : [res] "=r" (result)
-                                   : [src] "r" (buff), [zero] "r" (zero),
-                                   "[res]" (result));
-                               buff += 64;
-                               count64--;
-                       }
-
-                       /* last upto 7 8byte blocks */
-                       count %= 8; 
-                       while (count) { 
-                               asm("addq %1,%0\n\t"
-                                   "adcq %2,%0\n" 
-                                           : "=r" (result)
-                                   : "m" (*(unsigned long *)buff), 
-                                   "r" (zero),  "0" (result));
-                               --count; 
-                                       buff += 8;
-                       }
-                       result = add32_with_carry(result>>32,
-                                                 result&0xffffffff); 
-
-                       if (len & 4) {
-                               result += *(unsigned int *) buff;
-                               buff += 4;
-                       }
-               }
-               if (len & 2) {
-                       result += *(unsigned short *) buff;
-                       buff += 2;
-               }
-       }
-       if (len & 1)
-               result += *buff;
-       result = add32_with_carry(result>>32, result & 0xffffffff); 
-       if (unlikely(odd)) { 
-               result = from32to16(result);
-               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-       }
-       return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-       return (__force __wsum)add32_with_carry(do_csum(buff, len),
-                                               (__force u32)sum);
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-__sum16 ip_compute_csum(const void *buff, int len)
-{
-       return csum_fold(csum_partial(buff,len,0));
-}
-EXPORT_SYMBOL(ip_compute_csum);
-
diff --git a/arch/x86_64/lib/csum-wrappers_64.c b/arch/x86_64/lib/csum-wrappers_64.c

deleted file mode 100644 (file)

index fd42a4a..0000000
--- a/arch/x86_64/lib/csum-wrappers_64.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License v.2
- * 
- * Wrappers of assembly checksum functions for x86-64.
- */
-
-#include <asm/checksum.h>
-#include <linux/module.h>
-
-/** 
- * csum_partial_copy_from_user - Copy and checksum from user space. 
- * @src: source address (user space) 
- * @dst: destination address
- * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad source address.
- * 
- * Returns an 32bit unfolded checksum of the buffer.
- * src and dst are best aligned to 64bits. 
- */ 
-__wsum
-csum_partial_copy_from_user(const void __user *src, void *dst,
-                           int len, __wsum isum, int *errp)
-{ 
-       might_sleep();
-       *errp = 0;
-       if (likely(access_ok(VERIFY_READ,src, len))) { 
-               /* Why 6, not 7? To handle odd addresses aligned we
-                  would need to do considerable complications to fix the
-                  checksum which is defined as an 16bit accumulator. The
-                  fix alignment code is primarily for performance
-                  compatibility with 32bit and that will handle odd
-                  addresses slowly too. */
-               if (unlikely((unsigned long)src & 6)) {                 
-                       while (((unsigned long)src & 6) && len >= 2) { 
-                               __u16 val16;                    
-                               *errp = __get_user(val16, (const __u16 __user *)src);
-                               if (*errp)
-                                       return isum;
-                               *(__u16 *)dst = val16;
-                               isum = (__force __wsum)add32_with_carry(
-                                               (__force unsigned)isum, val16);
-                               src += 2; 
-                               dst += 2; 
-                               len -= 2;
-                       }
-               }
-               isum = csum_partial_copy_generic((__force const void *)src,
-                                       dst, len, isum, errp, NULL);
-               if (likely(*errp == 0)) 
-                       return isum;
-       } 
-       *errp = -EFAULT;
-       memset(dst,0,len); 
-       return isum;            
-} 
-
-EXPORT_SYMBOL(csum_partial_copy_from_user);
-
-/** 
- * csum_partial_copy_to_user - Copy and checksum to user space. 
- * @src: source address
- * @dst: destination address (user space)
- * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad destination address.
- * 
- * Returns an 32bit unfolded checksum of the buffer.
- * src and dst are best aligned to 64bits.
- */ 
-__wsum
-csum_partial_copy_to_user(const void *src, void __user *dst,
-                         int len, __wsum isum, int *errp)
-{ 
-       might_sleep();
-       if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
-               *errp = -EFAULT;
-               return 0; 
-       }
-
-       if (unlikely((unsigned long)dst & 6)) {
-               while (((unsigned long)dst & 6) && len >= 2) { 
-                       __u16 val16 = *(__u16 *)src;
-                       isum = (__force __wsum)add32_with_carry(
-                                       (__force unsigned)isum, val16);
-                       *errp = __put_user(val16, (__u16 __user *)dst);
-                       if (*errp)
-                               return isum;
-                       src += 2; 
-                       dst += 2; 
-                       len -= 2;
-               }
-       }
-
-       *errp = 0;
-       return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); 
-} 
-
-EXPORT_SYMBOL(csum_partial_copy_to_user);
-
-/** 
- * csum_partial_copy_nocheck - Copy and checksum.
- * @src: source address
- * @dst: destination address
- * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * 
- * Returns an 32bit unfolded checksum of the buffer.
- */ 
-__wsum
-csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
-{ 
-       return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);
-} 
-EXPORT_SYMBOL(csum_partial_copy_nocheck);
-
-__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-                       const struct in6_addr *daddr,
-                       __u32 len, unsigned short proto, __wsum sum)
-{
-       __u64 rest, sum64;
-     
-       rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
-               (__force __u64)sum;
-       asm("  addq (%[saddr]),%[sum]\n"
-           "  adcq 8(%[saddr]),%[sum]\n"
-           "  adcq (%[daddr]),%[sum]\n" 
-           "  adcq 8(%[daddr]),%[sum]\n"
-           "  adcq $0,%[sum]\n"
-           : [sum] "=r" (sum64) 
-           : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
-       return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
-}
-
-EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/x86_64/lib/delay_64.c b/arch/x86_64/lib/delay_64.c

deleted file mode 100644 (file)

index 2dbebd3..0000000
--- a/arch/x86_64/lib/delay_64.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- *     Precise Delay Loops for x86-64
- *
- *     Copyright (C) 1993 Linus Torvalds
- *     Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
- *
- *     The __delay function must _NOT_ be inlined as its execution time
- *     depends wildly on alignment on many x86 processors. 
- */
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/delay.h>
-#include <asm/delay.h>
-#include <asm/msr.h>
-
-#ifdef CONFIG_SMP
-#include <asm/smp.h>
-#endif
-
-int read_current_timer(unsigned long *timer_value)
-{
-       rdtscll(*timer_value);
-       return 0;
-}
-
-void __delay(unsigned long loops)
-{
-       unsigned bclock, now;
-       
-       rdtscl(bclock);
-       do
-       {
-               rep_nop(); 
-               rdtscl(now);
-       }
-       while((now-bclock) < loops);
-}
-EXPORT_SYMBOL(__delay);
-
-inline void __const_udelay(unsigned long xloops)
-{
-       __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1);
-}
-EXPORT_SYMBOL(__const_udelay);
-
-void __udelay(unsigned long usecs)
-{
-       __const_udelay(usecs * 0x000010c7);  /* 2**32 / 1000000 (rounded up) */
-}
-EXPORT_SYMBOL(__udelay);
-
-void __ndelay(unsigned long nsecs)
-{
-       __const_udelay(nsecs * 0x00005);  /* 2**32 / 1000000000 (rounded up) */
-}
-EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86_64/lib/getuser_64.S b/arch/x86_64/lib/getuser_64.S

deleted file mode 100644 (file)

index 5448876..0000000
--- a/arch/x86_64/lib/getuser_64.S
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * __get_user functions.
- *
- * (C) Copyright 1998 Linus Torvalds
- * (C) Copyright 2005 Andi Kleen
- *
- * These functions have a non-standard call interface
- * to make them more efficient, especially as they
- * return an error value in addition to the "real"
- * return value.
- */
-
-/*
- * __get_user_X
- *
- * Inputs:     %rcx contains the address.
- *             The register is modified, but all changes are undone
- *             before returning because the C code doesn't know about it.
- *
- * Outputs:    %rax is error code (0 or -EFAULT)
- *             %rdx contains zero-extended value
- * 
- * %r8 is destroyed.
- *
- * These functions should not modify any other registers,
- * as they get called from within inline assembly.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/page.h>
-#include <asm/errno.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-
-       .text
-ENTRY(__get_user_1)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%r8)
-       cmpq threadinfo_addr_limit(%r8),%rcx
-       jae bad_get_user
-1:     movzb (%rcx),%edx
-       xorl %eax,%eax
-       ret
-       CFI_ENDPROC
-ENDPROC(__get_user_1)
-
-ENTRY(__get_user_2)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%r8)
-       addq $1,%rcx
-       jc 20f
-       cmpq threadinfo_addr_limit(%r8),%rcx
-       jae 20f
-       decq   %rcx
-2:     movzwl (%rcx),%edx
-       xorl %eax,%eax
-       ret
-20:    decq    %rcx
-       jmp     bad_get_user
-       CFI_ENDPROC
-ENDPROC(__get_user_2)
-
-ENTRY(__get_user_4)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%r8)
-       addq $3,%rcx
-       jc 30f
-       cmpq threadinfo_addr_limit(%r8),%rcx
-       jae 30f
-       subq $3,%rcx
-3:     movl (%rcx),%edx
-       xorl %eax,%eax
-       ret
-30:    subq $3,%rcx
-       jmp bad_get_user
-       CFI_ENDPROC
-ENDPROC(__get_user_4)
-
-ENTRY(__get_user_8)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%r8)
-       addq $7,%rcx
-       jc 40f
-       cmpq threadinfo_addr_limit(%r8),%rcx
-       jae     40f
-       subq    $7,%rcx
-4:     movq (%rcx),%rdx
-       xorl %eax,%eax
-       ret
-40:    subq $7,%rcx
-       jmp bad_get_user
-       CFI_ENDPROC
-ENDPROC(__get_user_8)
-
-bad_get_user:
-       CFI_STARTPROC
-       xorl %edx,%edx
-       movq $(-EFAULT),%rax
-       ret
-       CFI_ENDPROC
-END(bad_get_user)
-
-.section __ex_table,"a"
-       .quad 1b,bad_get_user
-       .quad 2b,bad_get_user
-       .quad 3b,bad_get_user
-       .quad 4b,bad_get_user
-.previous
diff --git a/arch/x86_64/lib/io_64.c b/arch/x86_64/lib/io_64.c

deleted file mode 100644 (file)

index 87b4a4e..0000000
--- a/arch/x86_64/lib/io_64.c
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <linux/string.h>
-#include <asm/io.h>
-#include <linux/module.h>
-
-void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
-{
-       __inline_memcpy((void *) dst,src,len);
-}
-EXPORT_SYMBOL(__memcpy_toio);
-
-void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
-{
-       __inline_memcpy(dst,(const void *) src,len);
-}
-EXPORT_SYMBOL(__memcpy_fromio);
-
-void memset_io(volatile void __iomem *a, int b, size_t c)
-{
-       /* XXX: memset can mangle the IO patterns quite a bit.
-          perhaps it would be better to use a dumb one */
-       memset((void *)a,b,c);
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86_64/lib/iomap_copy_64.S b/arch/x86_64/lib/iomap_copy_64.S

deleted file mode 100644 (file)

index 05a95e7..0000000
--- a/arch/x86_64/lib/iomap_copy_64.S
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright 2006 PathScale, Inc.  All Rights Reserved.
- *
- * This file is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-/*
- * override generic version in lib/iomap_copy.c
- */
-ENTRY(__iowrite32_copy)
-       CFI_STARTPROC
-       movl %edx,%ecx
-       rep movsd
-       ret
-       CFI_ENDPROC
-ENDPROC(__iowrite32_copy)
diff --git a/arch/x86_64/lib/memcpy_64.S b/arch/x86_64/lib/memcpy_64.S

deleted file mode 100644 (file)

index c22981f..0000000
--- a/arch/x86_64/lib/memcpy_64.S
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright 2002 Andi Kleen */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/cpufeature.h>
-
-/*
- * memcpy - Copy a memory block.
- *
- * Input:      
- * rdi destination
- * rsi source
- * rdx count
- * 
- * Output:
- * rax original destination
- */    
-
-       ALIGN
-memcpy_c:
-       CFI_STARTPROC
-       movq %rdi,%rax
-       movl %edx,%ecx
-       shrl $3,%ecx
-       andl $7,%edx
-       rep movsq
-       movl %edx,%ecx
-       rep movsb
-       ret
-       CFI_ENDPROC
-ENDPROC(memcpy_c)
-
-ENTRY(__memcpy)
-ENTRY(memcpy)
-       CFI_STARTPROC
-       pushq %rbx
-       CFI_ADJUST_CFA_OFFSET 8
-       CFI_REL_OFFSET rbx, 0
-       movq %rdi,%rax
-
-       movl %edx,%ecx
-       shrl $6,%ecx
-       jz .Lhandle_tail
-
-       .p2align 4
-.Lloop_64:
-       decl %ecx
-
-       movq (%rsi),%r11
-       movq 8(%rsi),%r8
-
-       movq %r11,(%rdi)
-       movq %r8,1*8(%rdi)
-
-       movq 2*8(%rsi),%r9
-       movq 3*8(%rsi),%r10
-
-       movq %r9,2*8(%rdi)
-       movq %r10,3*8(%rdi)
-
-       movq 4*8(%rsi),%r11
-       movq 5*8(%rsi),%r8
-
-       movq %r11,4*8(%rdi)
-       movq %r8,5*8(%rdi)
-
-       movq 6*8(%rsi),%r9
-       movq 7*8(%rsi),%r10
-
-       movq %r9,6*8(%rdi)
-       movq %r10,7*8(%rdi)
-
-       leaq 64(%rsi),%rsi
-       leaq 64(%rdi),%rdi
-       jnz  .Lloop_64
-
-.Lhandle_tail:
-       movl %edx,%ecx
-       andl $63,%ecx
-       shrl $3,%ecx
-       jz   .Lhandle_7
-       .p2align 4
-.Lloop_8:
-       decl %ecx
-       movq (%rsi),%r8
-       movq %r8,(%rdi)
-       leaq 8(%rdi),%rdi
-       leaq 8(%rsi),%rsi
-       jnz  .Lloop_8
-
-.Lhandle_7:
-       movl %edx,%ecx
-       andl $7,%ecx
-       jz .Lende
-       .p2align 4
-.Lloop_1:
-       movb (%rsi),%r8b
-       movb %r8b,(%rdi)
-       incq %rdi
-       incq %rsi
-       decl %ecx
-       jnz .Lloop_1
-
-.Lende:
-       popq %rbx
-       CFI_ADJUST_CFA_OFFSET -8
-       CFI_RESTORE rbx
-       ret
-.Lfinal:
-       CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
-       /* Some CPUs run faster using the string copy instructions.
-          It is also a lot simpler. Use this when possible */
-
-       .section .altinstr_replacement,"ax"
-1:     .byte 0xeb                              /* jmp <disp8> */
-       .byte (memcpy_c - memcpy) - (2f - 1b)   /* offset */
-2:
-       .previous
-       .section .altinstructions,"a"
-       .align 8
-       .quad memcpy
-       .quad 1b
-       .byte X86_FEATURE_REP_GOOD
-       /* Replace only beginning, memcpy is used to apply alternatives, so it
-        * is silly to overwrite itself with nops - reboot is only outcome... */
-       .byte 2b - 1b
-       .byte 2b - 1b
-       .previous
diff --git a/arch/x86_64/lib/memmove_64.c b/arch/x86_64/lib/memmove_64.c

deleted file mode 100644 (file)

index 751ebae..0000000
--- a/arch/x86_64/lib/memmove_64.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Normally compiler builtins are used, but sometimes the compiler calls out
-   of line code. Based on asm-i386/string.h.
- */
-#define _STRING_C
-#include <linux/string.h>
-#include <linux/module.h>
-
-#undef memmove
-void *memmove(void * dest,const void *src,size_t count)
-{
-       if (dest < src) { 
-               return memcpy(dest,src,count);
-       } else {
-               char *p = (char *) dest + count;
-               char *s = (char *) src + count;
-               while (count--)
-                       *--p = *--s;
-       }
-       return dest;
-} 
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86_64/lib/memset_64.S b/arch/x86_64/lib/memset_64.S

deleted file mode 100644 (file)

index 2c59481..0000000
--- a/arch/x86_64/lib/memset_64.S
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-/*
- * ISO C memset - set a memory block to a byte value.
- *     
- * rdi   destination
- * rsi   value (char) 
- * rdx   count (bytes) 
- * 
- * rax   original destination
- */    
-       ALIGN
-memset_c:
-       CFI_STARTPROC
-       movq %rdi,%r9
-       movl %edx,%r8d
-       andl $7,%r8d
-       movl %edx,%ecx
-       shrl $3,%ecx
-       /* expand byte value  */
-       movzbl %sil,%esi
-       movabs $0x0101010101010101,%rax
-       mulq %rsi               /* with rax, clobbers rdx */
-       rep stosq
-       movl %r8d,%ecx
-       rep stosb
-       movq %r9,%rax
-       ret
-       CFI_ENDPROC
-ENDPROC(memset_c)
-
-ENTRY(memset)
-ENTRY(__memset)
-       CFI_STARTPROC
-       movq %rdi,%r10
-       movq %rdx,%r11
-
-       /* expand byte value  */
-       movzbl %sil,%ecx
-       movabs $0x0101010101010101,%rax
-       mul    %rcx             /* with rax, clobbers rdx */
-
-       /* align dst */
-       movl  %edi,%r9d
-       andl  $7,%r9d
-       jnz  .Lbad_alignment
-       CFI_REMEMBER_STATE
-.Lafter_bad_alignment:
-
-       movl %r11d,%ecx
-       shrl $6,%ecx
-       jz       .Lhandle_tail
-
-       .p2align 4
-.Lloop_64:
-       decl   %ecx
-       movq  %rax,(%rdi)
-       movq  %rax,8(%rdi)
-       movq  %rax,16(%rdi)
-       movq  %rax,24(%rdi)
-       movq  %rax,32(%rdi)
-       movq  %rax,40(%rdi)
-       movq  %rax,48(%rdi)
-       movq  %rax,56(%rdi)
-       leaq  64(%rdi),%rdi
-       jnz    .Lloop_64
-
-       /* Handle tail in loops. The loops should be faster than hard
-          to predict jump tables. */
-       .p2align 4
-.Lhandle_tail:
-       movl    %r11d,%ecx
-       andl    $63&(~7),%ecx
-       jz              .Lhandle_7
-       shrl    $3,%ecx
-       .p2align 4
-.Lloop_8:
-       decl   %ecx
-       movq  %rax,(%rdi)
-       leaq  8(%rdi),%rdi
-       jnz    .Lloop_8
-
-.Lhandle_7:
-       movl    %r11d,%ecx
-       andl    $7,%ecx
-       jz      .Lende
-       .p2align 4
-.Lloop_1:
-       decl    %ecx
-       movb    %al,(%rdi)
-       leaq    1(%rdi),%rdi
-       jnz     .Lloop_1
-
-.Lende:
-       movq    %r10,%rax
-       ret
-
-       CFI_RESTORE_STATE
-.Lbad_alignment:
-       cmpq $7,%r11
-       jbe     .Lhandle_7
-       movq %rax,(%rdi)        /* unaligned store */
-       movq $8,%r8
-       subq %r9,%r8
-       addq %r8,%rdi
-       subq %r8,%r11
-       jmp .Lafter_bad_alignment
-.Lfinal:
-       CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
-       /* Some CPUs run faster using the string instructions.
-          It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-       .section .altinstr_replacement,"ax"
-1:     .byte 0xeb                              /* jmp <disp8> */
-       .byte (memset_c - memset) - (2f - 1b)   /* offset */
-2:
-       .previous
-       .section .altinstructions,"a"
-       .align 8
-       .quad memset
-       .quad 1b
-       .byte X86_FEATURE_REP_GOOD
-       .byte .Lfinal - memset
-       .byte 2b - 1b
-       .previous
diff --git a/arch/x86_64/lib/msr-on-cpu.c b/arch/x86_64/lib/msr-on-cpu.c

deleted file mode 100644 (file)

index 5672d41..0000000
--- a/arch/x86_64/lib/msr-on-cpu.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../x86/lib/msr-on-cpu.c"
diff --git a/arch/x86_64/lib/putuser_64.S b/arch/x86_64/lib/putuser_64.S

deleted file mode 100644 (file)

index 4989f5a..0000000
--- a/arch/x86_64/lib/putuser_64.S
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * __put_user functions.
- *
- * (C) Copyright 1998 Linus Torvalds
- * (C) Copyright 2005 Andi Kleen
- *
- * These functions have a non-standard call interface
- * to make them more efficient, especially as they
- * return an error value in addition to the "real"
- * return value.
- */
-
-/*
- * __put_user_X
- *
- * Inputs:     %rcx contains the address
- *             %rdx contains new value
- *
- * Outputs:    %rax is error code (0 or -EFAULT)
- *
- * %r8 is destroyed.
- *
- * These functions should not modify any other registers,
- * as they get called from within inline assembly.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/page.h>
-#include <asm/errno.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-
-       .text
-ENTRY(__put_user_1)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%r8)
-       cmpq threadinfo_addr_limit(%r8),%rcx
-       jae bad_put_user
-1:     movb %dl,(%rcx)
-       xorl %eax,%eax
-       ret
-       CFI_ENDPROC
-ENDPROC(__put_user_1)
-
-ENTRY(__put_user_2)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%r8)
-       addq $1,%rcx
-       jc 20f
-       cmpq threadinfo_addr_limit(%r8),%rcx
-       jae 20f
-       decq %rcx
-2:     movw %dx,(%rcx)
-       xorl %eax,%eax
-       ret
-20:    decq %rcx
-       jmp bad_put_user
-       CFI_ENDPROC
-ENDPROC(__put_user_2)
-
-ENTRY(__put_user_4)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%r8)
-       addq $3,%rcx
-       jc 30f
-       cmpq threadinfo_addr_limit(%r8),%rcx
-       jae 30f
-       subq $3,%rcx
-3:     movl %edx,(%rcx)
-       xorl %eax,%eax
-       ret
-30:    subq $3,%rcx
-       jmp bad_put_user
-       CFI_ENDPROC
-ENDPROC(__put_user_4)
-
-ENTRY(__put_user_8)
-       CFI_STARTPROC
-       GET_THREAD_INFO(%r8)
-       addq $7,%rcx
-       jc 40f
-       cmpq threadinfo_addr_limit(%r8),%rcx
-       jae 40f
-       subq $7,%rcx
-4:     movq %rdx,(%rcx)
-       xorl %eax,%eax
-       ret
-40:    subq $7,%rcx
-       jmp bad_put_user
-       CFI_ENDPROC
-ENDPROC(__put_user_8)
-
-bad_put_user:
-       CFI_STARTPROC
-       movq $(-EFAULT),%rax
-       ret
-       CFI_ENDPROC
-END(bad_put_user)
-
-.section __ex_table,"a"
-       .quad 1b,bad_put_user
-       .quad 2b,bad_put_user
-       .quad 3b,bad_put_user
-       .quad 4b,bad_put_user
-.previous
diff --git a/arch/x86_64/lib/rwlock_64.S b/arch/x86_64/lib/rwlock_64.S

deleted file mode 100644 (file)

index 0cde1f8..0000000
--- a/arch/x86_64/lib/rwlock_64.S
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Slow paths of read/write spinlocks. */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.i>
-#include <asm/dwarf2.h>
-
-/* rdi:        pointer to rwlock_t */
-ENTRY(__write_lock_failed)
-       CFI_STARTPROC
-       LOCK_PREFIX
-       addl $RW_LOCK_BIAS,(%rdi)
-1:     rep
-       nop
-       cmpl $RW_LOCK_BIAS,(%rdi)
-       jne 1b
-       LOCK_PREFIX
-       subl $RW_LOCK_BIAS,(%rdi)
-       jnz  __write_lock_failed
-       ret
-       CFI_ENDPROC
-END(__write_lock_failed)
-
-/* rdi:        pointer to rwlock_t */
-ENTRY(__read_lock_failed)
-       CFI_STARTPROC
-       LOCK_PREFIX
-       incl (%rdi)
-1:     rep
-       nop
-       cmpl $1,(%rdi)
-       js 1b
-       LOCK_PREFIX
-       decl (%rdi)
-       js __read_lock_failed
-       ret
-       CFI_ENDPROC
-END(__read_lock_failed)
diff --git a/arch/x86_64/lib/thunk_64.S b/arch/x86_64/lib/thunk_64.S

deleted file mode 100644 (file)

index 55e586d..0000000
--- a/arch/x86_64/lib/thunk_64.S
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Save registers before calling assembly functions. This avoids
- * disturbance of register allocation in some inline assembly constructs.
- * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
- * Subject to the GNU public license, v.2. No warranty of any kind.
- */
-
-       #include <linux/linkage.h>
-       #include <asm/dwarf2.h>
-       #include <asm/calling.h>                        
-       #include <asm/rwlock.h>
-               
-       /* rdi: arg1 ... normal C conventions. rax is saved/restored. */        
-       .macro thunk name,func
-       .globl \name
-\name: 
-       CFI_STARTPROC
-       SAVE_ARGS
-       call \func
-       jmp  restore
-       CFI_ENDPROC
-       .endm
-
-       /* rdi: arg1 ... normal C conventions. rax is passed from C. */         
-       .macro thunk_retrax name,func
-       .globl \name
-\name: 
-       CFI_STARTPROC
-       SAVE_ARGS
-       call \func
-       jmp  restore_norax
-       CFI_ENDPROC
-       .endm
-       
-
-       .section .sched.text
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-       thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
-       thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
-       thunk rwsem_wake_thunk,rwsem_wake
-       thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
-#endif 
-       
-       thunk __down_failed,__down
-       thunk_retrax __down_failed_interruptible,__down_interruptible
-       thunk_retrax __down_failed_trylock,__down_trylock
-       thunk __up_wakeup,__up
-
-#ifdef CONFIG_TRACE_IRQFLAGS
-       thunk trace_hardirqs_on_thunk,trace_hardirqs_on
-       thunk trace_hardirqs_off_thunk,trace_hardirqs_off
-#endif
-       
-       /* SAVE_ARGS below is used only for the .cfi directives it contains. */
-       CFI_STARTPROC
-       SAVE_ARGS
-restore:
-       RESTORE_ARGS
-       ret     
-       CFI_ENDPROC
-       
-       CFI_STARTPROC
-       SAVE_ARGS
-restore_norax: 
-       RESTORE_ARGS 1
-       ret
-       CFI_ENDPROC
diff --git a/arch/x86_64/lib/usercopy_64.c b/arch/x86_64/lib/usercopy_64.c

deleted file mode 100644 (file)

index 893d43f..0000000
--- a/arch/x86_64/lib/usercopy_64.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/* 
- * User address space access functions.
- *
- * Copyright 1997 Andi Kleen <ak@muc.de>
- * Copyright 1997 Linus Torvalds
- * Copyright 2002 Andi Kleen <ak@suse.de>
- */
-#include <linux/module.h>
-#include <asm/uaccess.h>
-
-/*
- * Copy a null terminated string from userspace.
- */
-
-#define __do_strncpy_from_user(dst,src,count,res)                         \
-do {                                                                      \
-       long __d0, __d1, __d2;                                             \
-       might_sleep();                                                     \
-       __asm__ __volatile__(                                              \
-               "       testq %1,%1\n"                                     \
-               "       jz 2f\n"                                           \
-               "0:     lodsb\n"                                           \
-               "       stosb\n"                                           \
-               "       testb %%al,%%al\n"                                 \
-               "       jz 1f\n"                                           \
-               "       decq %1\n"                                         \
-               "       jnz 0b\n"                                          \
-               "1:     subq %1,%0\n"                                      \
-               "2:\n"                                                     \
-               ".section .fixup,\"ax\"\n"                                 \
-               "3:     movq %5,%0\n"                                      \
-               "       jmp 2b\n"                                          \
-               ".previous\n"                                              \
-               ".section __ex_table,\"a\"\n"                              \
-               "       .align 8\n"                                        \
-               "       .quad 0b,3b\n"                                     \
-               ".previous"                                                \
-               : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),      \
-                 "=&D" (__d2)                                             \
-               : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
-               : "memory");                                               \
-} while (0)
-
-long
-__strncpy_from_user(char *dst, const char __user *src, long count)
-{
-       long res;
-       __do_strncpy_from_user(dst, src, count, res);
-       return res;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
-
-long
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
-       long res = -EFAULT;
-       if (access_ok(VERIFY_READ, src, 1))
-               return __strncpy_from_user(dst, src, count);
-       return res;
-}
-EXPORT_SYMBOL(strncpy_from_user);
-
-/*
- * Zero Userspace
- */
-
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
-       long __d0;
-       might_sleep();
-       /* no memory constraint because it doesn't change any memory gcc knows
-          about */
-       asm volatile(
-               "       testq  %[size8],%[size8]\n"
-               "       jz     4f\n"
-               "0:     movq %[zero],(%[dst])\n"
-               "       addq   %[eight],%[dst]\n"
-               "       decl %%ecx ; jnz   0b\n"
-               "4:     movq  %[size1],%%rcx\n"
-               "       testl %%ecx,%%ecx\n"
-               "       jz     2f\n"
-               "1:     movb   %b[zero],(%[dst])\n"
-               "       incq   %[dst]\n"
-               "       decl %%ecx ; jnz  1b\n"
-               "2:\n"
-               ".section .fixup,\"ax\"\n"
-               "3:     lea 0(%[size1],%[size8],8),%[size8]\n"
-               "       jmp 2b\n"
-               ".previous\n"
-               ".section __ex_table,\"a\"\n"
-               "       .align 8\n"
-               "       .quad 0b,3b\n"
-               "       .quad 1b,2b\n"
-               ".previous"
-               : [size8] "=c"(size), [dst] "=&D" (__d0)
-               : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
-                 [zero] "r" (0UL), [eight] "r" (8UL));
-       return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
-unsigned long clear_user(void __user *to, unsigned long n)
-{
-       if (access_ok(VERIFY_WRITE, to, n))
-               return __clear_user(to, n);
-       return n;
-}
-EXPORT_SYMBOL(clear_user);
-
-/*
- * Return the size of a string (including the ending 0)
- *
- * Return 0 on exception, a value greater than N if too long
- */
-
-long __strnlen_user(const char __user *s, long n)
-{
-       long res = 0;
-       char c;
-
-       while (1) {
-               if (res>n)
-                       return n+1;
-               if (__get_user(c, s))
-                       return 0;
-               if (!c)
-                       return res+1;
-               res++;
-               s++;
-       }
-}
-EXPORT_SYMBOL(__strnlen_user);
-
-long strnlen_user(const char __user *s, long n)
-{
-       if (!access_ok(VERIFY_READ, s, n))
-               return 0;
-       return __strnlen_user(s, n);
-}
-EXPORT_SYMBOL(strnlen_user);
-
-long strlen_user(const char __user *s)
-{
-       long res = 0;
-       char c;
-
-       for (;;) {
-               if (get_user(c, s))
-                       return 0;
-               if (!c)
-                       return res+1;
-               res++;
-               s++;
-       }
-}
-EXPORT_SYMBOL(strlen_user);
-
-unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
-{
-       if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { 
-               return copy_user_generic((__force void *)to, (__force void *)from, len);
-       } 
-       return len;             
-}
-EXPORT_SYMBOL(copy_in_user);
-
author	Thomas Gleixner <tglx@linutronix.de>
	Thu, 11 Oct 2007 09:17:08 +0000 (11:17 +0200)
committer	Thomas Gleixner <tglx@linutronix.de>
	Thu, 11 Oct 2007 09:17:08 +0000 (11:17 +0200)
arch/x86/lib/Makefile		patch \| blob \| history
arch/x86/lib/Makefile_64	[new file with mode: 0644]	patch \| blob
arch/x86/lib/bitops_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/lib/bitstr_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/lib/clear_page_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/copy_page_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/copy_user_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/copy_user_nocache_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/csum-copy_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/csum-partial_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/lib/csum-wrappers_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/lib/delay_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/lib/getuser_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/io_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/lib/iomap_copy_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/memcpy_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/memmove_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/lib/memset_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/putuser_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/rwlock_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/thunk_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/lib/usercopy_64.c	[new file with mode: 0644]	patch \| blob
arch/x86_64/Makefile		patch \| blob \| history
arch/x86_64/lib/Makefile	[deleted file]	patch \| blob \| history
arch/x86_64/lib/Makefile_64	[deleted file]	patch \| blob \| history
arch/x86_64/lib/bitops_64.c	[deleted file]	patch \| blob \| history
arch/x86_64/lib/bitstr_64.c	[deleted file]	patch \| blob \| history
arch/x86_64/lib/clear_page_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/copy_page_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/copy_user_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/copy_user_nocache_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/csum-copy_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/csum-partial_64.c	[deleted file]	patch \| blob \| history
arch/x86_64/lib/csum-wrappers_64.c	[deleted file]	patch \| blob \| history
arch/x86_64/lib/delay_64.c	[deleted file]	patch \| blob \| history
arch/x86_64/lib/getuser_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/io_64.c	[deleted file]	patch \| blob \| history
arch/x86_64/lib/iomap_copy_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/memcpy_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/memmove_64.c	[deleted file]	patch \| blob \| history
arch/x86_64/lib/memset_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/msr-on-cpu.c	[deleted file]	patch \| blob \| history
arch/x86_64/lib/putuser_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/rwlock_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/thunk_64.S	[deleted file]	patch \| blob \| history
arch/x86_64/lib/usercopy_64.c	[deleted file]	patch \| blob \| history