summaryrefslogtreecommitdiff
path: root/arch/x86_64/lib/memcpy.S
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2006-02-03 20:51:02 (GMT)
committerLinus Torvalds <torvalds@g5.osdl.org>2006-02-05 00:43:13 (GMT)
commit7bcd3f34e262bbebffa954d80eab3a84f053da31 (patch)
treef0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib/memcpy.S
parent6bca52b544489b626c7d0db801df6b4aa3d5adb5 (diff)
downloadlinux-fsl-qoriq-7bcd3f34e262bbebffa954d80eab3a84f053da31.tar.xz
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions
They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/lib/memcpy.S')
-rw-r--r--arch/x86_64/lib/memcpy.S93
1 files changed, 91 insertions, 2 deletions
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 92dd805..5554948 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,8 +11,6 @@
*
* Output:
* rax original destination
- *
- * TODO: check best memcpy for PSC
*/
.globl __memcpy
@@ -20,6 +18,95 @@
.p2align 4
__memcpy:
memcpy:
+ pushq %rbx
+ movq %rdi,%rax
+
+ movl %edx,%ecx
+ shrl $6,%ecx
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decl %ecx
+
+ movq (%rsi),%r11
+ movq 8(%rsi),%r8
+
+ movq %r11,(%rdi)
+ movq %r8,1*8(%rdi)
+
+ movq 2*8(%rsi),%r9
+ movq 3*8(%rsi),%r10
+
+ movq %r9,2*8(%rdi)
+ movq %r10,3*8(%rdi)
+
+ movq 4*8(%rsi),%r11
+ movq 5*8(%rsi),%r8
+
+ movq %r11,4*8(%rdi)
+ movq %r8,5*8(%rdi)
+
+ movq 6*8(%rsi),%r9
+ movq 7*8(%rsi),%r10
+
+ movq %r9,6*8(%rdi)
+ movq %r10,7*8(%rdi)
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+.Lhandle_tail:
+ movl %edx,%ecx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq (%rsi),%r8
+ movq %r8,(%rdi)
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ movb (%rsi),%r8b
+ movb %r8b,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+.Lende:
+ popq %rbx
+ ret
+.Lfinal:
+
+ /* Some CPUs run faster using the string copy instructions.
+ It is also a lot simpler. Use this when possible */
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad memcpy
+ .quad memcpy_c
+ .byte X86_FEATURE_REP_GOOD
+ .byte .Lfinal-memcpy
+ .byte memcpy_c_end-memcpy_c
+ .previous
+
+ .section .altinstr_replacement,"ax"
+ /* rdi destination
+ * rsi source
+ * rdx count
+ */
+memcpy_c:
movq %rdi,%rax
movl %edx,%ecx
shrl $3,%ecx
@@ -30,3 +117,5 @@ memcpy:
rep
movsb
ret
+memcpy_c_end:
+ .previous