From 789c299ca280f96368c0296b739e89c0bb232f8a Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 10 Feb 2010 14:56:26 +0000
Subject: powerpc: Improve 64bit copy_tofrom_user

Here is a patch from Paul Mackerras that improves the ppc64
copy_tofrom_user. The loop now copies 32 bytes at a time and pairs the
loads with the stores.

A quick test case that reads 8kB over and over shows the improvement:

POWER6: 53% faster
POWER7: 51% faster

#define _XOPEN_SOURCE 500
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>

#define BUFSIZE (8 * 1024)
#define ITERATIONS 10000000

int main()
{
	char tmpfile[] = "/tmp/copy_to_user_testXXXXXX";
	int fd;
	char buf[BUFSIZE];
	unsigned long i;

	fd = mkstemp(tmpfile);
	if (fd < 0) {
		perror("mkstemp");
		exit(1);
	}

	/* File contents don't matter; we only measure the read path. */
	if (write(fd, buf, BUFSIZE) != BUFSIZE) {
		perror("write");
		exit(1);
	}

	for (i = 0; i < ITERATIONS; i++) {
		if (pread(fd, buf, BUFSIZE, 0) != BUFSIZE) {
			perror("pread");
			exit(1);
		}
	}

	unlink(tmpfile);

	return 0;
}

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 693b14a..578b625 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -44,37 +44,55 @@ BEGIN_FTR_SECTION
 	andi.	r0,r4,7
 	bne	.Lsrc_unaligned
 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
-	srdi	r7,r5,4
-20:	ld	r9,0(r4)
-	addi	r4,r4,-8
-	mtctr	r7
-	andi.	r5,r5,7
-	bf	cr7*4+0,22f
-	addi	r3,r3,8
-	addi	r4,r4,8
-	mr	r8,r9
-	blt	cr1,72f
-21:	ld	r9,8(r4)
-70:	std	r8,8(r3)
-22:	ldu	r8,16(r4)
-71:	stdu	r9,16(r3)
+	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
+	srdi	r0,r5,5
+	cmpdi	cr1,r0,0
+20:	ld	r7,0(r4)
+220:	ld	r6,8(r4)
+	addi	r4,r4,16
+	mtctr	r0
+	andi.	r0,r5,0x10
+	beq	22f
+	addi	r3,r3,16
+	addi	r4,r4,-16
+	mr	r9,r7
+	mr	r8,r6
+	beq	cr1,72f
+21:	ld	r7,16(r4)
+221:	ld	r6,24(r4)
+	addi	r4,r4,32
+70:	std	r9,0(r3)
+270:	std	r8,8(r3)
+22:	ld	r9,0(r4)
+222:	ld	r8,8(r4)
+71:	std	r7,16(r3)
+271:	std	r6,24(r3)
+	addi	r3,r3,32
 	bdnz	21b
-72:	std	r8,8(r3)
+72:	std	r9,0(r3)
+272:	std	r8,8(r3)
+	andi.	r5,r5,0xf
 	beq+	3f
-	addi	r3,r3,16
+	addi	r4,r4,16
 .Ldo_tail:
-	bf	cr7*4+1,1f
-23:	lwz	r9,8(r4)
+	addi	r3,r3,16
+	bf	cr7*4+0,246f
+244:	ld	r9,0(r4)
+	addi	r4,r4,8
+245:	std	r9,0(r3)
+	addi	r3,r3,8
+246:	bf	cr7*4+1,1f
+23:	lwz	r9,0(r4)
 	addi	r4,r4,4
 73:	stw	r9,0(r3)
 	addi	r3,r3,4
 1:	bf	cr7*4+2,2f
-44:	lhz	r9,8(r4)
+44:	lhz	r9,0(r4)
 	addi	r4,r4,2
 74:	sth	r9,0(r3)
 	addi	r3,r3,2
 2:	bf	cr7*4+3,3f
-45:	lbz	r9,8(r4)
+45:	lbz	r9,0(r4)
 75:	stb	r9,0(r3)
 3:	li	r3,0
 	blr
@@ -220,7 +238,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 131:
 	addi	r3,r3,8
 120:
+320:
 122:
+322:
 124:
 125:
 126:
@@ -229,9 +249,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 129:
 133:
 	addi	r3,r3,8
-121:
 132:
 	addi	r3,r3,8
+121:
+321:
+344:
 134:
 135:
 138:
@@ -303,18 +325,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 183:
 	add	r3,r3,r7
 	b	1f
+371:
 180:
 	addi	r3,r3,8
 171:
 177:
 	addi	r3,r3,8
-170:
-172:
+370:
+372:
 176:
 178:
 	addi	r3,r3,4
 185:
 	addi	r3,r3,4
+170:
+172:
+345:
 173:
 174:
 175:
@@ -341,11 +367,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	.section __ex_table,"a"
 	.align	3
 	.llong	20b,120b
+	.llong	220b,320b
 	.llong	21b,121b
+	.llong	221b,321b
 	.llong	70b,170b
+	.llong	270b,370b
 	.llong	22b,122b
+	.llong	222b,322b
 	.llong	71b,171b
+	.llong	271b,371b
 	.llong	72b,172b
+	.llong	272b,372b
+	.llong	244b,344b
+	.llong	245b,345b
 	.llong	23b,123b
 	.llong	73b,173b
 	.llong	44b,144b
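
For readers who don't follow PowerPC assembly, here is a minimal C
sketch of what the new inner loop does: copy 32 bytes per iteration,
issuing the loads together and the stores together rather than
alternating one load with one store. This is illustrative only, not the
kernel routine: the function name is made up, it assumes 8-byte-aligned,
non-overlapping buffers (mirroring the aligned fast path this patch
touches), and it omits the unaligned path and the exception fixup that
the __ex_table entries above provide.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the unrolled loop: 32 bytes per iteration, loads grouped
 * before stores, as in the new assembly above. */
static void copy_paired(void *dst, const void *src, size_t len)
{
	uint64_t *d = dst;
	const uint64_t *s = src;

	while (len >= 32) {
		/* four 8-byte loads ... */
		uint64_t a = s[0], b = s[1], c = s[2], e = s[3];
		/* ... then four 8-byte stores */
		d[0] = a; d[1] = b; d[2] = c; d[3] = e;
		d += 4;
		s += 4;
		len -= 32;
	}

	/* tail: up to 31 remaining bytes */
	memcpy(d, s, len);
}

int main(void)
{
	/* Quick self-check: 100 bytes exercises both the main loop and
	 * the tail. uint64_t arrays guarantee 8-byte alignment. */
	uint64_t src[13], dst[13];

	memset(src, 0xab, sizeof(src));
	copy_paired(dst, src, 100);
	return memcmp(dst, src, 100) != 0;
}

The C version only illustrates the structure of the loop; the measured
wins on POWER6/POWER7 come from the assembly's explicit scheduling,
which a compiler is free to rewrite.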