diff options
Diffstat (limited to 'arch/x86')
58 files changed, 1425 insertions, 2832 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b094816..b32ebf9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -65,6 +65,7 @@ config X86 select HAVE_KERNEL_LZMA select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO + select HAVE_KERNEL_LZ4 select HAVE_HW_BREAKPOINT select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS @@ -102,6 +103,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select BUILDTIME_EXTABLE_SORT select GENERIC_CMOS_UPDATE + select HAVE_ARCH_SOFT_DIRTY select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA if X86_64 @@ -121,6 +123,7 @@ config X86 select OLD_SIGACTION if X86_32 select COMPAT_OLD_SIGACTION if IA32_EMULATION select RTC_LIB + select HAVE_DEBUG_STACKOVERFLOW config INSTRUCTION_DECODER def_bool y @@ -2258,11 +2261,11 @@ source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" config RAPIDIO - bool "RapidIO support" + tristate "RapidIO support" depends on PCI default n help - If you say Y here, the kernel will include drivers and + If enabled this option will include drivers and the core infrastructure code to support RapidIO interconnect devices. source "drivers/rapidio/Kconfig" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index c963881..78d91af 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -59,16 +59,6 @@ config EARLY_PRINTK_DBGP with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. You need usb debug device. -config DEBUG_STACKOVERFLOW - bool "Check for stack overflows" - depends on DEBUG_KERNEL - ---help--- - Say Y here if you want to check the overflows of kernel, IRQ - and exception stacks. This option will cause messages of the - stacks in detail when free stack space drops below a certain - limit. - If in doubt, say "N". 
- config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5ef205c..dcd90df 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,8 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo +targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ + vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -63,12 +64,15 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE $(call if_changed,xzkern) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzo) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE + $(call if_changed,lz4) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix-$(CONFIG_KERNEL_LZ4) := lz4 quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 7cb56c6..0319c88 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -145,6 +145,10 @@ static int lines, cols; #include "../../../../lib/decompress_unlzo.c" #endif +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + static void scroll(void) { int i; diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index a3a0ed8..7d6ba9d 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -3,8 +3,6 @@ # avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) -avx2_supported := $(call as-instr,vpgatherdd 
%ymm0$(comma)(%eax$(comma)%ymm1\ - $(comma)4)$(comma)%ymm2,yes,no) obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o @@ -29,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o +obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -42,10 +41,8 @@ endif # These modules require assembler to support AVX2. ifeq ($(avx2_supported),yes) - obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o - obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o endif aes-i586-y := aes-i586-asm_32.o aes_glue.o @@ -73,10 +70,8 @@ ifeq ($(avx_supported),yes) endif ifeq ($(avx2_supported),yes) - blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o - twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o @@ -87,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o +crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S deleted file mode 100644 index 784452e..0000000 --- a/arch/x86/crypto/blowfish-avx2-asm_64.S +++ /dev/null @@ -1,449 +0,0 @@ -/* - * x86_64/AVX2 assembler 
optimized version of Blowfish - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include <linux/linkage.h> - -.file "blowfish-avx2-asm_64.S" - -.data -.align 32 - -.Lprefetch_mask: -.long 0*64 -.long 1*64 -.long 2*64 -.long 3*64 -.long 4*64 -.long 5*64 -.long 6*64 -.long 7*64 - -.Lbswap32_mask: -.long 0x00010203 -.long 0x04050607 -.long 0x08090a0b -.long 0x0c0d0e0f - -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.Lbswap_iv_mask: - .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 - -.text -/* structure of crypto context */ -#define p 0 -#define s0 ((16 + 2) * 4) -#define s1 ((16 + 2 + (1 * 256)) * 4) -#define s2 ((16 + 2 + (2 * 256)) * 4) -#define s3 ((16 + 2 + (3 * 256)) * 4) - -/* register macros */ -#define CTX %rdi -#define RIO %rdx - -#define RS0 %rax -#define RS1 %r8 -#define RS2 %r9 -#define RS3 %r10 - -#define RLOOP %r11 -#define RLOOPd %r11d - -#define RXr0 %ymm8 -#define RXr1 %ymm9 -#define RXr2 %ymm10 -#define RXr3 %ymm11 -#define RXl0 %ymm12 -#define RXl1 %ymm13 -#define RXl2 %ymm14 -#define RXl3 %ymm15 - -/* temp regs */ -#define RT0 %ymm0 -#define RT0x %xmm0 -#define RT1 %ymm1 -#define RT1x %xmm1 -#define RIDX0 %ymm2 -#define RIDX1 %ymm3 -#define RIDX1x %xmm3 -#define RIDX2 %ymm4 -#define RIDX3 %ymm5 - -/* vpgatherdd mask and '-1' */ -#define RNOT %ymm6 - -/* byte mask, (-1 >> 24) */ -#define RBYTE %ymm7 - -/*********************************************************************** - * 32-way AVX2 blowfish - ***********************************************************************/ -#define F(xl, xr) \ - vpsrld $24, xl, RIDX0; \ - vpsrld $16, xl, RIDX1; \ - vpsrld $8, xl, RIDX2; \ - vpand RBYTE, RIDX1, RIDX1; \ - vpand RBYTE, RIDX2, 
RIDX2; \ - vpand RBYTE, xl, RIDX3; \ - \ - vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpcmpeqd RIDX0, RIDX0, RIDX0; \ - \ - vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \ - vpcmpeqd RIDX1, RIDX1, RIDX1; \ - vpaddd RT0, RT1, RT0; \ - \ - vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \ - vpxor RT0, RT1, RT0; \ - \ - vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpaddd RT0, RT1, RT0; \ - \ - vpxor RT0, xr, xr; - -#define add_roundkey(xl, nmem) \ - vpbroadcastd nmem, RT0; \ - vpxor RT0, xl ## 0, xl ## 0; \ - vpxor RT0, xl ## 1, xl ## 1; \ - vpxor RT0, xl ## 2, xl ## 2; \ - vpxor RT0, xl ## 3, xl ## 3; - -#define round_enc() \ - add_roundkey(RXr, p(CTX,RLOOP,4)); \ - F(RXl0, RXr0); \ - F(RXl1, RXr1); \ - F(RXl2, RXr2); \ - F(RXl3, RXr3); \ - \ - add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ - F(RXr0, RXl0); \ - F(RXr1, RXl1); \ - F(RXr2, RXl2); \ - F(RXr3, RXl3); - -#define round_dec() \ - add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \ - F(RXl0, RXr0); \ - F(RXl1, RXr1); \ - F(RXl2, RXr2); \ - F(RXl3, RXr3); \ - \ - add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ - F(RXr0, RXl0); \ - F(RXr1, RXl1); \ - F(RXr2, RXl2); \ - F(RXr3, RXl3); - -#define init_round_constants() \ - vpcmpeqd RNOT, RNOT, RNOT; \ - leaq s0(CTX), RS0; \ - leaq s1(CTX), RS1; \ - leaq s2(CTX), RS2; \ - leaq s3(CTX), RS3; \ - vpsrld $24, RNOT, RBYTE; - -#define transpose_2x2(x0, x1, t0) \ - vpunpckldq x0, x1, t0; \ - vpunpckhdq x0, x1, x1; \ - \ - vpunpcklqdq t0, x1, x0; \ - vpunpckhqdq t0, x1, x1; - -#define read_block(xl, xr) \ - vbroadcasti128 .Lbswap32_mask, RT1; \ - \ - vpshufb RT1, xl ## 0, xl ## 0; \ - vpshufb RT1, xr ## 0, xr ## 0; \ - vpshufb RT1, xl ## 1, xl ## 1; \ - vpshufb RT1, xr ## 1, xr ## 1; \ - vpshufb RT1, xl ## 2, xl ## 2; \ - vpshufb RT1, xr ## 2, xr ## 2; \ - vpshufb RT1, xl ## 3, xl ## 3; \ - vpshufb RT1, xr ## 3, xr ## 3; \ - \ - transpose_2x2(xl ## 0, xr ## 0, RT0); \ - transpose_2x2(xl ## 1, xr ## 1, RT0); \ - transpose_2x2(xl ## 2, xr ## 2, RT0); \ - 
transpose_2x2(xl ## 3, xr ## 3, RT0); - -#define write_block(xl, xr) \ - vbroadcasti128 .Lbswap32_mask, RT1; \ - \ - transpose_2x2(xl ## 0, xr ## 0, RT0); \ - transpose_2x2(xl ## 1, xr ## 1, RT0); \ - transpose_2x2(xl ## 2, xr ## 2, RT0); \ - transpose_2x2(xl ## 3, xr ## 3, RT0); \ - \ - vpshufb RT1, xl ## 0, xl ## 0; \ - vpshufb RT1, xr ## 0, xr ## 0; \ - vpshufb RT1, xl ## 1, xl ## 1; \ - vpshufb RT1, xr ## 1, xr ## 1; \ - vpshufb RT1, xl ## 2, xl ## 2; \ - vpshufb RT1, xr ## 2, xr ## 2; \ - vpshufb RT1, xl ## 3, xl ## 3; \ - vpshufb RT1, xr ## 3, xr ## 3; - -.align 8 -__blowfish_enc_blk32: - /* input: - * %rdi: ctx, CTX - * RXl0..4, RXr0..4: plaintext - * output: - * RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped) - */ - init_round_constants(); - - read_block(RXl, RXr); - - movl $1, RLOOPd; - add_roundkey(RXl, p+4*(0)(CTX)); - -.align 4 -.L__enc_loop: - round_enc(); - - leal 2(RLOOPd), RLOOPd; - cmpl $17, RLOOPd; - jne .L__enc_loop; - - add_roundkey(RXr, p+4*(17)(CTX)); - - write_block(RXl, RXr); - - ret; -ENDPROC(__blowfish_enc_blk32) - -.align 8 -__blowfish_dec_blk32: - /* input: - * %rdi: ctx, CTX - * RXl0..4, RXr0..4: ciphertext - * output: - * RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped) - */ - init_round_constants(); - - read_block(RXl, RXr); - - movl $14, RLOOPd; - add_roundkey(RXl, p+4*(17)(CTX)); - -.align 4 -.L__dec_loop: - round_dec(); - - addl $-2, RLOOPd; - jns .L__dec_loop; - - add_roundkey(RXr, p+4*(0)(CTX)); - - write_block(RXl, RXr); - - ret; -ENDPROC(__blowfish_dec_blk32) - -ENTRY(blowfish_ecb_enc_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - - vmovdqu 0*32(%rdx), RXl0; - vmovdqu 1*32(%rdx), RXr0; - vmovdqu 2*32(%rdx), RXl1; - vmovdqu 3*32(%rdx), RXr1; - vmovdqu 4*32(%rdx), RXl2; - vmovdqu 5*32(%rdx), RXr2; - vmovdqu 6*32(%rdx), RXl3; - vmovdqu 7*32(%rdx), RXr3; - - call __blowfish_enc_blk32; - - vmovdqu RXr0, 0*32(%rsi); - vmovdqu RXl0, 1*32(%rsi); - vmovdqu RXr1, 2*32(%rsi); - vmovdqu RXl1, 
3*32(%rsi); - vmovdqu RXr2, 4*32(%rsi); - vmovdqu RXl2, 5*32(%rsi); - vmovdqu RXr3, 6*32(%rsi); - vmovdqu RXl3, 7*32(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_ecb_enc_32way) - -ENTRY(blowfish_ecb_dec_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - - vmovdqu 0*32(%rdx), RXl0; - vmovdqu 1*32(%rdx), RXr0; - vmovdqu 2*32(%rdx), RXl1; - vmovdqu 3*32(%rdx), RXr1; - vmovdqu 4*32(%rdx), RXl2; - vmovdqu 5*32(%rdx), RXr2; - vmovdqu 6*32(%rdx), RXl3; - vmovdqu 7*32(%rdx), RXr3; - - call __blowfish_dec_blk32; - - vmovdqu RXr0, 0*32(%rsi); - vmovdqu RXl0, 1*32(%rsi); - vmovdqu RXr1, 2*32(%rsi); - vmovdqu RXl1, 3*32(%rsi); - vmovdqu RXr2, 4*32(%rsi); - vmovdqu RXl2, 5*32(%rsi); - vmovdqu RXr3, 6*32(%rsi); - vmovdqu RXl3, 7*32(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_ecb_dec_32way) - -ENTRY(blowfish_cbc_dec_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - - vmovdqu 0*32(%rdx), RXl0; - vmovdqu 1*32(%rdx), RXr0; - vmovdqu 2*32(%rdx), RXl1; - vmovdqu 3*32(%rdx), RXr1; - vmovdqu 4*32(%rdx), RXl2; - vmovdqu 5*32(%rdx), RXr2; - vmovdqu 6*32(%rdx), RXl3; - vmovdqu 7*32(%rdx), RXr3; - - call __blowfish_dec_blk32; - - /* xor with src */ - vmovq (%rdx), RT0x; - vpshufd $0x4f, RT0x, RT0x; - vinserti128 $1, 8(%rdx), RT0, RT0; - vpxor RT0, RXr0, RXr0; - vpxor 0*32+24(%rdx), RXl0, RXl0; - vpxor 1*32+24(%rdx), RXr1, RXr1; - vpxor 2*32+24(%rdx), RXl1, RXl1; - vpxor 3*32+24(%rdx), RXr2, RXr2; - vpxor 4*32+24(%rdx), RXl2, RXl2; - vpxor 5*32+24(%rdx), RXr3, RXr3; - vpxor 6*32+24(%rdx), RXl3, RXl3; - - vmovdqu RXr0, (0*32)(%rsi); - vmovdqu RXl0, (1*32)(%rsi); - vmovdqu RXr1, (2*32)(%rsi); - vmovdqu RXl1, (3*32)(%rsi); - vmovdqu RXr2, (4*32)(%rsi); - vmovdqu RXl2, (5*32)(%rsi); - vmovdqu RXr3, (6*32)(%rsi); - vmovdqu RXl3, (7*32)(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_cbc_dec_32way) - -ENTRY(blowfish_ctr_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (big 
endian, 64bit) - */ - - vzeroupper; - - vpcmpeqd RT0, RT0, RT0; - vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */ - - vpcmpeqd RT1x, RT1x, RT1x; - vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */ - vpxor RIDX0, RIDX0, RIDX0; - vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */ - - vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */ - - vpcmpeqd RT1, RT1, RT1; - vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */ - vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */ - - vbroadcasti128 .Lbswap_iv_mask, RIDX0; - vbroadcasti128 .Lbswap128_mask, RIDX1; - - /* load IV and byteswap */ - vmovq (%rcx), RT1x; - vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */ - vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */ - - /* construct IVs */ - vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */ - vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */ - vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */ - vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */ - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXl1; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXr1; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXl2; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXr2; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXl3; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXr3; - - /* store last IV */ - vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */ - vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... 
*/ - vmovq RT1x, (%rcx); - - call __blowfish_enc_blk32; - - /* dst = src ^ iv */ - vpxor 0*32(%rdx), RXr0, RXr0; - vpxor 1*32(%rdx), RXl0, RXl0; - vpxor 2*32(%rdx), RXr1, RXr1; - vpxor 3*32(%rdx), RXl1, RXl1; - vpxor 4*32(%rdx), RXr2, RXr2; - vpxor 5*32(%rdx), RXl2, RXl2; - vpxor 6*32(%rdx), RXr3, RXr3; - vpxor 7*32(%rdx), RXl3, RXl3; - vmovdqu RXr0, (0*32)(%rsi); - vmovdqu RXl0, (1*32)(%rsi); - vmovdqu RXr1, (2*32)(%rsi); - vmovdqu RXl1, (3*32)(%rsi); - vmovdqu RXr2, (4*32)(%rsi); - vmovdqu RXl2, (5*32)(%rsi); - vmovdqu RXr3, (6*32)(%rsi); - vmovdqu RXl3, (7*32)(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_ctr_32way) diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c deleted file mode 100644 index 4417e9a..0000000 --- a/arch/x86/crypto/blowfish_avx2_glue.c +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> - * - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/crypto.h> -#include <linux/err.h> -#include <crypto/algapi.h> -#include <crypto/blowfish.h> -#include <crypto/cryptd.h> -#include <crypto/ctr.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> -#include <asm/crypto/blowfish.h> -#include <asm/crypto/ablk_helper.h> -#include <crypto/scatterwalk.h> - -#define BF_AVX2_PARALLEL_BLOCKS 32 - -/* 32-way AVX2 parallel cipher functions */ -asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src, - __be64 *iv); - -static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes) -{ - if (fpu_enabled) - return true; - - /* FPU is only used when chunk to be processed is large enough, so - * do not enable FPU until it is necessary. 
- */ - if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS) - return false; - - kernel_fpu_begin(); - return true; -} - -static inline void bf_fpu_end(bool fpu_enabled) -{ - if (fpu_enabled) - kernel_fpu_end(); -} - -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - bool enc) -{ - bool fpu_enabled = false; - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes; - int err; - - err = blkcipher_walk_virt(desc, walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); - - /* Process multi-block AVX2 batch */ - if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { - do { - if (enc) - blowfish_ecb_enc_32way(ctx, wdst, wsrc); - else - blowfish_ecb_dec_32way(ctx, wdst, wsrc); - - wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS; - wdst += bsize * BF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process multi-block batch */ - if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { - do { - if (enc) - blowfish_enc_blk_4way(ctx, wdst, wsrc); - else - blowfish_dec_blk_4way(ctx, wdst, wsrc); - - wsrc += bsize * BF_PARALLEL_BLOCKS; - wdst += bsize * BF_PARALLEL_BLOCKS; - nbytes -= bsize * BF_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (enc) - blowfish_enc_blk(ctx, wdst, wsrc); - else - blowfish_dec_blk(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } - - bf_fpu_end(fpu_enabled); - return err; -} - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) 
-{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, true); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, false); -} - -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 *iv = (u64 *)walk->iv; - - do { - *dst = *src ^ *iv; - blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u64 *)walk->iv = *iv; - return nbytes; -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - return err; -} - -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 last_iv; - int i; - - /* Start of the last block. 
*/ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process multi-block AVX2 batch */ - if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1); - src -= BF_AVX2_PARALLEL_BLOCKS - 1; - dst -= BF_AVX2_PARALLEL_BLOCKS - 1; - - blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process multi-block batch */ - if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { - u64 ivs[BF_PARALLEL_BLOCKS - 1]; - - do { - nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1); - src -= BF_PARALLEL_BLOCKS - 1; - dst -= BF_PARALLEL_BLOCKS - 1; - - for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) - ivs[i] = src[i]; - - blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); - - for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) - dst[i + 1] ^= ivs[i]; - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - for (;;) { - blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } - -done: - *dst ^= *(u64 *)walk->iv; - *(u64 *)walk->iv = last_iv; - - return nbytes; -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes)) { - fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - 
bf_fpu_end(fpu_enabled); - return err; -} - -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *ctrblk = walk->iv; - u8 keystream[BF_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); - - crypto_inc(ctrblk, BF_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - int i; - - /* Process multi-block AVX2 batch */ - if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { - do { - blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src, - (__be64 *)walk->iv); - - src += BF_AVX2_PARALLEL_BLOCKS; - dst += BF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process four block batch */ - if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { - __be64 ctrblocks[BF_PARALLEL_BLOCKS]; - u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); - - do { - /* create ctrblks for parallel encrypt */ - for (i = 0; i < BF_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - ctrblocks[i] = cpu_to_be64(ctrblk++); - } - - blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += BF_PARALLEL_BLOCKS; - dst += BF_PARALLEL_BLOCKS; - nbytes -= bsize * BF_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); - - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - u64 ctrblk; - - if (dst != src) - *dst = *src; - - ctrblk = *(u64 *)walk->iv; - 
be64_add_cpu((__be64 *)walk->iv, 1); - - blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); - - src += 1; - dst += 1; - } while ((nbytes -= bsize) >= bsize); - -done: - return nbytes; -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { - fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - bf_fpu_end(fpu_enabled); - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; -} - -static struct crypto_alg bf_algs[6] = { { - .cra_name = "__ecb-blowfish-avx2", - .cra_driver_name = "__driver-ecb-blowfish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = blowfish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-blowfish-avx2", - .cra_driver_name = "__driver-cbc-blowfish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = blowfish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-blowfish-avx2", - 
.cra_driver_name = "__driver-ctr-blowfish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = blowfish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "ecb(blowfish)", - .cra_driver_name = "ecb-blowfish-avx2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(blowfish)", - .cra_driver_name = "cbc-blowfish-avx2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(blowfish)", - .cra_driver_name = "ctr-blowfish-avx2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = 
ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -} }; - - -static int __init init(void) -{ - u64 xcr0; - - if (!cpu_has_avx2 || !cpu_has_osxsave) { - pr_info("AVX2 instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); - return -ENODEV; - } - - return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); -} - -static void __exit fini(void) -{ - crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized"); -MODULE_ALIAS("blowfish"); -MODULE_ALIAS("blowfish-asm"); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 3548d76..50ec333 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -1,7 +1,7 @@ /* * Glue Code for assembler optimized version of Blowfish * - * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> @@ -32,24 +32,40 @@ #include <linux/module.h> #include <linux/types.h> #include <crypto/algapi.h> -#include <asm/crypto/blowfish.h> /* regular block cipher functions */ asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, bool xor); -EXPORT_SYMBOL_GPL(__blowfish_enc_blk); - asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(blowfish_dec_blk); /* 4-way parallel cipher functions */ asmlinkage void 
__blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, const u8 *src, bool xor); -EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way); - asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way); + +static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, true); +} + +static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, true); +} static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 91a1878..0e0b886 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -51,16 +51,6 @@ #define ymm14_x xmm14 #define ymm15_x xmm15 -/* - * AES-NI instructions do not support ymmX registers, so we need splitting and - * merging. 
- */ -#define vaesenclast256(zero, yreg, tmp) \ - vextracti128 $1, yreg, tmp##_x; \ - vaesenclast zero##_x, yreg##_x, yreg##_x; \ - vaesenclast zero##_x, tmp##_x, tmp##_x; \ - vinserti128 $1, tmp##_x, yreg, yreg; - /********************************************************************** 32-way camellia **********************************************************************/ @@ -79,46 +69,70 @@ * S-function with AES subbytes \ */ \ vbroadcasti128 .Linv_shift_row, t4; \ - vpbroadcastb .L0f0f0f0f, t7; \ - vbroadcasti128 .Lpre_tf_lo_s1, t0; \ - vbroadcasti128 .Lpre_tf_hi_s1, t1; \ + vpbroadcastd .L0f0f0f0f, t7; \ + vbroadcasti128 .Lpre_tf_lo_s1, t5; \ + vbroadcasti128 .Lpre_tf_hi_s1, t6; \ + vbroadcasti128 .Lpre_tf_lo_s4, t2; \ + vbroadcasti128 .Lpre_tf_hi_s4, t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ vpshufb t4, x7, x7; \ - vpshufb t4, x1, x1; \ - vpshufb t4, x4, x4; \ - vpshufb t4, x2, x2; \ - vpshufb t4, x5, x5; \ vpshufb t4, x3, x3; \ vpshufb t4, x6, x6; \ + vpshufb t4, x2, x2; \ + vpshufb t4, x5, x5; \ + vpshufb t4, x1, x1; \ + vpshufb t4, x4, x4; \ \ /* prefilter sboxes 1, 2 and 3 */ \ - vbroadcasti128 .Lpre_tf_lo_s4, t2; \ - vbroadcasti128 .Lpre_tf_hi_s4, t3; \ - filter_8bit(x0, t0, t1, t7, t6); \ - filter_8bit(x7, t0, t1, t7, t6); \ - filter_8bit(x1, t0, t1, t7, t6); \ - filter_8bit(x4, t0, t1, t7, t6); \ - filter_8bit(x2, t0, t1, t7, t6); \ - filter_8bit(x5, t0, t1, t7, t6); \ - \ /* prefilter sbox 4 */ \ + filter_8bit(x0, t5, t6, t7, t4); \ + filter_8bit(x7, t5, t6, t7, t4); \ + vextracti128 $1, x0, t0##_x; \ + vextracti128 $1, x7, t1##_x; \ + filter_8bit(x3, t2, t3, t7, t4); \ + filter_8bit(x6, t2, t3, t7, t4); \ + vextracti128 $1, x3, t3##_x; \ + vextracti128 $1, x6, t2##_x; \ + filter_8bit(x2, t5, t6, t7, t4); \ + filter_8bit(x5, t5, t6, t7, t4); \ + filter_8bit(x1, t5, t6, t7, t4); \ + filter_8bit(x4, t5, t6, t7, t4); \ + \ vpxor t4##_x, t4##_x, t4##_x; \ - filter_8bit(x3, t2, t3, t7, t6); \ - filter_8bit(x6, t2, t3, t7, t6); \ \ /* AES 
subbytes + AES shift rows */ \ + vextracti128 $1, x2, t6##_x; \ + vextracti128 $1, x5, t5##_x; \ + vaesenclast t4##_x, x0##_x, x0##_x; \ + vaesenclast t4##_x, t0##_x, t0##_x; \ + vinserti128 $1, t0##_x, x0, x0; \ + vaesenclast t4##_x, x7##_x, x7##_x; \ + vaesenclast t4##_x, t1##_x, t1##_x; \ + vinserti128 $1, t1##_x, x7, x7; \ + vaesenclast t4##_x, x3##_x, x3##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vinserti128 $1, t3##_x, x3, x3; \ + vaesenclast t4##_x, x6##_x, x6##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t2##_x, x6, x6; \ + vextracti128 $1, x1, t3##_x; \ + vextracti128 $1, x4, t2##_x; \ vbroadcasti128 .Lpost_tf_lo_s1, t0; \ vbroadcasti128 .Lpost_tf_hi_s1, t1; \ - vaesenclast256(t4, x0, t5); \ - vaesenclast256(t4, x7, t5); \ - vaesenclast256(t4, x1, t5); \ - vaesenclast256(t4, x4, t5); \ - vaesenclast256(t4, x2, t5); \ - vaesenclast256(t4, x5, t5); \ - vaesenclast256(t4, x3, t5); \ - vaesenclast256(t4, x6, t5); \ + vaesenclast t4##_x, x2##_x, x2##_x; \ + vaesenclast t4##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x2, x2; \ + vaesenclast t4##_x, x5##_x, x5##_x; \ + vaesenclast t4##_x, t5##_x, t5##_x; \ + vinserti128 $1, t5##_x, x5, x5; \ + vaesenclast t4##_x, x1##_x, x1##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vinserti128 $1, t3##_x, x1, x1; \ + vaesenclast t4##_x, x4##_x, x4##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t2##_x, x4, x4; \ \ /* postfilter sboxes 1 and 4 */ \ vbroadcasti128 .Lpost_tf_lo_s3, t2; \ @@ -139,22 +153,12 @@ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ + vpxor t7, t7, t7; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ + vpshufb t7, t1, t1; \ vpsrldq $3, t0, t3; \ - vpsrldq $4, t0, t4; \ - vpsrldq $5, t0, t5; \ - vpsrldq $6, t0, t6; \ - vpsrldq $7, t0, t7; \ - vpbroadcastb t0##_x, t0; \ - vpbroadcastb t1##_x, t1; \ - vpbroadcastb t2##_x, t2; \ - vpbroadcastb t3##_x, t3; \ - vpbroadcastb t4##_x, t4; \ - vpbroadcastb t6##_x, t6; \ 
- vpbroadcastb t5##_x, t5; \ - vpbroadcastb t7##_x, t7; \ \ /* P-function */ \ vpxor x5, x0, x0; \ @@ -162,11 +166,21 @@ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ + vpshufb t7, t2, t2; \ + vpsrldq $4, t0, t4; \ + vpshufb t7, t3, t3; \ + vpsrldq $5, t0, t5; \ + vpshufb t7, t4, t4; \ + \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ + vpsrldq $6, t0, t6; \ + vpshufb t7, t5, t5; \ + vpshufb t7, t6, t6; \ + \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ @@ -179,12 +193,16 @@ \ /* Add key material and result to CD (x becomes new CD) */ \ \ - vpxor t7, x0, x0; \ - vpxor 4 * 32(mem_cd), x0, x0; \ - \ vpxor t6, x1, x1; \ vpxor 5 * 32(mem_cd), x1, x1; \ \ + vpsrldq $7, t0, t6; \ + vpshufb t7, t0, t0; \ + vpshufb t7, t6, t7; \ + \ + vpxor t7, x0, x0; \ + vpxor 4 * 32(mem_cd), x0, x0; \ + \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ @@ -204,7 +222,7 @@ vpxor 3 * 32(mem_cd), x7, x7; /* - * Size optimization... with inlined roundsm16 binary would be over 5 times + * Size optimization... with inlined roundsm32 binary would be over 5 times * larger and would only marginally faster. 
*/ .align 8 @@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) */ \ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ vpxor tt0, tt0, tt0; \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ @@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ + vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ vmovdqu l4, 4 * 32(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 32(l); \ @@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) * rl ^= t2; \ */ \ \ - vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpor 4 * 32(r), t0, t0; \ vpor 5 * 32(r), t1, t1; \ @@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vpxor 2 * 32(r), t2, t2; \ vpxor 3 * 32(r), t3, t3; \ vmovdqu t0, 0 * 32(r); \ + vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 1 * 32(r); \ vmovdqu t2, 2 * 32(r); \ vmovdqu t3, 3 * 32(r); \ @@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ - vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ 
vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpand 0 * 32(r), t0, t0; \ vpand 1 * 32(r), t1, t1; \ @@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vpxor 6 * 32(r), t2, t2; \ vpxor 7 * 32(r), t3, t3; \ vmovdqu t0, 4 * 32(r); \ + vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 5 * 32(r); \ vmovdqu t2, 6 * 32(r); \ vmovdqu t3, 7 * 32(r); \ @@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) * ll ^= t0; \ */ \ \ - vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S new file mode 100644 index 0000000..35e9756 --- /dev/null +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -0,0 +1,643 @@ +######################################################################## +# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +# +# Copyright (c) 2013, Intel Corporation +# +# Authors: +# Erdinc Ozturk <erdinc.ozturk@intel.com> +# Vinodh Gopal <vinodh.gopal@intel.com> +# James Guilford <james.guilford@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################## +# Function API: +# UINT16 crc_t10dif_pcl( +# UINT16 init_crc, //initial CRC value, 16 bits +# const unsigned char *buf, //buffer pointer to calculate CRC on +# UINT64 len //buffer length in bytes (64-bit data) +# ); +# +# Reference paper titled "Fast CRC Computation for Generic +# Polynomials Using PCLMULQDQ Instruction" +# URL: http://www.intel.com/content/dam/www/public/us/en/documents +# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +# +# + +#include <linux/linkage.h> + +.text + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx + +#define arg1_low32 %edi + +ENTRY(crc_t10dif_pcl) +.align 16 + + # adjust the 16-bit initial_crc value, scale it to 32 bits + shl $16, arg1_low32 + + # Allocate Stack Space + mov %rsp, %rcx + sub $16*2, %rsp + # align stack to 16 byte boundary + and $~(0x10 - 1), %rsp + + # check if smaller than 256 + cmp $256, arg3 + + # for sizes less than 128, we can't fold 64B at a time... + jl _less_than_128 + + + # load the initial crc value + movd arg1_low32, %xmm10 # initial crc + + # crc value does not need to be byte-reflected, but it needs + # to be moved to the high part of the register. + # because data will be byte-reflected and will align with + # initial crc at correct place. 
+ pslldq $12, %xmm10 + + movdqa SHUF_MASK(%rip), %xmm11 + # receive the initial 64B data, xor the initial crc value + movdqu 16*0(arg2), %xmm0 + movdqu 16*1(arg2), %xmm1 + movdqu 16*2(arg2), %xmm2 + movdqu 16*3(arg2), %xmm3 + movdqu 16*4(arg2), %xmm4 + movdqu 16*5(arg2), %xmm5 + movdqu 16*6(arg2), %xmm6 + movdqu 16*7(arg2), %xmm7 + + pshufb %xmm11, %xmm0 + # XOR the initial_crc value + pxor %xmm10, %xmm0 + pshufb %xmm11, %xmm1 + pshufb %xmm11, %xmm2 + pshufb %xmm11, %xmm3 + pshufb %xmm11, %xmm4 + pshufb %xmm11, %xmm5 + pshufb %xmm11, %xmm6 + pshufb %xmm11, %xmm7 + + movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 + #imm value of pclmulqdq instruction + #will determine which constant to use + + ################################################################# + # we subtract 256 instead of 128 to save one instruction from the loop + sub $256, arg3 + + # at this section of the code, there is 64*x+y (0<=y<64) bytes of + # buffer. The _fold_64_B_loop will fold 64B at a time + # until we have 64+y Bytes of buffer + + + # fold 64B at a time. 
This section of the code folds 4 xmm + # registers in parallel +_fold_64_B_loop: + + # update the buffer pointer + add $128, arg2 # buf += 64# + + movdqu 16*0(arg2), %xmm9 + movdqu 16*1(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm0, %xmm8 + movdqa %xmm1, %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm0 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm1 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm0 + xorps %xmm8 , %xmm0 + pxor %xmm12, %xmm1 + xorps %xmm13, %xmm1 + + movdqu 16*2(arg2), %xmm9 + movdqu 16*3(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm2, %xmm8 + movdqa %xmm3, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm2 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm3 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm2 + xorps %xmm8 , %xmm2 + pxor %xmm12, %xmm3 + xorps %xmm13, %xmm3 + + movdqu 16*4(arg2), %xmm9 + movdqu 16*5(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm4, %xmm8 + movdqa %xmm5, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm4 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm5 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm4 + xorps %xmm8 , %xmm4 + pxor %xmm12, %xmm5 + xorps %xmm13, %xmm5 + + movdqu 16*6(arg2), %xmm9 + movdqu 16*7(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm6 , %xmm8 + movdqa %xmm7 , %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm6 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm6 + xorps %xmm8 , %xmm6 + pxor %xmm12, %xmm7 + xorps %xmm13, %xmm7 + + sub $128, arg3 + + # check if there is another 64B in the buffer to be able to fold + jge _fold_64_B_loop + ################################################################## + + + add $128, arg2 + # at this point, the buffer pointer is pointing at the last y Bytes + # of the buffer the 64B of folded data is in 4 of the xmm + # registers: xmm0, xmm1, xmm2, xmm3 + + + # fold the 8 
xmm registers to 1 xmm register with different constants + + movdqa rk9(%rip), %xmm10 + movdqa %xmm0, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm0 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm0, %xmm7 + + movdqa rk11(%rip), %xmm10 + movdqa %xmm1, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm1 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm1, %xmm7 + + movdqa rk13(%rip), %xmm10 + movdqa %xmm2, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm2 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + + movdqa rk15(%rip), %xmm10 + movdqa %xmm3, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm3 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm3, %xmm7 + + movdqa rk17(%rip), %xmm10 + movdqa %xmm4, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm4 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm4, %xmm7 + + movdqa rk19(%rip), %xmm10 + movdqa %xmm5, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm5 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm5, %xmm7 + + movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 + #imm value of pclmulqdq instruction + #will determine which constant to use + movdqa %xmm6, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm6 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm6, %xmm7 + + + # instead of 64, we add 48 to the loop counter to save 1 instruction + # from the loop instead of a cmp instruction, we use the negative + # flag with the jl instruction + add $128-16, arg3 + jl _final_reduction_for_128 + + # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 + # and the rest is in memory. 
We can fold 16 bytes at a time if y>=16 + # continue folding 16B at a time + +_16B_reduction_loop: + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + movdqu (arg2), %xmm0 + pshufb %xmm11, %xmm0 + pxor %xmm0 , %xmm7 + add $16, arg2 + sub $16, arg3 + # instead of a cmp instruction, we utilize the flags with the + # jge instruction equivalent of: cmp arg3, 16-16 + # check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + #now we have 16+z bytes left to reduce, where 0<= z < 16. + #first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + # check if any more data to fold. If not, compute the CRC of + # the final 128 bits + add $16, arg3 + je _128_done + + # here we are getting data that is less than 16 bytes. + # since we know that there was data before the pointer, we can + # offset the input pointer before the actual point, to receive + # exactly 16 bytes. after that the registers need to be adjusted. 
+_get_last_two_xmms: + movdqa %xmm7, %xmm2 + + movdqu -16(arg2, arg3), %xmm1 + pshufb %xmm11, %xmm1 + + # get rid of the extra data that was loaded before + # load the shift constant + lea pshufb_shf_table+16(%rip), %rax + sub arg3, %rax + movdqu (%rax), %xmm0 + + # shift xmm2 to the left by arg3 bytes + pshufb %xmm0, %xmm2 + + # shift xmm7 to the right by 16-arg3 bytes + pxor mask1(%rip), %xmm0 + pshufb %xmm0, %xmm7 + pblendvb %xmm2, %xmm1 #xmm0 is implicit + + # fold 16 Bytes + movdqa %xmm1, %xmm2 + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + +_128_done: + # compute crc of a 128-bit value + movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 + movdqa %xmm7, %xmm0 + + #64b fold + pclmulqdq $0x1, %xmm10, %xmm7 + pslldq $8 , %xmm0 + pxor %xmm0, %xmm7 + + #32b fold + movdqa %xmm7, %xmm0 + + pand mask2(%rip), %xmm0 + + psrldq $12, %xmm7 + pclmulqdq $0x10, %xmm10, %xmm7 + pxor %xmm0, %xmm7 + + #barrett reduction +_barrett: + movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 + movdqa %xmm7, %xmm0 + pclmulqdq $0x01, %xmm10, %xmm7 + pslldq $4, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm7 + + pslldq $4, %xmm7 + pxor %xmm0, %xmm7 + pextrd $1, %xmm7, %eax + +_cleanup: + # scale the result back to 16 bits + shr $16, %eax + mov %rcx, %rsp + ret + +######################################################################## + +.align 16 +_less_than_128: + + # check if there is enough buffer to be able to fold 16B at a time + cmp $32, arg3 + jl _less_than_32 + movdqa SHUF_MASK(%rip), %xmm11 + + # now if there is, load the constants + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0, %xmm7 + + + # update the buffer pointer + add $16, arg2 + + # update the counter. 
subtract 32 instead of 16 to save one + # instruction from the loop + sub $32, arg3 + + jmp _16B_reduction_loop + + +.align 16 +_less_than_32: + # mov initial crc to the return value. this is necessary for + # zero-length buffers. + mov arg1_low32, %eax + test arg3, arg3 + je _cleanup + + movdqa SHUF_MASK(%rip), %xmm11 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + + cmp $16, arg3 + je _exact_16_left + jl _less_than_16_left + + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0 , %xmm7 # xor the initial crc value + add $16, arg2 + sub $16, arg3 + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + jmp _get_last_two_xmms + + +.align 16 +_less_than_16_left: + # use stack space to load data less than 16 bytes, zero-out + # the 16B in memory first. + + pxor %xmm1, %xmm1 + mov %rsp, %r11 + movdqa %xmm1, (%r11) + + cmp $4, arg3 + jl _only_less_than_4 + + # backup the counter value + mov arg3, %r9 + cmp $8, arg3 + jl _less_than_8_left + + # load 8 Bytes + mov (arg2), %rax + mov %rax, (%r11) + add $8, %r11 + sub $8, arg3 + add $8, arg2 +_less_than_8_left: + + cmp $4, arg3 + jl _less_than_4_left + + # load 4 Bytes + mov (arg2), %eax + mov %eax, (%r11) + add $4, %r11 + sub $4, arg3 + add $4, arg2 +_less_than_4_left: + + cmp $2, arg3 + jl _less_than_2_left + + # load 2 Bytes + mov (arg2), %ax + mov %ax, (%r11) + add $2, %r11 + sub $2, arg3 + add $2, arg2 +_less_than_2_left: + cmp $1, arg3 + jl _zero_left + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) +_zero_left: + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + # shl r9, 4 + lea pshufb_shf_table+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), %xmm0 + pxor mask1(%rip), %xmm0 + + pshufb %xmm0, %xmm7 + jmp _128_done + +.align 16 +_exact_16_left: + movdqu (arg2), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + jmp _128_done + 
+_only_less_than_4: + cmp $3, arg3 + jl _only_less_than_3 + + # load 3 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + mov 2(arg2), %al + mov %al, 2(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $5, %xmm7 + + jmp _barrett +_only_less_than_3: + cmp $2, arg3 + jl _only_less_than_2 + + # load 2 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $6, %xmm7 + + jmp _barrett +_only_less_than_2: + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $7, %xmm7 + + jmp _barrett + +ENDPROC(crc_t10dif_pcl) + +.data + +# precomputed constants +# these constants are precomputed from the poly: +# 0x8bb70000 (0x8bb7 scaled to 32 bits) +.align 16 +# Q = 0x18BB70000 +# rk1 = 2^(32*3) mod Q << 32 +# rk2 = 2^(32*5) mod Q << 32 +# rk3 = 2^(32*15) mod Q << 32 +# rk4 = 2^(32*17) mod Q << 32 +# rk5 = 2^(32*3) mod Q << 32 +# rk6 = 2^(32*2) mod Q << 32 +# rk7 = floor(2^64/Q) +# rk8 = Q +rk1: +.quad 0x2d56000000000000 +rk2: +.quad 0x06df000000000000 +rk3: +.quad 0x9d9d000000000000 +rk4: +.quad 0x7cf5000000000000 +rk5: +.quad 0x2d56000000000000 +rk6: +.quad 0x1368000000000000 +rk7: +.quad 0x00000001f65a57f8 +rk8: +.quad 0x000000018bb70000 + +rk9: +.quad 0xceae000000000000 +rk10: +.quad 0xbfd6000000000000 +rk11: +.quad 0x1e16000000000000 +rk12: +.quad 0x713c000000000000 +rk13: +.quad 0xf7f9000000000000 +rk14: +.quad 0x80a6000000000000 +rk15: +.quad 0x044c000000000000 +rk16: +.quad 0xe658000000000000 +rk17: +.quad 0xad18000000000000 +rk18: +.quad 0xa497000000000000 +rk19: +.quad 0x6ee3000000000000 +rk20: +.quad 0xe7b5000000000000 + + + +mask1: +.octa 0x80808080808080808080808080808080 +mask2: +.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + +SHUF_MASK: 
+.octa 0x000102030405060708090A0B0C0D0E0F + +pshufb_shf_table: +# use these values for shift constants for the pshufb instruction +# different alignments result in values as shown: +# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 +# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2 +# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3 +# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 +# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 +# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 +# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 +# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 +# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 +# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 +# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 +# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 +# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 +# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 +# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 +.octa 0x8f8e8d8c8b8a89888786858483828100 +.octa 0x000e0d0c0b0a09080706050403020100 diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c new file mode 100644 index 0000000..7845d7f --- /dev/null +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -0,0 +1,151 @@ +/* + * Cryptographic API. + * + * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions + * + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/crc-t10dif.h> +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <asm/i387.h> +#include <asm/cpufeature.h> +#include <asm/cpu_device_id.h> + +asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, + size_t len); + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* + * Steps through buffer one byte at a time, calculates reflected + * crc using table.
+ */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + if (irq_fpu_usable()) { + kernel_fpu_begin(); + ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); + kernel_fpu_end(); + } else + ctx->crc = crc_t10dif_generic(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + if (irq_fpu_usable()) { + kernel_fpu_begin(); + *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + kernel_fpu_end(); + } else + *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-pclmul", + .cra_priority = 200, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static const struct x86_cpu_id crct10dif_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); + +static int __init 
crct10dif_intel_mod_init(void) +{ + if (!x86_match_cpu(crct10dif_cpu_id)) + return -ENODEV; + + return crypto_register_shash(&alg); +} + +static void __exit crct10dif_intel_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_intel_mod_init); +module_exit(crct10dif_intel_mod_fini); + +MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crct10dif"); +MODULE_ALIAS("crct10dif-pclmul"); diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 597d4da..50226c4 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in) return 0; } -static struct shash_alg alg = { +static int sha224_ssse3_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; + + return 0; +} + +static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA256_DIGEST_SIZE]; + + sha256_ssse3_final(desc, D); + + memcpy(hash, D, SHA224_DIGEST_SIZE); + memset(D, 0, SHA256_DIGEST_SIZE); + + return 0; +} + +static struct shash_alg algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_ssse3_init, .update = sha256_ssse3_update, @@ -204,7 +233,24 @@ static struct shash_alg alg = { .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } -}; +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_ssse3_init, + .update = sha256_ssse3_update, + .final = sha224_ssse3_final, + .export = sha256_ssse3_export, + .import = sha256_ssse3_import, + .descsize = sizeof(struct sha256_state), + .statesize = 
sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) @@ -227,7 +273,7 @@ static bool __init avx_usable(void) static int __init sha256_ssse3_mod_init(void) { - /* test for SSE3 first */ + /* test for SSSE3 first */ if (cpu_has_ssse3) sha256_transform_asm = sha256_transform_ssse3; @@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void) else #endif pr_info("Using SSSE3 optimized SHA-256 implementation\n"); - return crypto_register_shash(&alg); + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } pr_info("Neither AVX nor SSSE3 is available/usable.\n"); @@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void) static void __exit sha256_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_init(sha256_ssse3_mod_init); @@ -273,3 +319,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS("sha256"); +MODULE_ALIAS("sha384"); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 6cbd8df..f30cd10 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in) return 0; } -static struct shash_alg alg = { +static int sha384_ssse3_init(struct shash_desc *desc) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA384_H0; + sctx->state[1] = SHA384_H1; + sctx->state[2] = SHA384_H2; + sctx->state[3] = SHA384_H3; + sctx->state[4] = SHA384_H4; + sctx->state[5] = SHA384_H5; + sctx->state[6] = SHA384_H6; + sctx->state[7] = SHA384_H7; + + sctx->count[0] = sctx->count[1] = 0; + + return 0; +} + +static int 
sha384_ssse3_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA512_DIGEST_SIZE]; + + sha512_ssse3_final(desc, D); + + memcpy(hash, D, SHA384_DIGEST_SIZE); + memset(D, 0, SHA512_DIGEST_SIZE); + + return 0; +} + +static struct shash_alg algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_ssse3_init, .update = sha512_ssse3_update, @@ -211,7 +241,24 @@ static struct shash_alg alg = { .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } -}; +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_ssse3_init, + .update = sha512_ssse3_update, + .final = sha384_ssse3_final, + .export = sha512_ssse3_export, + .import = sha512_ssse3_import, + .descsize = sizeof(struct sha512_state), + .statesize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) @@ -234,7 +281,7 @@ static bool __init avx_usable(void) static int __init sha512_ssse3_mod_init(void) { - /* test for SSE3 first */ + /* test for SSSE3 first */ if (cpu_has_ssse3) sha512_transform_asm = sha512_transform_ssse3; @@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void) else #endif pr_info("Using SSSE3 optimized SHA-512 implementation\n"); - return crypto_register_shash(&alg); + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } pr_info("Neither AVX nor SSSE3 is available/usable.\n"); @@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void) static void __exit sha512_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_init(sha512_ssse3_mod_init); @@ -280,3 +327,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS("sha512"); +MODULE_ALIAS("sha384"); diff --git 
a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S deleted file mode 100644 index e1a83b9..0000000 --- a/arch/x86/crypto/twofish-avx2-asm_64.S +++ /dev/null @@ -1,600 +0,0 @@ -/* - * x86_64/AVX2 assembler optimized version of Twofish - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include <linux/linkage.h> -#include "glue_helper-asm-avx2.S" - -.file "twofish-avx2-asm_64.S" - -.data -.align 16 - -.Lvpshufb_mask0: -.long 0x80808000 -.long 0x80808004 -.long 0x80808008 -.long 0x8080800c - -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.Lxts_gf128mul_and_shl1_mask_0: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 -.Lxts_gf128mul_and_shl1_mask_1: - .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 - -.text - -/* structure of crypto context */ -#define s0 0 -#define s1 1024 -#define s2 2048 -#define s3 3072 -#define w 4096 -#define k 4128 - -/* register macros */ -#define CTX %rdi - -#define RS0 CTX -#define RS1 %r8 -#define RS2 %r9 -#define RS3 %r10 -#define RK %r11 -#define RW %rax -#define RROUND %r12 -#define RROUNDd %r12d - -#define RA0 %ymm8 -#define RB0 %ymm9 -#define RC0 %ymm10 -#define RD0 %ymm11 -#define RA1 %ymm12 -#define RB1 %ymm13 -#define RC1 %ymm14 -#define RD1 %ymm15 - -/* temp regs */ -#define RX0 %ymm0 -#define RY0 %ymm1 -#define RX1 %ymm2 -#define RY1 %ymm3 -#define RT0 %ymm4 -#define RIDX %ymm5 - -#define RX0x %xmm0 -#define RY0x %xmm1 -#define RX1x %xmm2 -#define RY1x %xmm3 -#define RT0x %xmm4 - -/* vpgatherdd mask and '-1' */ -#define RNOT %ymm6 - -/* byte mask, (-1 >> 24) */ -#define RBYTE %ymm7 - -/********************************************************************** - 16-way 
AVX2 twofish - **********************************************************************/ -#define init_round_constants() \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpsrld $24, RNOT, RBYTE; \ - leaq k(CTX), RK; \ - leaq w(CTX), RW; \ - leaq s1(CTX), RS1; \ - leaq s2(CTX), RS2; \ - leaq s3(CTX), RS3; \ - -#define g16(ab, rs0, rs1, rs2, rs3, xy) \ - vpand RBYTE, ab ## 0, RIDX; \ - vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - \ - vpand RBYTE, ab ## 1, RIDX; \ - vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - \ - vpsrld $8, ab ## 0, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 0, xy ## 0; \ - \ - vpsrld $8, ab ## 1, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 1, xy ## 1; \ - \ - vpsrld $16, ab ## 0, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 0, xy ## 0; \ - \ - vpsrld $16, ab ## 1, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 1, xy ## 1; \ - \ - vpsrld $24, ab ## 0, RIDX; \ - vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 0, xy ## 0; \ - \ - vpsrld $24, ab ## 1, RIDX; \ - vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 1, xy ## 1; - -#define g1_16(a, x) \ - g16(a, RS0, RS1, RS2, RS3, x); - -#define g2_16(b, y) \ - g16(b, RS1, RS2, RS3, RS0, y); - -#define encrypt_round_end16(a, b, c, d, nk) \ - vpaddd RY0, RX0, RX0; \ - vpaddd RX0, RY0, RY0; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX0, RX0; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY0, RY0; \ - \ - vpxor RY0, d ## 0, d ## 0; \ - \ - vpxor RX0, c ## 0, c ## 0; \ - vpsrld $1, c ## 0, RT0; \ - vpslld $31, c ## 0, c ## 0; \ - 
vpor RT0, c ## 0, c ## 0; \ - \ - vpaddd RY1, RX1, RX1; \ - vpaddd RX1, RY1, RY1; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX1, RX1; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY1, RY1; \ - \ - vpxor RY1, d ## 1, d ## 1; \ - \ - vpxor RX1, c ## 1, c ## 1; \ - vpsrld $1, c ## 1, RT0; \ - vpslld $31, c ## 1, c ## 1; \ - vpor RT0, c ## 1, c ## 1; \ - -#define encrypt_round16(a, b, c, d, nk) \ - g2_16(b, RY); \ - \ - vpslld $1, b ## 0, RT0; \ - vpsrld $31, b ## 0, b ## 0; \ - vpor RT0, b ## 0, b ## 0; \ - \ - vpslld $1, b ## 1, RT0; \ - vpsrld $31, b ## 1, b ## 1; \ - vpor RT0, b ## 1, b ## 1; \ - \ - g1_16(a, RX); \ - \ - encrypt_round_end16(a, b, c, d, nk); - -#define encrypt_round_first16(a, b, c, d, nk) \ - vpslld $1, d ## 0, RT0; \ - vpsrld $31, d ## 0, d ## 0; \ - vpor RT0, d ## 0, d ## 0; \ - \ - vpslld $1, d ## 1, RT0; \ - vpsrld $31, d ## 1, d ## 1; \ - vpor RT0, d ## 1, d ## 1; \ - \ - encrypt_round16(a, b, c, d, nk); - -#define encrypt_round_last16(a, b, c, d, nk) \ - g2_16(b, RY); \ - \ - g1_16(a, RX); \ - \ - encrypt_round_end16(a, b, c, d, nk); - -#define decrypt_round_end16(a, b, c, d, nk) \ - vpaddd RY0, RX0, RX0; \ - vpaddd RX0, RY0, RY0; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX0, RX0; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY0, RY0; \ - \ - vpxor RX0, c ## 0, c ## 0; \ - \ - vpxor RY0, d ## 0, d ## 0; \ - vpsrld $1, d ## 0, RT0; \ - vpslld $31, d ## 0, d ## 0; \ - vpor RT0, d ## 0, d ## 0; \ - \ - vpaddd RY1, RX1, RX1; \ - vpaddd RX1, RY1, RY1; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX1, RX1; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY1, RY1; \ - \ - vpxor RX1, c ## 1, c ## 1; \ - \ - vpxor RY1, d ## 1, d ## 1; \ - vpsrld $1, d ## 1, RT0; \ - vpslld $31, d ## 1, d ## 1; \ - vpor RT0, d ## 1, d ## 1; - -#define decrypt_round16(a, b, c, d, nk) \ - g1_16(a, RX); \ - \ - vpslld $1, a ## 0, RT0; \ - vpsrld $31, a ## 0, a ## 0; \ - vpor RT0, a ## 0, a ## 0; \ - \ 
- vpslld $1, a ## 1, RT0; \ - vpsrld $31, a ## 1, a ## 1; \ - vpor RT0, a ## 1, a ## 1; \ - \ - g2_16(b, RY); \ - \ - decrypt_round_end16(a, b, c, d, nk); - -#define decrypt_round_first16(a, b, c, d, nk) \ - vpslld $1, c ## 0, RT0; \ - vpsrld $31, c ## 0, c ## 0; \ - vpor RT0, c ## 0, c ## 0; \ - \ - vpslld $1, c ## 1, RT0; \ - vpsrld $31, c ## 1, c ## 1; \ - vpor RT0, c ## 1, c ## 1; \ - \ - decrypt_round16(a, b, c, d, nk) - -#define decrypt_round_last16(a, b, c, d, nk) \ - g1_16(a, RX); \ - \ - g2_16(b, RY); \ - \ - decrypt_round_end16(a, b, c, d, nk); - -#define encrypt_cycle16() \ - encrypt_round16(RA, RB, RC, RD, 0); \ - encrypt_round16(RC, RD, RA, RB, 8); - -#define encrypt_cycle_first16() \ - encrypt_round_first16(RA, RB, RC, RD, 0); \ - encrypt_round16(RC, RD, RA, RB, 8); - -#define encrypt_cycle_last16() \ - encrypt_round16(RA, RB, RC, RD, 0); \ - encrypt_round_last16(RC, RD, RA, RB, 8); - -#define decrypt_cycle16(n) \ - decrypt_round16(RC, RD, RA, RB, 8); \ - decrypt_round16(RA, RB, RC, RD, 0); - -#define decrypt_cycle_first16(n) \ - decrypt_round_first16(RC, RD, RA, RB, 8); \ - decrypt_round16(RA, RB, RC, RD, 0); - -#define decrypt_cycle_last16(n) \ - decrypt_round16(RC, RD, RA, RB, 8); \ - decrypt_round_last16(RA, RB, RC, RD, 0); - -#define transpose_4x4(x0,x1,x2,x3,t1,t2) \ - vpunpckhdq x1, x0, t2; \ - vpunpckldq x1, x0, x0; \ - \ - vpunpckldq x3, x2, t1; \ - vpunpckhdq x3, x2, x2; \ - \ - vpunpckhqdq t1, x0, x1; \ - vpunpcklqdq t1, x0, x0; \ - \ - vpunpckhqdq x2, t2, x3; \ - vpunpcklqdq x2, t2, x2; - -#define read_blocks8(offs,a,b,c,d) \ - transpose_4x4(a, b, c, d, RX0, RY0); - -#define write_blocks8(offs,a,b,c,d) \ - transpose_4x4(a, b, c, d, RX0, RY0); - -#define inpack_enc8(a,b,c,d) \ - vpbroadcastd 4*0(RW), RT0; \ - vpxor RT0, a, a; \ - \ - vpbroadcastd 4*1(RW), RT0; \ - vpxor RT0, b, b; \ - \ - vpbroadcastd 4*2(RW), RT0; \ - vpxor RT0, c, c; \ - \ - vpbroadcastd 4*3(RW), RT0; \ - vpxor RT0, d, d; - -#define outunpack_enc8(a,b,c,d) \ - 
vpbroadcastd 4*4(RW), RX0; \ - vpbroadcastd 4*5(RW), RY0; \ - vpxor RX0, c, RX0; \ - vpxor RY0, d, RY0; \ - \ - vpbroadcastd 4*6(RW), RT0; \ - vpxor RT0, a, c; \ - vpbroadcastd 4*7(RW), RT0; \ - vpxor RT0, b, d; \ - \ - vmovdqa RX0, a; \ - vmovdqa RY0, b; - -#define inpack_dec8(a,b,c,d) \ - vpbroadcastd 4*4(RW), RX0; \ - vpbroadcastd 4*5(RW), RY0; \ - vpxor RX0, a, RX0; \ - vpxor RY0, b, RY0; \ - \ - vpbroadcastd 4*6(RW), RT0; \ - vpxor RT0, c, a; \ - vpbroadcastd 4*7(RW), RT0; \ - vpxor RT0, d, b; \ - \ - vmovdqa RX0, c; \ - vmovdqa RY0, d; - -#define outunpack_dec8(a,b,c,d) \ - vpbroadcastd 4*0(RW), RT0; \ - vpxor RT0, a, a; \ - \ - vpbroadcastd 4*1(RW), RT0; \ - vpxor RT0, b, b; \ - \ - vpbroadcastd 4*2(RW), RT0; \ - vpxor RT0, c, c; \ - \ - vpbroadcastd 4*3(RW), RT0; \ - vpxor RT0, d, d; - -#define read_blocks16(a,b,c,d) \ - read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ - read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); - -#define write_blocks16(a,b,c,d) \ - write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ - write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); - -#define xor_blocks16(a,b,c,d) \ - xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ - xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); - -#define inpack_enc16(a,b,c,d) \ - inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ - inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); - -#define outunpack_enc16(a,b,c,d) \ - outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ - outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); - -#define inpack_dec16(a,b,c,d) \ - inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ - inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); - -#define outunpack_dec16(a,b,c,d) \ - outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ - outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); - -.align 8 -__twofish_enc_blk16: - /* input: - * %rdi: ctx, CTX - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext - * output: - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext - */ - init_round_constants(); - - 
read_blocks16(RA, RB, RC, RD); - inpack_enc16(RA, RB, RC, RD); - - xorl RROUNDd, RROUNDd; - encrypt_cycle_first16(); - movl $2, RROUNDd; - -.align 4 -.L__enc_loop: - encrypt_cycle16(); - - addl $2, RROUNDd; - cmpl $14, RROUNDd; - jne .L__enc_loop; - - encrypt_cycle_last16(); - - outunpack_enc16(RA, RB, RC, RD); - write_blocks16(RA, RB, RC, RD); - - ret; -ENDPROC(__twofish_enc_blk16) - -.align 8 -__twofish_dec_blk16: - /* input: - * %rdi: ctx, CTX - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext - * output: - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext - */ - init_round_constants(); - - read_blocks16(RA, RB, RC, RD); - inpack_dec16(RA, RB, RC, RD); - - movl $14, RROUNDd; - decrypt_cycle_first16(); - movl $12, RROUNDd; - -.align 4 -.L__dec_loop: - decrypt_cycle16(); - - addl $-2, RROUNDd; - jnz .L__dec_loop; - - decrypt_cycle_last16(); - - outunpack_dec16(RA, RB, RC, RD); - write_blocks16(RA, RB, RC, RD); - - ret; -ENDPROC(__twofish_dec_blk16) - -ENTRY(twofish_ecb_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - pushq %r12; - - load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - call __twofish_enc_blk16; - - store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_ecb_enc_16way) - -ENTRY(twofish_ecb_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - pushq %r12; - - load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - call __twofish_dec_blk16; - - store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_ecb_dec_16way) - -ENTRY(twofish_cbc_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - pushq %r12; - - load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - call __twofish_dec_blk16; - - store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1, - RX0); - - popq %r12; - vzeroupper; 
- - ret; -ENDPROC(twofish_cbc_dec_16way) - -ENTRY(twofish_ctr_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (little endian, 128bit) - */ - - vzeroupper; - pushq %r12; - - load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1, - RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, - RBYTE); - - call __twofish_enc_blk16; - - store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_ctr_16way) - -.align 8 -twofish_xts_crypt_16way: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - * %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16 - */ - - vzeroupper; - pushq %r12; - - load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, - RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, - .Lxts_gf128mul_and_shl1_mask_0, - .Lxts_gf128mul_and_shl1_mask_1); - - call *%r8; - - store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_xts_crypt_16way) - -ENTRY(twofish_xts_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - leaq __twofish_enc_blk16, %r8; - jmp twofish_xts_crypt_16way; -ENDPROC(twofish_xts_enc_16way) - -ENTRY(twofish_xts_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - leaq __twofish_dec_blk16, %r8; - jmp twofish_xts_crypt_16way; -ENDPROC(twofish_xts_dec_16way) diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c deleted file mode 100644 index ce33b5b..0000000 --- a/arch/x86/crypto/twofish_avx2_glue.c +++ /dev/null @@ -1,584 +0,0 @@ -/* - * Glue Code for x86_64/AVX2 assembler optimized version of Twofish - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> - * - 
* This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/crypto.h> -#include <linux/err.h> -#include <crypto/algapi.h> -#include <crypto/ctr.h> -#include <crypto/twofish.h> -#include <crypto/lrw.h> -#include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> -#include <asm/crypto/twofish.h> -#include <asm/crypto/ablk_helper.h> -#include <asm/crypto/glue_helper.h> -#include <crypto/scatterwalk.h> - -#define TF_AVX2_PARALLEL_BLOCKS 16 - -/* 16-way AVX2 parallel cipher functions */ -asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); - -asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src, - le128 *iv); - -asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); - -static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_3way(ctx, dst, src, false); -} - -static const struct common_glue_ctx twofish_enc = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) } - }, { - .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } - }, { - .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } - }, { - .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } - } } -}; - -static const struct common_glue_ctx twofish_ctr = { - .num_funcs = 
4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) } - }, { - .num_blocks = 8, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } - }, { - .num_blocks = 3, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } - }, { - .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } - } } -}; - -static const struct common_glue_ctx twofish_enc_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) } - }, { - .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } - }, { - .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } - } } -}; - -static const struct common_glue_ctx twofish_dec = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) } - }, { - .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } - }, { - .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } - }, { - .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } - } } -}; - -static const struct common_glue_ctx twofish_dec_cbc = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) } - }, { - .num_blocks = 8, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } - }, { - .num_blocks = 3, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } - }, { - .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } - } } -}; - -static const struct common_glue_ctx twofish_dec_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) } - }, { - .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } - 
}, { - .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } - } } -}; - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, - dst, src, nbytes); -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, - nbytes); -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); -} - -static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) -{ - /* since reusing AVX functions, starts using FPU at 8 parallel blocks */ - return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); -} - -static inline void twofish_fpu_end(bool fpu_enabled) -{ - glue_fpu_end(fpu_enabled); -} - -struct crypt_priv { - struct twofish_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = TF_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); - - while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { - twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; - } - - while (nbytes >= 8 * 
bsize) { - twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 8; - nbytes -= bsize * 8; - } - - while (nbytes >= 3 * bsize) { - twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 3; - nbytes -= bsize * 3; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_enc_blk(ctx->ctx, srcdst, srcdst); -} - -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = TF_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); - - while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { - twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; - } - - while (nbytes >= 8 * bsize) { - twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 8; - nbytes -= bsize * 8; - } - - while (nbytes >= 3 * bsize) { - twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 3; - nbytes -= bsize * 3; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_dec_blk(ctx->ctx, srcdst, srcdst); -} - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[TF_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->twofish_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_lrw_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); - be128 buf[TF_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->twofish_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static struct crypto_alg tf_algs[10] = { { - .cra_name = "__ecb-twofish-avx2", - .cra_driver_name = "__driver-ecb-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-twofish-avx2", - .cra_driver_name = "__driver-cbc-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - 
.cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-twofish-avx2", - .cra_driver_name = "__driver-ctr-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = twofish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-twofish-avx2", - .cra_driver_name = "__driver-lrw-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_twofish_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + - TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + - TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = lrw_twofish_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-twofish-avx2", - .cra_driver_name = "__driver-xts-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE * 2, - .max_keysize = TF_MAX_KEY_SIZE * 2, - .ivsize = TF_BLOCK_SIZE, - .setkey = xts_twofish_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(twofish)", - 
.cra_driver_name = "ecb-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(twofish)", - .cra_driver_name = "cbc-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(twofish)", - .cra_driver_name = "ctr-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(twofish)", - .cra_driver_name = "lrw-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), 
- .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + - TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + - TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(twofish)", - .cra_driver_name = "xts-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE * 2, - .max_keysize = TF_MAX_KEY_SIZE * 2, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -} }; - -static int __init init(void) -{ - u64 xcr0; - - if (!cpu_has_avx2 || !cpu_has_osxsave) { - pr_info("AVX2 instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX2 detected but unusable.\n"); - return -ENODEV; - } - - return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); -} - -static void __exit fini(void) -{ - crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized"); -MODULE_ALIAS("twofish"); -MODULE_ALIAS("twofish-asm"); diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 2047a56..a62ba54 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -50,26 +50,18 @@ /* 8-way parallel cipher functions */ asmlinkage void 
twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way); - asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way); asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way); - asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(twofish_ctr_8way); asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(twofish_xts_enc_8way); asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(twofish_xts_dec_8way); static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) @@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, __twofish_enc_blk_3way(ctx, dst, src, false); } -void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(twofish_enc_blk)); } -EXPORT_SYMBOL_GPL(twofish_xts_enc); -void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(twofish_dec_blk)); } -EXPORT_SYMBOL_GPL(twofish_xts_dec); static const struct common_glue_ctx twofish_enc = { diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 52ff81c..bae3aba 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -308,8 +308,6 @@ static int load_aout_binary(struct linux_binprm *bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = 
TASK_UNMAPPED_BASE; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index b31bf97..2dfac58 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -111,7 +111,7 @@ static inline void acpi_disable_pci(void) } /* Low-level suspend routine. */ -extern int acpi_suspend_lowlevel(void); +extern int (*acpi_suspend_lowlevel)(void); /* Physical address to resume after wakeup */ #define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start)) diff --git a/arch/x86/include/asm/crypto/blowfish.h b/arch/x86/include/asm/crypto/blowfish.h deleted file mode 100644 index f097b2f..0000000 --- a/arch/x86/include/asm/crypto/blowfish.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef ASM_X86_BLOWFISH_H -#define ASM_X86_BLOWFISH_H - -#include <linux/crypto.h> -#include <crypto/blowfish.h> - -#define BF_PARALLEL_BLOCKS 4 - -/* regular block cipher functions */ -asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, - bool xor); -asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); - -/* 4-way parallel cipher functions */ -asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); - -static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) -{ - __blowfish_enc_blk(ctx, dst, src, false); -} - -static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk(ctx, dst, src, true); -} - -static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk_4way(ctx, dst, src, false); -} - -static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk_4way(ctx, dst, src, true); -} - -#endif diff 
--git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index e655c60..878c51c 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -28,20 +28,6 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -/* 8-way parallel cipher functions */ -asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); - /* helpers from twofish_x86_64-3way module */ extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, @@ -57,8 +43,4 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); -/* helpers from twofish-avx module */ -extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); -extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); - #endif /* ASM_X86_TWOFISH_H */ diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index 75ce3f4..77a99ac 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -1,18 +1,6 @@ #ifndef _ASM_X86_EMERGENCY_RESTART_H #define _ASM_X86_EMERGENCY_RESTART_H -enum reboot_type { - BOOT_TRIPLE = 't', - BOOT_KBD = 'k', - BOOT_BIOS = 'b', - BOOT_ACPI = 'a', - 
BOOT_EFI = 'e', - BOOT_CF9 = 'p', - BOOT_CF9_COND = 'q', -}; - -extern enum reboot_type reboot_type; - extern void machine_emergency_restart(void); #endif /* _ASM_X86_EMERGENCY_RESTART_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d8e8eef..34f69cb 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -345,4 +345,11 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff +#ifdef CONFIG_MTRR +extern int __must_check arch_phys_wc_add(unsigned long base, + unsigned long size); +extern void arch_phys_wc_del(int handle); +#define arch_phys_wc_add arch_phys_wc_add +#endif + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index d354fb7..a55c7ef 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -95,8 +95,8 @@ static inline unsigned char current_lock_cmos_reg(void) unsigned char rtc_cmos_read(unsigned char addr); void rtc_cmos_write(unsigned char val, unsigned char addr); -extern int mach_set_rtc_mmss(unsigned long nowtime); -extern unsigned long mach_get_cmos_time(void); +extern int mach_set_rtc_mmss(const struct timespec *now); +extern void mach_get_cmos_time(struct timespec *now); #define RTC_IRQ 8 diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h index 73668ab..1e69a75 100644 --- a/arch/x86/include/asm/mrst-vrtc.h +++ b/arch/x86/include/asm/mrst-vrtc.h @@ -3,7 +3,7 @@ extern unsigned char vrtc_cmos_read(unsigned char reg); extern void vrtc_cmos_write(unsigned char val, unsigned char reg); -extern unsigned long vrtc_get_time(void); -extern int vrtc_set_mmss(unsigned long nowtime); +extern void vrtc_get_time(struct timespec *now); +extern int vrtc_set_mmss(const struct timespec *now); #endif diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index e235582..f768f62 100644 --- a/arch/x86/include/asm/mtrr.h +++ 
b/arch/x86/include/asm/mtrr.h @@ -26,7 +26,10 @@ #include <uapi/asm/mtrr.h> -/* The following functions are for use by other drivers */ +/* + * The following functions are for use by other drivers that cannot use + * arch_phys_wc_add and arch_phys_wc_del. + */ # ifdef CONFIG_MTRR extern u8 mtrr_type_lookup(u64 addr, u64 end); extern void mtrr_save_fixed_ranges(void *); @@ -45,6 +48,7 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +extern int phys_wc_to_mtrr_index(int handle); # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { @@ -80,6 +84,10 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } +static inline int phys_wc_to_mtrr_index(int handle) +{ + return -1; +} #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5b0818b..7dc305a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -207,7 +207,7 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY); + return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -271,7 +271,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd) static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY); + return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_mkhuge(pmd_t pmd) @@ -294,6 +294,26 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT); } +static inline int pte_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + +static inline int pmd_soft_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; +} + +static inline 
pte_t pte_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e642300..c98ac63 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -55,6 +55,18 @@ #define _PAGE_HIDDEN (_AT(pteval_t, 0)) #endif +/* + * The same hidden bit is used by kmemcheck, but since kmemcheck + * works on kernel pages while soft-dirty engine on user space, + * they do not conflict with each other. + */ + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a1df6e8..2781119 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -89,7 +89,6 @@ struct thread_info { #define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ -#define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ @@ -113,7 +112,6 @@ struct thread_info { #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) -#define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) @@ -154,7 +152,7 @@ struct 
thread_info { (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define PREEMPT_ACTIVE 0x10000000 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index d8d9922..828a156 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -142,6 +142,8 @@ struct x86_cpuinit_ops { void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; +struct timespec; + /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC @@ -156,8 +158,8 @@ struct x86_cpuinit_ops { */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); - unsigned long (*get_wallclock)(void); - int (*set_wallclock)(unsigned long nowtime); + void (*get_wallclock)(struct timespec *ts); + int (*set_wallclock)(const struct timespec *ts); void (*iommu_shutdown)(void); bool (*is_untracked_pat_range)(u64 start, u64 end); void (*nmi_init)(void); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 230c8ea..d81a972 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -44,6 +44,7 @@ #include <asm/mpspec.h> #include <asm/smp.h> +#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ static int __initdata acpi_force = 0; u32 acpi_rsdt_forced; int acpi_disabled; @@ -559,6 +560,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity) = acpi_register_gsi_pic; +#ifdef CONFIG_ACPI_SLEEP +int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel; +#else +int (*acpi_suspend_lowlevel)(void); +#endif + /* * success: return IRQ number (>=0) * failure: return < 0 diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index b44577b..2a34aaf 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ 
b/arch/x86/kernel/acpi/sleep.c @@ -26,12 +26,12 @@ static char temp_stack[4096]; #endif /** - * acpi_suspend_lowlevel - save kernel state + * x86_acpi_suspend_lowlevel - save kernel state * * Create an identity mapped page table and copy the wakeup routine to * low memory. */ -int acpi_suspend_lowlevel(void) +int x86_acpi_suspend_lowlevel(void) { struct wakeup_header *header = (struct wakeup_header *) __va(real_mode_header->wakeup_header); diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index 67f59f8..c9c2c98 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -15,3 +15,5 @@ extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); extern void do_suspend_lowlevel(void); + +extern int x86_acpi_suspend_lowlevel(void); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 39cc7f7..63092af 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -25,6 +25,7 @@ #include <linux/kdebug.h> #include <linux/delay.h> #include <linux/crash_dump.h> +#include <linux/reboot.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -36,7 +37,6 @@ #include <asm/ipi.h> #include <asm/smp.h> #include <asm/x86_init.h> -#include <asm/emergency-restart.h> #include <asm/nmi.h> /* BMC sets a bit this MMR non-zero before sending an NMI */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5013a48..c587a87 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -90,7 +90,7 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) { u32 l, h; - int mbytes = num_physpages >> (20-PAGE_SHIFT); + int mbytes = get_num_physpages() >> (20-PAGE_SHIFT); if (c->x86_model < 6) { /* Based on AMD doc 20734R - June 2000 */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index ca22b73..f961de9 100644 --- 
a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -51,9 +51,13 @@ #include <asm/e820.h> #include <asm/mtrr.h> #include <asm/msr.h> +#include <asm/pat.h> #include "mtrr.h" +/* arch_phys_wc_add returns an MTRR register index plus this offset. */ +#define MTRR_TO_PHYS_WC_OFFSET 1000 + u32 num_var_ranges; unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; @@ -525,6 +529,73 @@ int mtrr_del(int reg, unsigned long base, unsigned long size) } EXPORT_SYMBOL(mtrr_del); +/** + * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable + * @base: Physical base address + * @size: Size of region + * + * If PAT is available, this does nothing. If PAT is unavailable, it + * attempts to add a WC MTRR covering size bytes starting at base and + * logs an error if this fails. + * + * Drivers must store the return value to pass to mtrr_del_wc_if_needed, + * but drivers should not try to interpret that return value. + */ +int arch_phys_wc_add(unsigned long base, unsigned long size) +{ + int ret; + + if (pat_enabled) + return 0; /* Success! (We don't need to do anything.) */ + + ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); + if (ret < 0) { + pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.", + (void *)base, (void *)(base + size - 1)); + return ret; + } + return ret + MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL(arch_phys_wc_add); + +/* + * arch_phys_wc_del - undoes arch_phys_wc_add + * @handle: Return value from arch_phys_wc_add + * + * This cleans up after mtrr_add_wc_if_needed. + * + * The API guarantees that mtrr_del_wc_if_needed(error code) and + * mtrr_del_wc_if_needed(0) do nothing. 
+ */ +void arch_phys_wc_del(int handle) +{ + if (handle >= 1) { + WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET); + mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0); + } +} +EXPORT_SYMBOL(arch_phys_wc_del); + +/* + * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * @handle: Return value from arch_phys_wc_add + * + * This will turn the return value from arch_phys_wc_add into an mtrr + * index suitable for debugging. + * + * Note: There is no legitimate use for this function, except possibly + * in printk line. Alas there is an illegitimate use in some ancient + * drm ioctls. + */ +int phys_wc_to_mtrr_index(int handle) +{ + if (handle < MTRR_TO_PHYS_WC_OFFSET) + return -1; + else + return handle - MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); + /* * HACK ALERT! * These should be called implicitly, but we can't yet until all the initcall diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c index 0db655e..639d128 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c @@ -491,10 +491,8 @@ static struct perf_amd_iommu __perf_iommu = { static __init int amd_iommu_pc_init(void) { /* Make sure the IOMMU PC resource is available */ - if (!amd_iommu_pc_supported()) { - pr_err("perf: amd_iommu PMU not installed. 
No support!\n"); + if (!amd_iommu_pc_supported()) return -ENODEV; - } _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 7b3fe56..31f0f33 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -11,10 +11,10 @@ const char *const x86_power_flags[32] = { "fid", /* frequency id control */ "vid", /* voltage id control */ "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", + "tm", /* hardware thermal control */ + "stc", /* software thermal control */ + "100mhzsteps", /* 100 MHz multiplier control */ + "hwpstate", /* hardware P-state control */ "", /* tsc invariant mapped to constant_tsc */ "cpb", /* core performance boost */ "eff_freq_ro", /* Readonly aperf/mperf */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index b158152..4934890 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -364,9 +364,7 @@ static void dt_add_ioapic_domain(unsigned int ioapic_num, * and assigned so we can keep the 1:1 mapping which the ioapic * is having. 
*/ - ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); - if (ret) - pr_err("Error mapping legacy IRQs: %d\n", ret); + irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); if (num > NR_IRQS_LEGACY) { ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY, diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 02f0763..f66ff16 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -393,6 +393,9 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) unregister_hw_breakpoint(t->ptrace_bps[i]); t->ptrace_bps[i] = NULL; } + + t->debugreg6 = 0; + t->ptrace_dr7 = 0; } void hw_breakpoint_restore(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 3dd37eb..1f354f4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -48,10 +48,9 @@ static struct pvclock_wall_clock wall_clock; * have elapsed since the hypervisor wrote the data. So we try to account for * that with system time */ -static unsigned long kvm_get_wallclock(void) +static void kvm_get_wallclock(struct timespec *now) { struct pvclock_vcpu_time_info *vcpu_time; - struct timespec ts; int low, high; int cpu; @@ -64,14 +63,12 @@ static unsigned long kvm_get_wallclock(void) cpu = smp_processor_id(); vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); + pvclock_read_wallclock(&wall_clock, vcpu_time, now); preempt_enable(); - - return ts.tv_sec; } -static int kvm_set_wallclock(unsigned long now) +static int kvm_set_wallclock(const struct timespec *now) { return -1; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 29a8120..7461f50 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -601,30 +601,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static int -ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, - struct task_struct *tsk, int disabled) +static int 
ptrace_fill_bp_fields(struct perf_event_attr *attr, + int len, int type, bool disabled) +{ + int err, bp_len, bp_type; + + err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); + if (!err) { + attr->bp_len = bp_len; + attr->bp_type = bp_type; + attr->disabled = disabled; + } + + return err; +} + +static struct perf_event * +ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, + unsigned long addr, bool disabled) { - int err; - int gen_len, gen_type; struct perf_event_attr attr; + int err; - /* - * We should have at least an inactive breakpoint at this - * slot. It means the user is writing dr7 without having - * written the address register first - */ - if (!bp) - return -EINVAL; + ptrace_breakpoint_init(&attr); + attr.bp_addr = addr; - err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) - return err; + return ERR_PTR(err); + + return register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); +} - attr = bp->attr; - attr.bp_len = gen_len; - attr.bp_type = gen_type; - attr.disabled = disabled; +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + int disabled) +{ + struct perf_event_attr attr = bp->attr; + int err; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return err; return modify_user_hw_breakpoint(bp, &attr); } @@ -634,67 +652,50 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; - int i, orig_ret = 0, rc = 0; - int enabled, second_pass = 0; - unsigned len, type; - struct perf_event *bp; - - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; + bool second_pass = false; + int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); + restore: - /* - 
* Loop through all the hardware breakpoints, making the - * appropriate changes to each. - */ + rc = 0; for (i = 0; i < HBP_NUM; i++) { - enabled = decode_dr7(data, i, &len, &type); - bp = thread->ptrace_bps[i]; - - if (!enabled) { - if (bp) { - /* - * Don't unregister the breakpoints right-away, - * unless all register_user_hw_breakpoint() - * requests have succeeded. This prevents - * any window of opportunity for debug - * register grabbing by other users. - */ - if (!second_pass) - continue; - - rc = ptrace_modify_breakpoint(bp, len, type, - tsk, 1); - if (rc) - break; + unsigned len, type; + bool disabled = !decode_dr7(data, i, &len, &type); + struct perf_event *bp = thread->ptrace_bps[i]; + + if (!bp) { + if (disabled) + continue; + + bp = ptrace_register_breakpoint(tsk, + len, type, 0, disabled); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + break; } + + thread->ptrace_bps[i] = bp; continue; } - rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } - /* - * Make a second pass to free the remaining unused breakpoints - * or to restore the original breakpoints if an error occurred. - */ - if (!second_pass) { - second_pass = 1; - if (rc < 0) { - orig_ret = rc; - data = old_dr7; - } + + /* Restore if the first pass failed, second_pass shouldn't fail. */ + if (rc && !WARN_ON(second_pass)) { + ret = rc; + data = old_dr7; + second_pass = true; goto restore; } - ptrace_put_breakpoints(tsk); - - return ((orig_ret < 0) ? 
orig_ret : rc); + return ret; } /* @@ -702,25 +703,17 @@ restore: */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { - struct perf_event *bp; + struct perf_event *bp = thread->ptrace_bps[n]; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - - bp = thread->ptrace_bps[n]; - if (!bp) - val = 0; - else + if (bp) val = bp->hw.info.address; - - ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; - } else if (n == 7) { + } else if (n == 7) { val = thread->ptrace_dr7; } return val; @@ -729,29 +722,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { - struct perf_event *bp; struct thread_struct *t = &tsk->thread; - struct perf_event_attr attr; + struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - - if (!t->ptrace_bps[nr]) { - ptrace_breakpoint_init(&attr); - /* - * Put stub len and type to register (reserve) an inactive but - * correct bp - */ - attr.bp_addr = addr; - attr.bp_len = HW_BREAKPOINT_LEN_1; - attr.bp_type = HW_BREAKPOINT_W; - attr.disabled = 1; - - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, - NULL, tsk); - + if (!bp) { /* + * Put stub len and type to create an inactive but correct bp. + * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case. @@ -760,22 +738,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. 
*/ - if (IS_ERR(bp)) { + bp = ptrace_register_breakpoint(tsk, + X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, + addr, true); + if (IS_ERR(bp)) err = PTR_ERR(bp); - goto put; - } - - t->ptrace_bps[nr] = bp; + else + t->ptrace_bps[nr] = bp; } else { - bp = t->ptrace_bps[nr]; + struct perf_event_attr attr = bp->attr; - attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } -put: - ptrace_put_breakpoints(tsk); return err; } @@ -785,30 +761,20 @@ put: static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { - struct thread_struct *thread = &(tsk->thread); - int rc = 0; - + struct thread_struct *thread = &tsk->thread; /* There are no DR4 or DR5 registers */ - if (n == 4 || n == 5) - return -EIO; + int rc = -EIO; - if (n == 6) { - thread->debugreg6 = val; - goto ret_path; - } if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); - if (rc) - return rc; - } - /* All that's left is DR7 */ - if (n == 7) { + } else if (n == 6) { + thread->debugreg6 = val; + rc = 0; + } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } - -ret_path: return rc; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 76fa1e9..563ed91 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -36,22 +36,6 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; -static int reboot_mode; -enum reboot_type reboot_type = BOOT_ACPI; -int reboot_force; - -/* - * This variable is used privately to keep track of whether or not - * reboot_type is still set to its default value (i.e., reboot= hasn't - * been set on the command line). This is needed so that we can - * suppress DMI scanning for reboot quirks. Without it, it's - * impossible to override a faulty reboot quirk without recompiling. 
- */ -static int reboot_default = 1; - -#ifdef CONFIG_SMP -static int reboot_cpu = -1; -#endif /* * This is set if we need to go through the 'emergency' path. @@ -64,79 +48,6 @@ static int reboot_emergency; bool port_cf9_safe = false; /* - * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] - * warm Don't set the cold reboot flag - * cold Set the cold reboot flag - * bios Reboot by jumping through the BIOS - * smp Reboot by executing reset on BSP or other CPU - * triple Force a triple fault (init) - * kbd Use the keyboard controller. cold reset (default) - * acpi Use the RESET_REG in the FADT - * efi Use efi reset_system runtime service - * pci Use the so-called "PCI reset register", CF9 - * force Avoid anything that could hang. - */ -static int __init reboot_setup(char *str) -{ - for (;;) { - /* - * Having anything passed on the command line via - * reboot= will cause us to disable DMI checking - * below. - */ - reboot_default = 0; - - switch (*str) { - case 'w': - reboot_mode = 0x1234; - break; - - case 'c': - reboot_mode = 0; - break; - -#ifdef CONFIG_SMP - case 's': - if (isdigit(*(str+1))) { - reboot_cpu = (int) (*(str+1) - '0'); - if (isdigit(*(str+2))) - reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); - } - /* - * We will leave sorting out the final value - * when we are ready to reboot, since we might not - * have detected BSP APIC ID or smp_num_cpu - */ - break; -#endif /* CONFIG_SMP */ - - case 'b': - case 'a': - case 'k': - case 't': - case 'e': - case 'p': - reboot_type = *str; - break; - - case 'f': - reboot_force = 1; - break; - } - - str = strchr(str, ','); - if (str) - str++; - else - break; - } - return 1; -} - -__setup("reboot=", reboot_setup); - - -/* * Reboot options and system auto-detection code provided by * Dell Inc. so their systems "just work". 
:-) */ @@ -536,6 +447,7 @@ static void native_machine_emergency_restart(void) int i; int attempt = 0; int orig_reboot_type = reboot_type; + unsigned short mode; if (reboot_emergency) emergency_vmx_disable_all(); @@ -543,7 +455,8 @@ static void native_machine_emergency_restart(void) tboot_shutdown(TB_SHUTDOWN_REBOOT); /* Tell the BIOS if we want cold or warm reboot */ - *((unsigned short *)__va(0x472)) = reboot_mode; + mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; + *((unsigned short *)__va(0x472)) = mode; for (;;) { /* Could also try the reset bit in the Hammer NB */ @@ -585,7 +498,7 @@ static void native_machine_emergency_restart(void) case BOOT_EFI: if (efi_enabled(EFI_RUNTIME_SERVICES)) - efi.reset_system(reboot_mode ? + efi.reset_system(reboot_mode == REBOOT_WARM ? EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); @@ -614,26 +527,10 @@ void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP - - /* The boot cpu is always logical cpu 0 */ - int reboot_cpu_id = 0; - - /* See if there has been given a command line override */ - if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && - cpu_online(reboot_cpu)) - reboot_cpu_id = reboot_cpu; - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(reboot_cpu_id)) - reboot_cpu_id = smp_processor_id(); - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); - /* - * O.K Now that I'm on the appropriate processor, stop all of the - * others. Also disable the local irq to not receive the per-cpu - * timer interrupt which may trigger scheduler's load balance. + * Stop all of the others. Also disable the local irq to + * not receive the per-cpu timer interrupt which may trigger + * scheduler's load balance. 
*/ local_irq_disable(); stop_other_cpus(); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 198eb20..0aa2939 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -38,8 +38,9 @@ EXPORT_SYMBOL(rtc_lock); * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. */ -int mach_set_rtc_mmss(unsigned long nowtime) +int mach_set_rtc_mmss(const struct timespec *now) { + unsigned long nowtime = now->tv_sec; struct rtc_time tm; int retval = 0; @@ -58,7 +59,7 @@ int mach_set_rtc_mmss(unsigned long nowtime) return retval; } -unsigned long mach_get_cmos_time(void) +void mach_get_cmos_time(struct timespec *now) { unsigned int status, year, mon, day, hour, min, sec, century = 0; unsigned long flags; @@ -107,7 +108,8 @@ unsigned long mach_get_cmos_time(void) } else year += CMOS_YEARS_OFFS; - return mktime(year, mon, day, hour, min, sec); + now->tv_sec = mktime(year, mon, day, hour, min, sec); + now->tv_nsec = 0; } /* Routines for accessing the CMOS RAM/RTC. */ @@ -135,18 +137,13 @@ EXPORT_SYMBOL(rtc_cmos_write); int update_persistent_clock(struct timespec now) { - return x86_platform.set_wallclock(now.tv_sec); + return x86_platform.set_wallclock(&now); } /* not static: needed by APM */ void read_persistent_clock(struct timespec *ts) { - unsigned long retval; - - retval = x86_platform.get_wallclock(); - - ts->tv_sec = retval; - ts->tv_nsec = 0; + x86_platform.get_wallclock(ts); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 56f7fcf..e68709d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1040,8 +1040,6 @@ void __init setup_arch(char **cmdline_p) /* max_low_pfn get updated here */ find_low_pfn_range(); #else - num_physpages = max_pfn; - check_x2apic(); /* How many end-of-memory variables you have, grandma! 
*/ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index f4fe0b88..cdaa347 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -265,23 +265,30 @@ void smp_reschedule_interrupt(struct pt_regs *regs) */ } -void smp_trace_reschedule_interrupt(struct pt_regs *regs) +static inline void smp_entering_irq(void) { ack_APIC_irq(); + irq_enter(); +} + +void smp_trace_reschedule_interrupt(struct pt_regs *regs) +{ + /* + * Need to call irq_enter() before calling the trace point. + * __smp_reschedule_interrupt() calls irq_enter/exit() too (in + * scheduler_ipi(). This is OK, since those functions are allowed + * to nest. + */ + smp_entering_irq(); trace_reschedule_entry(RESCHEDULE_VECTOR); __smp_reschedule_interrupt(); trace_reschedule_exit(RESCHEDULE_VECTOR); + exiting_irq(); /* * KVM uses this interrupt to force a cpu out of guest mode */ } -static inline void call_function_entering_irq(void) -{ - ack_APIC_irq(); - irq_enter(); -} - static inline void __smp_call_function_interrupt(void) { generic_smp_call_function_interrupt(); @@ -290,14 +297,14 @@ static inline void __smp_call_function_interrupt(void) void smp_call_function_interrupt(struct pt_regs *regs) { - call_function_entering_irq(); + smp_entering_irq(); __smp_call_function_interrupt(); exiting_irq(); } void smp_trace_call_function_interrupt(struct pt_regs *regs) { - call_function_entering_irq(); + smp_entering_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); __smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); @@ -312,14 +319,14 @@ static inline void __smp_call_function_single_interrupt(void) void smp_call_function_single_interrupt(struct pt_regs *regs) { - call_function_entering_irq(); + smp_entering_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { - call_function_entering_irq(); + smp_entering_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); 
__smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d482bca..6a22c19 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -882,9 +882,9 @@ int lguest_setup_irq(unsigned int irq) * It would be far better for everyone if the Guest had its own clock, but * until then the Host gives us the time on every interrupt. */ -static unsigned long lguest_get_wallclock(void) +static void lguest_get_wallclock(struct timespec *now) { - return lguest_data.time.tv_sec; + *now = lguest_data.time; } /* diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 252b8f5..4500142 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,6 +1,7 @@ #include <linux/highmem.h> #include <linux/module.h> #include <linux/swap.h> /* for totalram_pages */ +#include <linux/bootmem.h> void *kmap(struct page *page) { @@ -121,6 +122,11 @@ void __init set_highmem_pages_init(void) struct zone *zone; int nid; + /* + * Explicitly reset zone->managed_pages because set_highmem_pages_init() + * is invoked before free_all_bootmem() + */ + reset_all_zones_managed_pages(); for_each_zone(zone) { unsigned long zone_start_pfn, zone_end_pfn; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1f34e92..2ec29ac 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -494,7 +494,6 @@ int devmem_is_allowed(unsigned long pagenr) void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr; unsigned long begin_aligned, end_aligned; /* Make sure boundaries are page aligned */ @@ -509,8 +508,6 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; - addr = begin; - /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will @@ -529,18 +526,13 @@ void free_init_pages(char *what, unsigned long begin, unsigned 
long end) set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_reserved_page(virt_to_page(addr)); - } + free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); #endif } void free_initmem(void) { - free_init_pages("unused kernel memory", + free_init_pages("unused kernel", (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); } @@ -566,7 +558,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) * - relocate_initrd() * So here We can do PAGE_ALIGN() safely to get partial page to be freed */ - free_init_pages("initrd memory", start, PAGE_ALIGN(end)); + free_init_pages("initrd", start, PAGE_ALIGN(end)); } #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3ac7e31..4287f1f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -660,10 +660,8 @@ void __init initmem_init(void) highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif @@ -671,7 +669,7 @@ void __init initmem_init(void) sparse_memory_present_with_active_regions(0); #ifdef CONFIG_FLATMEM - max_mapnr = num_physpages; + max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? 
highend_pfn : max_low_pfn; #endif __vmalloc_start_set = true; @@ -739,9 +737,6 @@ static void __init test_wp_bit(void) void __init mem_init(void) { - int codesize, reservedpages, datasize, initsize; - int tmp; - pci_iommu_alloc(); #ifdef CONFIG_FLATMEM @@ -759,32 +754,11 @@ void __init mem_init(void) set_highmem_pages_init(); /* this will put all low memory onto the freelists */ - totalram_pages += free_all_bootmem(); - - reservedpages = 0; - for (tmp = 0; tmp < max_low_pfn; tmp++) - /* - * Only count reserved RAM pages: - */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) - reservedpages++; + free_all_bootmem(); after_bootmem = 1; - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " - "%dk reserved, %dk data, %dk init, %ldk highmem)\n", - nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10, - totalhigh_pages << (PAGE_SHIFT-10)); - + mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b3940b6..104d56a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -712,36 +712,22 @@ EXPORT_SYMBOL_GPL(arch_add_memory); static void __meminit free_pagetable(struct page *page, int order) { - struct zone *zone; - bool bootmem = false; unsigned long magic; unsigned int nr_pages = 1 << order; /* bootmem page has reserved flag */ if (PageReserved(page)) { __ClearPageReserved(page); - bootmem = true; magic = (unsigned long)page->lru.next; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); } else - __free_pages_bootmem(page, order); + 
while (nr_pages--) + free_reserved_page(page++); } else free_pages((unsigned long)page_address(page), order); - - /* - * SECTION_INFO pages and MIX_SECTION_INFO pages - * are all allocated by bootmem. - */ - if (bootmem) { - zone = page_zone(page); - zone_span_writelock(zone); - zone->present_pages += nr_pages; - zone_span_writeunlock(zone); - totalram_pages += nr_pages; - } } static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) @@ -1058,9 +1044,6 @@ static void __init register_page_bootmem_info(void) void __init mem_init(void) { - long codesize, reservedpages, datasize, initsize; - unsigned long absent_pages; - pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ @@ -1068,29 +1051,14 @@ void __init mem_init(void) register_page_bootmem_info(); /* this will put all memory onto the freelists */ - totalram_pages = free_all_bootmem(); - - absent_pages = absent_pages_in_range(0, max_pfn); - reservedpages = max_pfn - totalram_pages - absent_pages; + free_all_bootmem(); after_bootmem = 1; - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - /* Register memory areas for /proc/kcore */ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); - printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " - "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", - nr_free_pages() << (PAGE_SHIFT-10), - max_pfn << (PAGE_SHIFT-10), - codesize >> 10, - absent_pages << (PAGE_SHIFT-10), - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10); + mem_init_print_info(NULL); } #ifdef CONFIG_DEBUG_RODATA @@ -1166,11 +1134,10 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_init_pages("unused kernel memory", + free_init_pages("unused kernel", (unsigned long) __va(__pa_symbol(text_end)), (unsigned long) 
__va(__pa_symbol(rodata_start))); - - free_init_pages("unused kernel memory", + free_init_pages("unused kernel", (unsigned long) __va(__pa_symbol(rodata_end)), (unsigned long) __va(__pa_symbol(_sdata))); } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 845df68..62c29a5 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 73a6d73..0342d27 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -83,10 +83,8 @@ void __init initmem_init(void) highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17fda6a..dfa537a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -240,7 +240,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { pud_t *pud; - unsigned long addr; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ @@ -248,8 +247,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) pud = pud_offset(pgd, 0); - for (addr = i = 0; i < PREALLOCATED_PMDS; - i++, pud++, addr += PUD_SIZE) { + for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) diff --git 
a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f66b540..79c216a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/if_vlan.h> +#include <linux/random.h> /* * Conventions : @@ -144,6 +145,39 @@ static int pkt_type_offset(void) return -1; } +struct bpf_binary_header { + unsigned int pages; + /* Note : for security reasons, bpf code will follow a randomly + * sized amount of int3 instructions + */ + u8 image[]; +}; + +static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, + u8 **image_ptr) +{ + unsigned int sz, hole; + struct bpf_binary_header *header; + + /* Most of BPF filters are really small, + * but if some of them fill a page, allow at least + * 128 extra bytes to insert a random section of int3 + */ + sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE); + header = module_alloc(sz); + if (!header) + return NULL; + + memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ + + header->pages = sz / PAGE_SIZE; + hole = sz - (proglen + sizeof(*header)); + + /* insert a random number of int3 instructions before BPF code */ + *image_ptr = &header->image[prandom_u32() % hole]; + return header; +} + void bpf_jit_compile(struct sk_filter *fp) { u8 temp[64]; @@ -153,6 +187,7 @@ void bpf_jit_compile(struct sk_filter *fp) int t_offset, f_offset; u8 t_op, f_op, seen = 0, pass; u8 *image = NULL; + struct bpf_binary_header *header = NULL; u8 *func; int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */ unsigned int cleanup_addr; /* epilogue code offset */ @@ -693,7 +728,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; if (unlikely(proglen + ilen > oldproglen)) { pr_err("bpb_jit_compile fatal error\n"); kfree(addrs); - module_free(NULL, image); + module_free(NULL, header); return; } memcpy(image + proglen, temp, ilen); @@ -717,10 +752,8 @@ cond_branch: f_offset = addrs[i + 
filter[i].jf] - addrs[i]; break; } if (proglen == oldproglen) { - image = module_alloc(max_t(unsigned int, - proglen, - sizeof(struct work_struct))); - if (!image) + header = bpf_alloc_binary(proglen, &image); + if (!header) goto out; } oldproglen = proglen; @@ -730,7 +763,8 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; bpf_jit_dump(flen, proglen, pass, image); if (image) { - bpf_flush_icache(image, image + proglen); + bpf_flush_icache(header, image + proglen); + set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *)image; } out: @@ -738,20 +772,13 @@ out: return; } -static void jit_free_defer(struct work_struct *arg) -{ - module_free(NULL, arg); -} - -/* run from softirq, we must use a work_struct to call - * module_free() from process context - */ void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) { - struct work_struct *work = (struct work_struct *)fp->bpf_func; + unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; + struct bpf_binary_header *header = (void *)addr; - INIT_WORK(work, jit_free_defer); - schedule_work(work); + set_memory_rw(addr, header->pages); + module_free(NULL, header); } } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 3e72425..d641897 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -324,14 +324,11 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->start = start; res->end = end; info->res_offset[info->res_num] = addr.translation_offset; + info->res_num++; - if (!pci_use_crs) { + if (!pci_use_crs) dev_printk(KERN_DEBUG, &info->bridge->dev, "host bridge window %pR (ignored)\n", res); - return AE_OK; - } - - info->res_num++; return AE_OK; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index b410b71..c8d5577 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -274,8 +274,9 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm, return status; } -int 
efi_set_rtc_mmss(unsigned long nowtime) +int efi_set_rtc_mmss(const struct timespec *now) { + unsigned long nowtime = now->tv_sec; efi_status_t status; efi_time_t eft; efi_time_cap_t cap; @@ -310,7 +311,7 @@ int efi_set_rtc_mmss(unsigned long nowtime) return 0; } -unsigned long efi_get_time(void) +void efi_get_time(struct timespec *now) { efi_status_t status; efi_time_t eft; @@ -320,8 +321,9 @@ unsigned long efi_get_time(void) if (status != EFI_SUCCESS) pr_err("Oops: efitime: can't read time!\n"); - return mktime(eft.year, eft.month, eft.day, eft.hour, - eft.minute, eft.second); + now->tv_sec = mktime(eft.year, eft.month, eft.day, eft.hour, + eft.minute, eft.second); + now->tv_nsec = 0; } /* diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index d62b0a3..5e355b1 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -56,7 +56,7 @@ void vrtc_cmos_write(unsigned char val, unsigned char reg) } EXPORT_SYMBOL_GPL(vrtc_cmos_write); -unsigned long vrtc_get_time(void) +void vrtc_get_time(struct timespec *now) { u8 sec, min, hour, mday, mon; unsigned long flags; @@ -82,17 +82,18 @@ unsigned long vrtc_get_time(void) printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d " "mon: %d year: %d\n", sec, min, hour, mday, mon, year); - return mktime(year, mon, mday, hour, min, sec); + now->tv_sec = mktime(year, mon, mday, hour, min, sec); + now->tv_nsec = 0; } -int vrtc_set_mmss(unsigned long nowtime) +int vrtc_set_mmss(const struct timespec *now) { unsigned long flags; struct rtc_time tm; int year; int retval = 0; - rtc_time_to_tm(nowtime, &tm); + rtc_time_to_tm(now->tv_sec, &tm); if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) { /* * tm.year is the number of years since 1900, and the @@ -110,7 +111,7 @@ int vrtc_set_mmss(unsigned long nowtime) } else { printk(KERN_ERR "%s: Invalid vRTC value: write of %lx to vRTC failed\n", - __FUNCTION__, nowtime); + __FUNCTION__, now->tv_sec); retval = -EINVAL; } return retval; diff --git 
a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a690868..ee36589 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -15,6 +15,7 @@ #include <linux/math64.h> #include <linux/gfp.h> #include <linux/slab.h> +#include <linux/pvclock_gtod.h> #include <asm/pvclock.h> #include <asm/xen/hypervisor.h> @@ -179,34 +180,56 @@ static void xen_read_wallclock(struct timespec *ts) put_cpu_var(xen_vcpu); } -static unsigned long xen_get_wallclock(void) +static void xen_get_wallclock(struct timespec *now) { - struct timespec ts; + xen_read_wallclock(now); +} - xen_read_wallclock(&ts); - return ts.tv_sec; +static int xen_set_wallclock(const struct timespec *now) +{ + return -1; } -static int xen_set_wallclock(unsigned long now) +static int xen_pvclock_gtod_notify(struct notifier_block *nb, + unsigned long was_set, void *priv) { + /* Protected by the calling core code serialization */ + static struct timespec next_sync; + struct xen_platform_op op; - int rc; + struct timespec now; - /* do nothing for domU */ - if (!xen_initial_domain()) - return -1; + now = __current_kernel_time(); + + /* + * We only take the expensive HV call when the clock was set + * or when the 11 minutes RTC synchronization time elapsed. + */ + if (!was_set && timespec_compare(&now, &next_sync) < 0) + return NOTIFY_OK; op.cmd = XENPF_settime; - op.u.settime.secs = now; - op.u.settime.nsecs = 0; + op.u.settime.secs = now.tv_sec; + op.u.settime.nsecs = now.tv_nsec; op.u.settime.system_time = xen_clocksource_read(); - rc = HYPERVISOR_dom0_op(&op); - WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now); + (void)HYPERVISOR_dom0_op(&op); - return rc; + /* + * Move the next drift compensation time 11 minutes + * ahead. That's emulating the sync_cmos_clock() update for + * the hardware RTC. 
+ */ + next_sync = now; + next_sync.tv_sec += 11 * 60; + + return NOTIFY_OK; } +static struct notifier_block xen_pvclock_gtod_notifier = { + .notifier_call = xen_pvclock_gtod_notify, +}; + static struct clocksource xen_clocksource __read_mostly = { .name = "xen", .rating = 400, @@ -482,6 +505,9 @@ static void __init xen_time_init(void) xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); + + if (xen_initial_domain()) + pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } void __init xen_init_time_ops(void) @@ -494,7 +520,9 @@ void __init xen_init_time_ops(void) x86_platform.calibrate_tsc = xen_tsc_khz; x86_platform.get_wallclock = xen_get_wallclock; - x86_platform.set_wallclock = xen_set_wallclock; + /* Dom0 uses the native method to set the hardware RTC. */ + if (!xen_initial_domain()) + x86_platform.set_wallclock = xen_set_wallclock; } #ifdef CONFIG_XEN_PVHVM |