From a05248ed2d9a83ae7c3e6db7c4ef9331c3dedc81 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 8 Apr 2013 21:50:55 +0300 Subject: crypto: x86 - add more optimized XTS-mode for serpent-avx This patch adds AVX optimized XTS-mode helper functions/macros and converts serpent-avx to use the new facilities. Benefits are slightly improved speed and reduced stack usage as use of temporary IV-array is avoided. tcrypt results, with Intel i5-2450M: enc dec 16B 1.00x 1.00x 64B 1.00x 1.00x 256B 1.04x 1.06x 1024B 1.09x 1.09x 8192B 1.10x 1.09x Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/include/asm/crypto/glue_helper.h | 24 ++++++++++++++++++++++++ arch/x86/include/asm/crypto/serpent-avx.h | 5 +++++ 2 files changed, 29 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index e2d65b061d2..1eef55596e8 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -14,10 +14,13 @@ typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, le128 *iv); +typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src, + le128 *iv); #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) #define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) +#define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn)) struct common_glue_func_entry { unsigned int num_blocks; /* number of blocks that @fn will process */ @@ -25,6 +28,7 @@ struct common_glue_func_entry { common_glue_func_t ecb; common_glue_cbc_func_t cbc; common_glue_ctr_func_t ctr; + common_glue_xts_func_t xts; } fn_u; }; @@ -96,6 +100,16 @@ static inline void le128_inc(le128 *i) i->b = cpu_to_le64(b); } +static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src) +{ + u64 a = le64_to_cpu(src->a); + u64 b = le64_to_cpu(src->b); + u64 _tt = ((s64)a >> 63) & 0x87; + + dst->a = cpu_to_le64((a << 1) ^ (b >> 63)); + dst->b = cpu_to_le64((b << 1) ^ _tt); +} + extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, struct scatterlist *dst, @@ -118,4 +132,14 @@ extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes); +extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx); + +extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, + le128 *iv, common_glue_func_t fn); + #endif /* _CRYPTO_GLUE_HELPER_H */ diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 0da1d3e2a55..56e79cc57ea 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -16,4 +16,9 @@ asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); +asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + #endif -- cgit v1.2.3 From 604880107010a1e5794552d184cd5471ea31b973 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sat, 13 Apr 2013 13:46:45 +0300 Subject: crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher Patch adds AVX2/x86-64 implementation of Blowfish cipher, requiring 32 parallel blocks for input (256 bytes). Table look-ups are performed using vpgatherdd instruction directly from vector registers and thus should be faster than earlier implementations. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/crypto/blowfish.h | 43 ++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 arch/x86/include/asm/crypto/blowfish.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 93fe929d1ce..1243272605a 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -278,6 +278,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) +#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) diff --git a/arch/x86/include/asm/crypto/blowfish.h b/arch/x86/include/asm/crypto/blowfish.h new file mode 100644 index 00000000000..f097b2face1 --- /dev/null +++ b/arch/x86/include/asm/crypto/blowfish.h @@ -0,0 +1,43 @@ +#ifndef ASM_X86_BLOWFISH_H +#define ASM_X86_BLOWFISH_H + +#include +#include + +#define BF_PARALLEL_BLOCKS 4 + +/* regular block cipher functions */ +asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); + +/* 4-way parallel cipher functions */ +asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, true); +} + +static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, true); +} + +#endif -- cgit v1.2.3 From cf1521a1a5e21fd1e79a458605c4282fbfbbeee2 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sat, 13 Apr 2013 13:46:50 +0300 Subject: crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher Patch adds AVX2/x86-64 implementation of Twofish cipher, requiring 16 parallel blocks for input (256 bytes). Table look-ups are performed using vpgatherdd instruction directly from vector registers and thus should be faster than earlier implementations. Implementation also uses 256-bit wide YMM registers, which should give additional speed up compared to the AVX implementation. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/include/asm/crypto/twofish.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index 878c51ceebb..e655c6029b4 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -28,6 +28,20 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); +/* 8-way parallel cipher functions */ +asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + /* helpers from twofish_x86_64-3way module */ extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, @@ -43,4 +57,8 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); +/* helpers from twofish-avx module */ +extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); +extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); + #endif /* ASM_X86_TWOFISH_H */ -- cgit v1.2.3 From 56d76c96a9f3e39ab733c5643b3ce5a1d4be242a Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sat, 13 Apr 2013 13:46:55 +0300 Subject: crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher Patch adds AVX2/x86-64 implementation of Serpent cipher, requiring 16 parallel blocks for input (256 bytes). Implementation is based on the AVX implementation and extends to use the 256-bit wide YMM registers. Since serpent does not use table look-ups, this implementation should be close to two times faster than the AVX implementation. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/include/asm/crypto/serpent-avx.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 56e79cc57ea..33c2b8a435d 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -6,6 +6,16 @@ #define SERPENT_PARALLEL_BLOCKS 8 +struct serpent_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct serpent_ctx serpent_ctx; +}; + +struct serpent_xts_ctx { + struct serpent_ctx tweak_ctx; + struct serpent_ctx crypt_ctx; +}; + asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, @@ -21,4 +31,18 @@ asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); +extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, + le128 *iv); + +extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); +extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); + +extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen); + +extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm); + +extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen); + #endif -- cgit v1.2.3 From f3f935a76aa0eee68da2b273a08d84ba8ffc7a73 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sat, 13 Apr 2013 13:47:00 +0300 Subject: crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher Patch adds AVX2/AES-NI/x86-64 implementation of Camellia cipher, requiring 32 parallel blocks for input (512 bytes). Compared to AVX implementation, this version is extended to use the 256-bit wide YMM registers. For AES-NI instructions data is split to two 128-bit registers and merged afterwards. Even with this additional handling, performance should be higher compared to the AES-NI/AVX implementation. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/include/asm/crypto/camellia.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index 98038add801..bb93333d920 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h @@ -48,6 +48,22 @@ asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, const u8 *src); +/* 16-way parallel cipher functions (avx/aes-ni) */ +asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); + +asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + +asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, const u8 *src) { @@ -79,4 +95,7 @@ extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv); +extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); +extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); + #endif /* ASM_X86_CAMELLIA_H */ -- cgit v1.2.3