From ddf169a98f01d6fd46295ec0dd4c1d6385be65d4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 8 Dec 2020 00:34:02 +0100 Subject: crypto: aesni - implement support for cts(cbc(aes)) Follow the same approach as the arm64 driver for implementing a version of AES-NI in CBC mode that supports ciphertext stealing. This results in a ~2x speed increase for relatively short inputs (less than 256 bytes), which is relevant given that AES-CBC with ciphertext stealing is used for filename encryption in the fscrypt layer. For larger inputs, the speedup is still significant (~25% on decryption, ~6% on encryption) Tested-by: Eric Biggers # x86_64 Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 129 ++++++++++++++++++++++++++++++++++- arch/x86/crypto/aesni-intel_glue.c | 133 +++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index d1436c37008b..a2710f76862f 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2577,13 +2577,140 @@ SYM_FUNC_START(aesni_cbc_dec) ret SYM_FUNC_END(aesni_cbc_dec) -#ifdef __x86_64__ +/* + * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +SYM_FUNC_START(aesni_cts_cbc_enc) + FRAME_BEGIN +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 +#endif + mov 480(KEYP), KLEN + movups (IVP), STATE + sub $16, LEN + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + movups (T1), %xmm4 + movups (IVP), %xmm5 + + movups (INP), IN1 + add LEN, INP + movups (INP), IN2 + + pxor IN1, STATE + call _aesni_enc1 + + pshufb %xmm5, IN2 + pxor STATE, IN2 + pshufb %xmm4, STATE + add OUTP, LEN + movups STATE, (LEN) + + movaps IN2, STATE + call _aesni_enc1 + movups STATE, (OUTP) + +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + FRAME_END + ret +SYM_FUNC_END(aesni_cts_cbc_enc) + +/* + * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +SYM_FUNC_START(aesni_cts_cbc_dec) + FRAME_BEGIN +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 +#endif + mov 480(KEYP), KLEN + add $240, KEYP + movups (IVP), IV + sub $16, LEN + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + movups (T1), %xmm4 + + movups (INP), STATE + add LEN, INP + movups (INP), IN1 + + call _aesni_dec1 + movaps STATE, IN2 + pshufb %xmm4, STATE + pxor IN1, STATE + + add OUTP, LEN + movups STATE, (LEN) + + movups (IVP), %xmm0 + pshufb %xmm0, IN1 + pblendvb IN2, IN1 + movaps IN1, STATE + call _aesni_dec1 + + pxor IV, STATE + movups STATE, (OUTP) + +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + FRAME_END + ret +SYM_FUNC_END(aesni_cts_cbc_dec) + .pushsection .rodata .align 16 +.Lcts_permute_table: + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + 
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +#ifdef __x86_64__ .Lbswap_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +#endif .popsection +#ifdef __x86_64__ /* * _aesni_inc_init: internal ABI * setup registers used by _aesni_inc diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index ad8a7188a2bf..96bdc1584215 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -93,6 +93,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); #define AVX_GEN2_OPTSIZE 640 #define AVX_GEN4_OPTSIZE 4096 @@ -454,6 +458,118 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } +static int cts_cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + struct scatterlist *src = req->src, *dst = req->dst; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct skcipher_walk walk; + int err; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), + NULL, NULL); + + if (req->cryptlen <= AES_BLOCK_SIZE) { + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + cbc_blocks = 1; + } + + if (cbc_blocks > 0) { + skcipher_request_set_crypt(&subreq, req->src, req->dst, + cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = cbc_encrypt(&subreq); + if (err) + return err; + + if (req->cryptlen == AES_BLOCK_SIZE) + return 0; + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, + subreq.cryptlen); + } + + /* handle ciphertext stealing */ + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_fpu_begin(); + aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + kernel_fpu_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int cts_cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + struct scatterlist *src = req->src, *dst = req->dst; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct skcipher_walk walk; + int err; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), + NULL, NULL); + + if (req->cryptlen <= AES_BLOCK_SIZE) { + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + cbc_blocks = 1; + } + + if (cbc_blocks > 0) { + skcipher_request_set_crypt(&subreq, req->src, req->dst, + cbc_blocks * 
AES_BLOCK_SIZE, + req->iv); + + err = cbc_decrypt(&subreq); + if (err) + return err; + + if (req->cryptlen == AES_BLOCK_SIZE) + return 0; + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, + subreq.cryptlen); + } + + /* handle ciphertext stealing */ + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_fpu_begin(); + aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + kernel_fpu_end(); + + return skcipher_walk_done(&walk, 0); +} + #ifdef CONFIG_X86_64 static void ctr_crypt_final(struct crypto_aes_ctx *ctx, struct skcipher_walk *walk) @@ -928,6 +1044,23 @@ static struct skcipher_alg aesni_skciphers[] = { .setkey = aesni_skcipher_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, + }, { + .base = { + .cra_name = "__cts(cbc(aes))", + .cra_driver_name = "__cts-cbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .walksize = 2 * AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = cts_cbc_encrypt, + .decrypt = cts_cbc_decrypt, #ifdef CONFIG_X86_64 }, { .base = { -- cgit v1.2.3 From 0eb76ba29d16df2951d37c54ca279c4e5630b071 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 11 Dec 2020 13:27:15 +0100 Subject: crypto: remove cipher routines from public crypto API The cipher routines in the crypto API are mostly intended for templates implementing skcipher modes generically in software, and shouldn't be used outside of the crypto subsystem. So move the prototypes and all related definitions to a new header file under include/crypto/internal. Also, let's use the new module namespace feature to move the symbol exports into a new namespace CRYPTO_INTERNAL. 
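As a rough sketch of what this change means for code that keeps using the single-block cipher helpers (names invented for illustration; the real conversions are in the arm and s390 hunks below), the two required pieces are the internal header and the namespace import:

/*
 * Hypothetical sketch, not taken from this patch: a module that still uses
 * the single-block cipher API after it moved to include/crypto/internal/cipher.h
 * needs the internal header plus MODULE_IMPORT_NS(CRYPTO_INTERNAL), otherwise
 * the namespaced exports will not resolve at module load time.
 */
#include <linux/module.h>
#include <linux/err.h>
#include <crypto/internal/cipher.h>     /* previously <linux/crypto.h> sufficed */

MODULE_IMPORT_NS(CRYPTO_INTERNAL);

static int demo_encrypt_one_block(const u8 *key, unsigned int keylen,
                                  const u8 *in, u8 *out)
{
        struct crypto_cipher *tfm;
        int err;

        tfm = crypto_alloc_cipher("aes", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_cipher_setkey(tfm, key, keylen);
        if (!err)
                crypto_cipher_encrypt_one(tfm, out, in);  /* one 16-byte block */

        crypto_free_cipher(tfm);
        return err;
}

These are the same two additions (the include and the namespace import) that the arm and s390 glue files receive in the hunks below.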
Signed-off-by: Ard Biesheuvel Acked-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-neonbs-glue.c | 3 +++ arch/s390/crypto/aes_s390.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index f70af1d0514b..5c6cd3c63cbc 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,8 @@ MODULE_ALIAS_CRYPTO("cbc(aes)-all"); MODULE_ALIAS_CRYPTO("ctr(aes)"); MODULE_ALIAS_CRYPTO("xts(aes)"); +MODULE_IMPORT_NS(CRYPTO_INTERNAL); + asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds); asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 73044634d342..54c7536f2482 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1055,3 +1056,4 @@ MODULE_ALIAS_CRYPTO("aes-all"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(CRYPTO_INTERNAL); -- cgit v1.2.3 From 15deb4333cd6d4e1e3216582e4c531ec40a6b060 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 17 Dec 2020 19:55:15 +0100 Subject: crypto: arm64/aes-ce - really hide slower algos when faster ones are enabled Commit 69b6f2e817e5b ("crypto: arm64/aes-neon - limit exposed routines if faster driver is enabled") intended to hide modes from the plain NEON driver that are also implemented by the faster bit sliced NEON one if both are enabled. However, the defined() CPP function does not detect if the bit sliced NEON driver is enabled as a module. So instead, let's use IS_ENABLED() here. Fixes: 69b6f2e817e5b ("crypto: arm64/aes-neon - limit exposed routines if ...") Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 34b8a89197be..cafb5b96be0e 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -55,7 +55,7 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); #define aes_mac_update neon_aes_mac_update MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); #endif -#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS) +#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS) MODULE_ALIAS_CRYPTO("ecb(aes)"); MODULE_ALIAS_CRYPTO("cbc(aes)"); MODULE_ALIAS_CRYPTO("ctr(aes)"); @@ -650,7 +650,7 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req) } static struct skcipher_alg aes_algs[] = { { -#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS) +#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS) .base = { .cra_name = "__ecb(aes)", .cra_driver_name = "__ecb-aes-" MODE, -- cgit v1.2.3 From 5318d3db465d29efe97b0e18da29ad95156e6142 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 17 Dec 2020 19:55:16 +0100 Subject: crypto: arm64/aes-ctr - improve tail handling Counter mode is a stream cipher chaining mode that is typically used with inputs that are of arbitrarily length, and so a tail block which is smaller than a full AES block is rule rather than exception. 
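To make the tail case concrete before looking at the arm64 changes, here is a minimal generic C sketch (portable kernel C using the AES library helpers, not the assembler being patched) of CTR consuming an arbitrary-length input; the final partial block is nothing more than a truncated keystream XOR:

/*
 * Illustrative sketch only, assuming the generic AES library and the
 * crypto_inc()/crypto_xor_cpy() helpers; the real driver does this with
 * NEON loads and stores instead.
 */
#include <crypto/aes.h>         /* aes_encrypt(), struct crypto_aes_ctx */
#include <crypto/algapi.h>      /* crypto_inc(), crypto_xor_cpy() */
#include <linux/kernel.h>       /* min_t() */

static void ctr_crypt_sketch(const struct crypto_aes_ctx *ctx, u8 *dst,
                             const u8 *src, unsigned int len,
                             u8 ctr[AES_BLOCK_SIZE])
{
        u8 ks[AES_BLOCK_SIZE];

        while (len > 0) {
                unsigned int n = min_t(unsigned int, len, AES_BLOCK_SIZE);

                aes_encrypt(ctx, ks, ctr);              /* next keystream block */
                crypto_inc(ctr, AES_BLOCK_SIZE);        /* big-endian counter++ */
                crypto_xor_cpy(dst, src, ks, n);        /* n < 16 only for the tail */

                src += n;
                dst += n;
                len -= n;
        }
}

Since the tail only shortens the final XOR, there is no algorithmic reason for it to take a slow path; the cost in the existing arm64 code comes from the extra assembler call, which is what this patch removes.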
The current ctr(aes) implementation for arm64 always makes a separate call into the assembler routine to process this tail block, which is suboptimal, given that it requires reloading of the AES round keys, and prevents us from handling this tail block using the 5-way stride that we use for better performance on deep pipelines. So let's update the assembler routine so it can handle any input size, and uses NEON permutation instructions and overlapping loads and stores to handle the tail block. This results in a ~16% speedup for 1420 byte blocks on cores with deep pipelines such as ThunderX2. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 46 ++++++------ arch/arm64/crypto/aes-modes.S | 165 ++++++++++++++++++++++++++++-------------- 2 files changed, 137 insertions(+), 74 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index cafb5b96be0e..e7f116d833b9 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -24,6 +24,7 @@ #ifdef USE_V8_CRYPTO_EXTENSIONS #define MODE "ce" #define PRIO 300 +#define STRIDE 5 #define aes_expandkey ce_aes_expandkey #define aes_ecb_encrypt ce_aes_ecb_encrypt #define aes_ecb_decrypt ce_aes_ecb_decrypt @@ -41,6 +42,7 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); #else #define MODE "neon" #define PRIO 200 +#define STRIDE 4 #define aes_ecb_encrypt neon_aes_ecb_encrypt #define aes_ecb_decrypt neon_aes_ecb_decrypt #define aes_cbc_encrypt neon_aes_cbc_encrypt @@ -87,7 +89,7 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, int bytes, u8 const iv[]); asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int blocks, u8 ctr[]); + int rounds, int bytes, u8 ctr[], u8 finalbuf[]); asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, int bytes, u32 const rk2[], u8 iv[], @@ -448,34 +450,36 @@ static int ctr_encrypt(struct skcipher_request *req) struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); int err, rounds = 6 + ctx->key_length / 4; struct skcipher_walk walk; - int blocks; err = skcipher_walk_virt(&walk, req, false); - while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { - kernel_neon_begin(); - aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_enc, rounds, blocks, walk.iv); - kernel_neon_end(); - err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); - } - if (walk.nbytes) { - u8 __aligned(8) tail[AES_BLOCK_SIZE]; + while (walk.nbytes > 0) { + const u8 *src = walk.src.virt.addr; unsigned int nbytes = walk.nbytes; - u8 *tdst = walk.dst.virt.addr; - u8 *tsrc = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + unsigned int tail; - /* - * Tell aes_ctr_encrypt() to process a tail block. - */ - blocks = -1; + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = memcpy(buf, src, nbytes); + else if (nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); kernel_neon_begin(); - aes_ctr_encrypt(tail, NULL, ctx->key_enc, rounds, - blocks, walk.iv); + aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, + walk.iv, buf); kernel_neon_end(); - crypto_xor_cpy(tdst, tsrc, tail, nbytes); - err = skcipher_walk_done(&walk, 0); + + tail = nbytes % (STRIDE * AES_BLOCK_SIZE); + if (tail > 0 && tail < AES_BLOCK_SIZE) + /* + * The final partial block could not be returned using + * an overlapping store, so it was passed via buf[] + * instead. 
+ */ + memcpy(dst + nbytes - tail, buf, tail); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index cf618d8f6cec..3d1f97799899 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -321,42 +321,76 @@ AES_FUNC_END(aes_cbc_cts_decrypt) /* * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, u8 ctr[]) + * int bytes, u8 ctr[], u8 finalbuf[]) */ AES_FUNC_START(aes_ctr_encrypt) stp x29, x30, [sp, #-16]! mov x29, sp - enc_prepare w3, x2, x6 + enc_prepare w3, x2, x12 ld1 {vctr.16b}, [x5] - umov x6, vctr.d[1] /* keep swabbed ctr in reg */ - rev x6, x6 - cmn w6, w4 /* 32 bit overflow? */ - bcs .Lctrloop + umov x12, vctr.d[1] /* keep swabbed ctr in reg */ + rev x12, x12 + .LctrloopNx: - subs w4, w4, #MAX_STRIDE - bmi .Lctr1x - add w7, w6, #1 + add w7, w4, #15 + sub w4, w4, #MAX_STRIDE << 4 + lsr w7, w7, #4 + mov w8, #MAX_STRIDE + cmp w7, w8 + csel w7, w7, w8, lt + adds x12, x12, x7 + mov v0.16b, vctr.16b - add w8, w6, #2 mov v1.16b, vctr.16b - add w9, w6, #3 mov v2.16b, vctr.16b - add w9, w6, #3 - rev w7, w7 mov v3.16b, vctr.16b - rev w8, w8 ST5( mov v4.16b, vctr.16b ) - mov v1.s[3], w7 - rev w9, w9 -ST5( add w10, w6, #4 ) - mov v2.s[3], w8 -ST5( rev w10, w10 ) - mov v3.s[3], w9 -ST5( mov v4.s[3], w10 ) - ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ + bcs 0f + + .subsection 1 + /* apply carry to outgoing counter */ +0: umov x8, vctr.d[0] + rev x8, x8 + add x8, x8, #1 + rev x8, x8 + ins vctr.d[0], x8 + + /* apply carry to N counter blocks for N := x12 */ + adr x16, 1f + sub x16, x16, x12, lsl #3 + br x16 + hint 34 // bti c + mov v0.d[0], vctr.d[0] + hint 34 // bti c + mov v1.d[0], vctr.d[0] + hint 34 // bti c + mov v2.d[0], vctr.d[0] + hint 34 // bti c + mov v3.d[0], vctr.d[0] +ST5( hint 34 ) +ST5( mov v4.d[0], vctr.d[0] ) +1: b 2f + .previous + +2: rev x7, x12 + ins vctr.d[1], x7 + sub x7, x12, #MAX_STRIDE - 1 + sub x8, x12, #MAX_STRIDE - 2 + sub x9, x12, #MAX_STRIDE - 3 + rev x7, x7 + rev x8, x8 + mov v1.d[1], x7 + rev x9, x9 +ST5( sub x10, x12, #MAX_STRIDE - 4 ) + mov v2.d[1], x8 +ST5( rev x10, x10 ) + mov v3.d[1], x9 +ST5( mov v4.d[1], x10 ) + tbnz w4, #31, .Lctrtail + ld1 {v5.16b-v7.16b}, [x1], #48 ST4( bl aes_encrypt_block4x ) ST5( bl aes_encrypt_block5x ) eor v0.16b, v5.16b, v0.16b @@ -368,47 +402,72 @@ ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) ST5( eor v4.16b, v6.16b, v4.16b ) st1 {v0.16b-v3.16b}, [x0], #64 ST5( st1 {v4.16b}, [x0], #16 ) - add x6, x6, #MAX_STRIDE - rev x7, x6 - ins vctr.d[1], x7 cbz w4, .Lctrout b .LctrloopNx -.Lctr1x: - adds w4, w4, #MAX_STRIDE - beq .Lctrout -.Lctrloop: - mov v0.16b, vctr.16b - encrypt_block v0, w3, x2, x8, w7 - - adds x6, x6, #1 /* increment BE ctr */ - rev x7, x6 - ins vctr.d[1], x7 - bcs .Lctrcarry /* overflow? */ - -.Lctrcarrydone: - subs w4, w4, #1 - bmi .Lctrtailblock /* blocks <0 means tail block */ - ld1 {v3.16b}, [x1], #16 - eor v3.16b, v0.16b, v3.16b - st1 {v3.16b}, [x0], #16 - bne .Lctrloop .Lctrout: st1 {vctr.16b}, [x5] /* return next CTR value */ ldp x29, x30, [sp], #16 ret -.Lctrtailblock: - st1 {v0.16b}, [x0] +.Lctrtail: + /* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... 
v3/v4 */ + mov x16, #16 + ands x13, x4, #0xf + csel x13, x13, x16, ne + +ST5( cmp w4, #64 - (MAX_STRIDE << 4) ) +ST5( csel x14, x16, xzr, gt ) + cmp w4, #48 - (MAX_STRIDE << 4) + csel x15, x16, xzr, gt + cmp w4, #32 - (MAX_STRIDE << 4) + csel x16, x16, xzr, gt + cmp w4, #16 - (MAX_STRIDE << 4) + ble .Lctrtail1x + + adr_l x12, .Lcts_permute_table + add x12, x12, x13 + +ST5( ld1 {v5.16b}, [x1], x14 ) + ld1 {v6.16b}, [x1], x15 + ld1 {v7.16b}, [x1], x16 + +ST4( bl aes_encrypt_block4x ) +ST5( bl aes_encrypt_block5x ) + + ld1 {v8.16b}, [x1], x13 + ld1 {v9.16b}, [x1] + ld1 {v10.16b}, [x12] + +ST4( eor v6.16b, v6.16b, v0.16b ) +ST4( eor v7.16b, v7.16b, v1.16b ) +ST4( tbl v3.16b, {v3.16b}, v10.16b ) +ST4( eor v8.16b, v8.16b, v2.16b ) +ST4( eor v9.16b, v9.16b, v3.16b ) + +ST5( eor v5.16b, v5.16b, v0.16b ) +ST5( eor v6.16b, v6.16b, v1.16b ) +ST5( tbl v4.16b, {v4.16b}, v10.16b ) +ST5( eor v7.16b, v7.16b, v2.16b ) +ST5( eor v8.16b, v8.16b, v3.16b ) +ST5( eor v9.16b, v9.16b, v4.16b ) + +ST5( st1 {v5.16b}, [x0], x14 ) + st1 {v6.16b}, [x0], x15 + st1 {v7.16b}, [x0], x16 + add x13, x13, x0 + st1 {v9.16b}, [x13] // overlapping stores + st1 {v8.16b}, [x0] b .Lctrout -.Lctrcarry: - umov x7, vctr.d[0] /* load upper word of ctr */ - rev x7, x7 /* ... to handle the carry */ - add x7, x7, #1 - rev x7, x7 - ins vctr.d[0], x7 - b .Lctrcarrydone +.Lctrtail1x: + csel x0, x0, x6, eq // use finalbuf if less than a full block + ld1 {v5.16b}, [x1] +ST5( mov v3.16b, v4.16b ) + encrypt_block v3, w3, x2, x8, w7 + eor v5.16b, v5.16b, v3.16b + st1 {v5.16b}, [x0] + b .Lctrout AES_FUNC_END(aes_ctr_encrypt) -- cgit v1.2.3 From 1aa90f4cf034ed4f016a02330820ac0551a6c13c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Dec 2020 00:09:51 -0800 Subject: crypto: x86/blake2s - define shash_alg structs using macros The shash_alg structs for the four variants of BLAKE2s are identical except for the algorithm name, driver name, and digest size. So, avoid code duplication by using a macro to define these structs. 
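The diff below is the whole change; for context, the structs that BLAKE2S_ALG() expands to are what kernel users reach through the shash API. A rough usage sketch with invented names and minimal error handling, assuming crypto_shash_tfm_digest() as the one-shot helper:

/*
 * Hypothetical caller of the registered "blake2s-256" shash -- not part of
 * this patch, just an illustration of what the macro-defined algs provide.
 * Algorithm resolution picks the highest-priority implementation available.
 */
#include <crypto/hash.h>
#include <linux/err.h>

static int demo_blake2s256_digest(const void *data, unsigned int len,
                                  u8 out[32])
{
        struct crypto_shash *tfm;
        int err;

        tfm = crypto_alloc_shash("blake2s-256", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_shash_tfm_digest(tfm, data, len, out);
        crypto_free_shash(tfm);
        return err;
}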
Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/blake2s-glue.c | 84 ++++++++++++------------------------------ 1 file changed, 23 insertions(+), 61 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index c025a01cf708..4dcb2ee89efc 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -129,67 +129,29 @@ static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) return 0; } -static struct shash_alg blake2s_algs[] = {{ - .base.cra_name = "blake2s-128", - .base.cra_driver_name = "blake2s-128-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_128_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}, { - .base.cra_name = "blake2s-160", - .base.cra_driver_name = "blake2s-160-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_160_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}, { - .base.cra_name = "blake2s-224", - .base.cra_driver_name = "blake2s-224-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_224_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}, { - .base.cra_name = "blake2s-256", - .base.cra_driver_name = "blake2s-256-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_256_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}}; +#define BLAKE2S_ALG(name, driver_name, digest_size) \ + { \ + .base.cra_name = name, \ + .base.cra_driver_name = driver_name, \ + .base.cra_priority = 200, \ + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ + .base.cra_module = THIS_MODULE, \ + .digestsize = digest_size, \ + .setkey = crypto_blake2s_setkey, \ + .init = crypto_blake2s_init, \ + .update = crypto_blake2s_update, \ + .final = crypto_blake2s_final, \ + .descsize = sizeof(struct blake2s_state), \ + } + +static struct shash_alg blake2s_algs[] = { + BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE), + BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE), + BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE), + BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE), +}; static int __init blake2s_mod_init(void) { -- 
cgit v1.2.3 From 8c4a93a1270ddffc7660ae43fa8030ecfe9c06d9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Dec 2020 00:09:54 -0800 Subject: crypto: blake2s - share the "shash" API boilerplate code Add helper functions for shash implementations of BLAKE2s to include/crypto/internal/blake2s.h, taking advantage of __blake2s_update() and __blake2s_final() that were added by the previous patch to share more code between the library and shash implementations. crypto_blake2s_setkey() and crypto_blake2s_init() are usable as shash_alg::setkey and shash_alg::init directly, while crypto_blake2s_update() and crypto_blake2s_final() take an extra 'blake2s_compress_t' function pointer parameter. This allows the implementation of the compression function to be overridden, which is the only part that optimized implementations really care about. The new functions are inline functions (similar to those in sha1_base.h, sha256_base.h, and sm3_base.h) because this avoids needing to add a new module blake2s_helpers.ko, they aren't *too* long, and this avoids indirect calls which are expensive these days. Note that they can't go in blake2s_generic.ko, as that would require selecting CRYPTO_BLAKE2S from CRYPTO_BLAKE2S_X86, which would cause a recursive dependency. Finally, use these new helper functions in the x86 implementation of BLAKE2s. (This part should be a separate patch, but unfortunately the x86 implementation used the exact same function names like "crypto_blake2s_update()", so it had to be updated at the same time.) Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/blake2s-glue.c | 74 ++++-------------------------------------- 1 file changed, 7 insertions(+), 67 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index 4dcb2ee89efc..a40365ab301e 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -58,75 +58,15 @@ void blake2s_compress_arch(struct blake2s_state *state, } EXPORT_SYMBOL(blake2s_compress_arch); -static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) +static int crypto_blake2s_update_x86(struct shash_desc *desc, + const u8 *in, unsigned int inlen) { - struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); - - if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) - return -EINVAL; - - memcpy(tctx->key, key, keylen); - tctx->keylen = keylen; - - return 0; -} - -static int crypto_blake2s_init(struct shash_desc *desc) -{ - struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - struct blake2s_state *state = shash_desc_ctx(desc); - const int outlen = crypto_shash_digestsize(desc->tfm); - - if (tctx->keylen) - blake2s_init_key(state, outlen, tctx->key, tctx->keylen); - else - blake2s_init(state, outlen); - - return 0; -} - -static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, - unsigned int inlen) -{ - struct blake2s_state *state = shash_desc_ctx(desc); - const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; - - if (unlikely(!inlen)) - return 0; - if (inlen > fill) { - memcpy(state->buf + state->buflen, in, fill); - blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); - state->buflen = 0; - in += fill; - inlen -= fill; - } - if (inlen > BLAKE2S_BLOCK_SIZE) { - const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); - /* Hash one less (full) block than strictly possible */ - blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); - in += BLAKE2S_BLOCK_SIZE * 
(nblocks - 1); - inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); - } - memcpy(state->buf + state->buflen, in, inlen); - state->buflen += inlen; - - return 0; + return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch); } -static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) +static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) { - struct blake2s_state *state = shash_desc_ctx(desc); - - blake2s_set_lastblock(state); - memset(state->buf + state->buflen, 0, - BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - blake2s_compress_arch(state, state->buf, 1, state->buflen); - cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); - memcpy(out, state->h, state->outlen); - memzero_explicit(state, sizeof(*state)); - - return 0; + return crypto_blake2s_final(desc, out, blake2s_compress_arch); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ @@ -141,8 +81,8 @@ static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) .digestsize = digest_size, \ .setkey = crypto_blake2s_setkey, \ .init = crypto_blake2s_init, \ - .update = crypto_blake2s_update, \ - .final = crypto_blake2s_final, \ + .update = crypto_blake2s_update_x86, \ + .final = crypto_blake2s_final_x86, \ .descsize = sizeof(struct blake2s_state), \ } -- cgit v1.2.3 From 5172d322d34c30fb926b29aeb5a064e1fd8a5e13 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Dec 2020 00:09:59 -0800 Subject: crypto: arm/blake2s - add ARM scalar optimized BLAKE2s Add an ARM scalar optimized implementation of BLAKE2s. NEON isn't very useful for BLAKE2s because the BLAKE2s block size is too small for NEON to help. Each NEON instruction would depend on the previous one, resulting in poor performance. With scalar instructions, on the other hand, we can take advantage of ARM's "free" rotations (like I did in chacha-scalar-core.S) to get an implementation get runs much faster than the C implementation. Performance results on Cortex-A7 in cycles per byte using the shash API: 4096-byte messages: blake2s-256-arm: 18.8 blake2s-256-generic: 26.0 500-byte messages: blake2s-256-arm: 20.3 blake2s-256-generic: 27.9 100-byte messages: blake2s-256-arm: 29.7 blake2s-256-generic: 39.2 32-byte messages: blake2s-256-arm: 50.6 blake2s-256-generic: 66.2 Except on very short messages, this is still slower than the NEON implementation of BLAKE2b which I've written; that is 14.0, 16.4, 25.8, and 76.1 cpb on 4096, 500, 100, and 32-byte messages, respectively. However, optimized BLAKE2s is useful for cases where BLAKE2s is used instead of BLAKE2b, such as WireGuard. This new implementation is added in the form of a new module blake2s-arm.ko, which is analogous to blake2s-x86_64.ko in that it provides blake2s_compress_arch() for use by the library API as well as optionally register the algorithms with the shash API. 
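For the library-API side mentioned above: the inline routines in <crypto/blake2s.h> end up in blake2s_compress_arch() when the arch implementation is enabled, so existing library users get the speedup transparently. A hedged sketch of such a caller (names invented for illustration):

/*
 * Sketch of a library-API user. With CONFIG_CRYPTO_BLAKE2S_ARM enabled
 * (it selects CRYPTO_ARCH_HAVE_LIB_BLAKE2S), these calls are backed by the
 * blake2s_compress_arch() routine added below rather than the generic
 * compression function.
 */
#include <crypto/blake2s.h>

static void demo_blake2s_mac(u8 out[BLAKE2S_HASH_SIZE],
                             const u8 *msg, size_t msglen,
                             const u8 key[BLAKE2S_KEY_SIZE])
{
        struct blake2s_state state;

        blake2s_init_key(&state, BLAKE2S_HASH_SIZE, key, BLAKE2S_KEY_SIZE);
        blake2s_update(&state, msg, msglen);
        blake2s_final(&state, out);
}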
Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Tested-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/Kconfig | 9 ++ arch/arm/crypto/Makefile | 2 + arch/arm/crypto/blake2s-core.S | 285 +++++++++++++++++++++++++++++++++++++++++ arch/arm/crypto/blake2s-glue.c | 78 +++++++++++ 4 files changed, 374 insertions(+) create mode 100644 arch/arm/crypto/blake2s-core.S create mode 100644 arch/arm/crypto/blake2s-glue.c (limited to 'arch') diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index c9bf2df85cb9..281c829c12d0 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -62,6 +62,15 @@ config CRYPTO_SHA512_ARM SHA-512 secure hash standard (DFIPS 180-2) implemented using optimized ARM assembler and NEON, when available. +config CRYPTO_BLAKE2S_ARM + tristate "BLAKE2s digest algorithm (ARM)" + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s digest algorithm optimized with ARM scalar instructions. This + is faster than the generic implementations of BLAKE2s and BLAKE2b, but + slower than the NEON implementation of BLAKE2b. (There is no NEON + implementation of BLAKE2s, since NEON doesn't really help with it.) + config CRYPTO_AES_ARM tristate "Scalar AES cipher for ARM" select CRYPTO_ALGAPI diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index b745c17d356f..5ad1e985a718 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o +obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o @@ -29,6 +30,7 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y) sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y) +blake2s-arm-y := blake2s-core.o blake2s-glue.o sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S new file mode 100644 index 000000000000..bed897e9a181 --- /dev/null +++ b/arch/arm/crypto/blake2s-core.S @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2s digest algorithm, ARM scalar implementation + * + * Copyright 2020 Google LLC + * + * Author: Eric Biggers + */ + +#include + + // Registers used to hold message words temporarily. There aren't + // enough ARM registers to hold the whole message block, so we have to + // load the words on-demand. + M_0 .req r12 + M_1 .req r14 + +// The BLAKE2s initialization vector +.Lblake2s_IV: + .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. 
+// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two +// columns/diagonals. s0-s1 are the word offsets to the message words the first +// column/diagonal needs, and likewise s2-s3 for the second column/diagonal. +// M_0 and M_1 are free to use, and the message block can be found at sp + 32. +// +// Note that to save instructions, the rotations don't happen when the +// pseudocode says they should, but rather they are delayed until the values are +// used. See the comment above _blake2s_round(). +.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3 + + ldr M_0, [sp, #32 + 4 * \s0] + ldr M_1, [sp, #32 + 4 * \s2] + + // a += b + m[blake2s_sigma[r][2*i + 0]]; + add \a0, \a0, \b0, ror #brot + add \a1, \a1, \b1, ror #brot + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 16); + eor \d0, \a0, \d0, ror #drot + eor \d1, \a1, \d1, ror #drot + + // c += d; + add \c0, \c0, \d0, ror #16 + add \c1, \c1, \d1, ror #16 + + // b = ror32(b ^ c, 12); + eor \b0, \c0, \b0, ror #brot + eor \b1, \c1, \b1, ror #brot + + ldr M_0, [sp, #32 + 4 * \s1] + ldr M_1, [sp, #32 + 4 * \s3] + + // a += b + m[blake2s_sigma[r][2*i + 1]]; + add \a0, \a0, \b0, ror #12 + add \a1, \a1, \b1, ror #12 + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 8); + eor \d0, \a0, \d0, ror#16 + eor \d1, \a1, \d1, ror#16 + + // c += d; + add \c0, \c0, \d0, ror#8 + add \c1, \c1, \d1, ror#8 + + // b = ror32(b ^ c, 7); + eor \b0, \c0, \b0, ror#12 + eor \b1, \c1, \b1, ror#12 +.endm + +// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] +// are in r0..r9. The stack pointer points to 8 bytes of scratch space for +// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and +// r14 are free to use. The macro arguments s0-s15 give the order in which the +// message words are used in this round. +// +// All rotates are performed using the implicit rotate operand accepted by the +// 'add' and 'eor' instructions. This is faster than using explicit rotate +// instructions. To make this work, we allow the values in the second and last +// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the +// wrong rotation amount. The rotation amount is then fixed up just in time +// when the values are used. 'brot' is the number of bits the values in row 'b' +// need to be rotated right to arrive at the correct values, and 'drot' +// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such +// that they end up as (7, 8) after every round. +.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \ + s8, s9, s10, s11, s12, s13, s14, s15 + + // Mix first two columns: + // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]). + __ldrd r10, r11, sp, 16 // load v[12] and v[13] + _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \ + \s0, \s1, \s2, \s3 + __strd r8, r9, sp, 0 + __strd r10, r11, sp, 16 + + // Mix second two columns: + // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]). + __ldrd r8, r9, sp, 8 // load v[10] and v[11] + __ldrd r10, r11, sp, 24 // load v[14] and v[15] + _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \ + \s4, \s5, \s6, \s7 + str r10, [sp, #24] // store v[14] + // v[10], v[11], and v[15] are used below, so no need to store them yet. + + .set brot, 7 + .set drot, 8 + + // Mix first two diagonals: + // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]). 
+ ldr r10, [sp, #16] // load v[12] + _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \ + \s8, \s9, \s10, \s11 + __strd r8, r9, sp, 8 + str r11, [sp, #28] + str r10, [sp, #16] + + // Mix second two diagonals: + // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]). + __ldrd r8, r9, sp, 0 // load v[8] and v[9] + __ldrd r10, r11, sp, 20 // load v[13] and v[14] + _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \ + \s12, \s13, \s14, \s15 + __strd r10, r11, sp, 20 +.endm + +// +// void blake2s_compress_arch(struct blake2s_state *state, +// const u8 *block, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2s_state are used: +// u32 h[8]; (inout) +// u32 t[2]; (inout) +// u32 f[2]; (in) +// + .align 5 +ENTRY(blake2s_compress_arch) + push {r0-r2,r4-r11,lr} // keep this an even number + +.Lnext_block: + // r0 is 'state' + // r1 is 'block' + // r3 is 'inc' + + // Load and increment the counter t[0..1]. + __ldrd r10, r11, r0, 32 + adds r10, r10, r3 + adc r11, r11, #0 + __strd r10, r11, r0, 32 + + // _blake2s_round is very short on registers, so copy the message block + // to the stack to save a register during the rounds. This also has the + // advantage that misalignment only needs to be dealt with in one place. + sub sp, sp, #64 + mov r12, sp + tst r1, #3 + bne .Lcopy_block_misaligned + ldmia r1!, {r2-r9} + stmia r12!, {r2-r9} + ldmia r1!, {r2-r9} + stmia r12, {r2-r9} +.Lcopy_block_done: + str r1, [sp, #68] // Update message pointer + + // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space + // for spilling v[8..9]. Leave v[8..9] in r8-r9. + mov r14, r0 // r14 = state + adr r12, .Lblake2s_IV + ldmia r12!, {r8-r9} // load IV[0..1] + __ldrd r0, r1, r14, 40 // load f[0..1] + ldm r12, {r2-r7} // load IV[3..7] + eor r4, r4, r10 // v[12] = IV[4] ^ t[0] + eor r5, r5, r11 // v[13] = IV[5] ^ t[1] + eor r6, r6, r0 // v[14] = IV[6] ^ f[0] + eor r7, r7, r1 // v[15] = IV[7] ^ f[1] + push {r2-r7} // push v[9..15] + sub sp, sp, #8 // leave space for v[8..9] + + // Load h[0..7] == v[0..7]. + ldm r14, {r0-r7} + + // Execute the rounds. Each round is provided the order in which it + // needs to use the message words. + .set brot, 0 + .set drot, 0 + _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 + _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 + _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 + _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 + _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 + _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 + _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 + _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 + _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 + + // Fold the final state matrix into the hash chaining value: + // + // for (i = 0; i < 8; i++) + // h[i] ^= v[i] ^ v[i + 8]; + // + ldr r14, [sp, #96] // r14 = &h[0] + add sp, sp, #8 // v[8..9] are already loaded. 
+ pop {r10-r11} // load v[10..11] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + ldm r14, {r8-r11} // load h[0..3] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + stmia r14!, {r0-r3} // store new h[0..3] + ldm r14, {r0-r3} // load old h[4..7] + pop {r8-r11} // load v[12..15] + eor r0, r0, r4, ror #brot + eor r1, r1, r5, ror #brot + eor r2, r2, r6, ror #brot + eor r3, r3, r7, ror #brot + eor r0, r0, r8, ror #drot + eor r1, r1, r9, ror #drot + eor r2, r2, r10, ror #drot + eor r3, r3, r11, ror #drot + add sp, sp, #64 // skip copy of message block + stm r14, {r0-r3} // store new h[4..7] + + // Advance to the next block, if there is one. Note that if there are + // multiple blocks, then 'inc' (the counter increment amount) must be + // 64. So we can simply set it to 64 without re-loading it. + ldm sp, {r0, r1, r2} // load (state, block, nblocks) + mov r3, #64 // set 'inc' + subs r2, r2, #1 // nblocks-- + str r2, [sp, #8] + bne .Lnext_block // nblocks != 0? + + pop {r0-r2,r4-r11,pc} + + // The next message block (pointed to by r1) isn't 4-byte aligned, so it + // can't be loaded using ldmia. Copy it to the stack buffer (pointed to + // by r12) using an alternative method. r2-r9 are free to use. +.Lcopy_block_misaligned: + mov r2, #64 +1: +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + ldr r3, [r1], #4 +#else + ldrb r3, [r1, #0] + ldrb r4, [r1, #1] + ldrb r5, [r1, #2] + ldrb r6, [r1, #3] + add r1, r1, #4 + orr r3, r3, r4, lsl #8 + orr r3, r3, r5, lsl #16 + orr r3, r3, r6, lsl #24 +#endif + subs r2, r2, #4 + str r3, [r12], #4 + bne 1b + b .Lcopy_block_done +ENDPROC(blake2s_compress_arch) diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c new file mode 100644 index 000000000000..f2cc1e5fc9ec --- /dev/null +++ b/arch/arm/crypto/blake2s-glue.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * BLAKE2s digest algorithm, ARM scalar implementation + * + * Copyright 2020 Google LLC + */ + +#include +#include + +#include + +/* defined in blake2s-core.S */ +EXPORT_SYMBOL(blake2s_compress_arch); + +static int crypto_blake2s_update_arm(struct shash_desc *desc, + const u8 *in, unsigned int inlen) +{ + return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch); +} + +static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out) +{ + return crypto_blake2s_final(desc, out, blake2s_compress_arch); +} + +#define BLAKE2S_ALG(name, driver_name, digest_size) \ + { \ + .base.cra_name = name, \ + .base.cra_driver_name = driver_name, \ + .base.cra_priority = 200, \ + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ + .base.cra_module = THIS_MODULE, \ + .digestsize = digest_size, \ + .setkey = crypto_blake2s_setkey, \ + .init = crypto_blake2s_init, \ + .update = crypto_blake2s_update_arm, \ + .final = crypto_blake2s_final_arm, \ + .descsize = sizeof(struct blake2s_state), \ + } + +static struct shash_alg blake2s_arm_algs[] = { + BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE), + BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE), + BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE), + BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE), +}; + +static int __init blake2s_arm_mod_init(void) +{ + return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? 
+ crypto_register_shashes(blake2s_arm_algs, + ARRAY_SIZE(blake2s_arm_algs)) : 0; +} + +static void __exit blake2s_arm_mod_exit(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) + crypto_unregister_shashes(blake2s_arm_algs, + ARRAY_SIZE(blake2s_arm_algs)); +} + +module_init(blake2s_arm_mod_init); +module_exit(blake2s_arm_mod_exit); + +MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("blake2s-128"); +MODULE_ALIAS_CRYPTO("blake2s-128-arm"); +MODULE_ALIAS_CRYPTO("blake2s-160"); +MODULE_ALIAS_CRYPTO("blake2s-160-arm"); +MODULE_ALIAS_CRYPTO("blake2s-224"); +MODULE_ALIAS_CRYPTO("blake2s-224-arm"); +MODULE_ALIAS_CRYPTO("blake2s-256"); +MODULE_ALIAS_CRYPTO("blake2s-256-arm"); -- cgit v1.2.3 From 1862eb007367f9e4cfd52d0406742de337b28ebf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Dec 2020 00:10:03 -0800 Subject: crypto: arm/blake2b - add NEON-accelerated BLAKE2b Add a NEON-accelerated implementation of BLAKE2b. On Cortex-A7 (which these days is the most common ARM processor that doesn't have the ARMv8 Crypto Extensions), this is over twice as fast as SHA-256, and slightly faster than SHA-1. It is also almost three times as fast as the generic implementation of BLAKE2b: Algorithm Cycles per byte (on 4096-byte messages) =================== ======================================= blake2b-256-neon 14.0 sha1-neon 16.3 blake2s-256-arm 18.8 sha1-asm 20.8 blake2s-256-generic 26.0 sha256-neon 28.9 sha256-asm 32.0 blake2b-256-generic 38.9 This implementation isn't directly based on any other implementation, but it borrows some ideas from previous NEON code I've written as well as from chacha-neon-core.S. At least on Cortex-A7, it is faster than the other NEON implementations of BLAKE2b I'm aware of (the implementation in the BLAKE2 official repository using intrinsics, and Andrew Moon's implementation which can be found in SUPERCOP). It does only one block at a time, so it performs well on short messages too. NEON-accelerated BLAKE2b is useful because there is interest in using BLAKE2b-256 for dm-verity on low-end Android devices (specifically, devices that lack the ARMv8 Crypto Extensions) to replace SHA-1. On these devices, the performance cost of upgrading to SHA-256 may be unacceptable, whereas BLAKE2b-256 would actually improve performance. Although BLAKE2b is intended for 64-bit platforms (unlike BLAKE2s which is intended for 32-bit platforms), on 32-bit ARM processors with NEON, BLAKE2b is actually faster than BLAKE2s. This is because NEON supports 64-bit operations, and because BLAKE2s's block size is too small for NEON to be helpful for it. The best I've been able to do with BLAKE2s on Cortex-A7 is 18.8 cpb with an optimized scalar implementation. (I didn't try BLAKE2sp and BLAKE3, which in theory would be faster, but they're more complex as they require running multiple hashes at once. Note that BLAKE2b already uses all the NEON bandwidth on the Cortex-A7, so I expect that any speedup from BLAKE2sp or BLAKE3 would come only from the smaller number of rounds, not from the extra parallelism.) For now this BLAKE2b implementation is only wired up to the shash API, since there is no library API for BLAKE2b yet. However, I've tried to keep things consistent with BLAKE2s, e.g. by defining blake2b_compress_arch() which is analogous to blake2s_compress_arch() and could be exported for use by the library API later if needed. 
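Because only the shash interface is wired up for BLAKE2b, in-kernel users (dm-verity included) reach it by algorithm name through the hash API. A rough, hypothetical example of a keyed blake2b-256 digest, with names and buffer handling invented for illustration:

/*
 * Hypothetical shash-API caller of the "blake2b-256" algorithm registered
 * below; the key is optional (CRYPTO_ALG_OPTIONAL_KEY) and is shown here only
 * to illustrate the keyed mode.
 */
#include <crypto/hash.h>
#include <linux/err.h>

static int demo_blake2b256_keyed(const u8 *key, unsigned int keylen,
                                 const void *data, unsigned int len,
                                 u8 out[32])
{
        struct crypto_shash *tfm;
        int err;

        tfm = crypto_alloc_shash("blake2b-256", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_shash_setkey(tfm, key, keylen);
        if (!err) {
                SHASH_DESC_ON_STACK(desc, tfm);

                desc->tfm = tfm;
                err = crypto_shash_digest(desc, data, len, out);
        }

        crypto_free_shash(tfm);
        return err;
}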
Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Tested-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/Kconfig | 10 ++ arch/arm/crypto/Makefile | 2 + arch/arm/crypto/blake2b-neon-core.S | 347 ++++++++++++++++++++++++++++++++++++ arch/arm/crypto/blake2b-neon-glue.c | 105 +++++++++++ 4 files changed, 464 insertions(+) create mode 100644 arch/arm/crypto/blake2b-neon-core.S create mode 100644 arch/arm/crypto/blake2b-neon-glue.c (limited to 'arch') diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 281c829c12d0..2b575792363e 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -71,6 +71,16 @@ config CRYPTO_BLAKE2S_ARM slower than the NEON implementation of BLAKE2b. (There is no NEON implementation of BLAKE2s, since NEON doesn't really help with it.) +config CRYPTO_BLAKE2B_NEON + tristate "BLAKE2b digest algorithm (ARM NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_BLAKE2B + help + BLAKE2b digest algorithm optimized with ARM NEON instructions. + On ARM processors that have NEON support but not the ARMv8 + Crypto Extensions, typically this BLAKE2b implementation is + much faster than SHA-2 and slightly faster than SHA-1. + config CRYPTO_AES_ARM tristate "Scalar AES cipher for ARM" select CRYPTO_ALGAPI diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 5ad1e985a718..8f26c454ea12 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o +obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o @@ -31,6 +32,7 @@ sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y) sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y) blake2s-arm-y := blake2s-core.o blake2s-glue.o +blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm/crypto/blake2b-neon-core.S b/arch/arm/crypto/blake2b-neon-core.S new file mode 100644 index 000000000000..0406a186377f --- /dev/null +++ b/arch/arm/crypto/blake2b-neon-core.S @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2b digest algorithm, NEON accelerated + * + * Copyright 2020 Google LLC + * + * Author: Eric Biggers + */ + +#include + + .text + .fpu neon + + // The arguments to blake2b_compress_neon() + STATE .req r0 + BLOCK .req r1 + NBLOCKS .req r2 + INC .req r3 + + // Pointers to the rotation tables + ROR24_TABLE .req r4 + ROR16_TABLE .req r5 + + // The original stack pointer + ORIG_SP .req r6 + + // NEON registers which contain the message words of the current block. + // M_0-M_3 are occasionally used for other purposes too. + M_0 .req d16 + M_1 .req d17 + M_2 .req d18 + M_3 .req d19 + M_4 .req d20 + M_5 .req d21 + M_6 .req d22 + M_7 .req d23 + M_8 .req d24 + M_9 .req d25 + M_10 .req d26 + M_11 .req d27 + M_12 .req d28 + M_13 .req d29 + M_14 .req d30 + M_15 .req d31 + + .align 4 + // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8 + // instruction. 
This is the most efficient way to implement these + // rotation amounts with NEON. (On Cortex-A53 it's the same speed as + // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.) +.Lror24_table: + .byte 3, 4, 5, 6, 7, 0, 1, 2 +.Lror16_table: + .byte 2, 3, 4, 5, 6, 7, 0, 1 + // The BLAKE2b initialization vector +.Lblake2b_IV: + .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b + .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 + .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f + .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 + +// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the +// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack +// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9 +// (M_0-M_3), so that they can be reloaded if they are used as temporary +// registers. The macro arguments s0-s15 give the order in which the message +// words are used in this round. 'final' is 1 if this is the final round. +.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \ + s8, s9, s10, s11, s12, s13, s14, s15, final=0 + + // Mix the columns: + // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]), + // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]). + + // a += b + m[blake2b_sigma[r][2*i + 0]]; + vadd.u64 q0, q0, q2 + vadd.u64 q1, q1, q3 + vadd.u64 d0, d0, M_\s0 + vadd.u64 d1, d1, M_\s2 + vadd.u64 d2, d2, M_\s4 + vadd.u64 d3, d3, M_\s6 + + // d = ror64(d ^ a, 32); + veor q6, q6, q0 + veor q7, q7, q1 + vrev64.32 q6, q6 + vrev64.32 q7, q7 + + // c += d; + vadd.u64 q4, q4, q6 + vadd.u64 q5, q5, q7 + + // b = ror64(b ^ c, 24); + vld1.8 {M_0}, [ROR24_TABLE, :64] + veor q2, q2, q4 + veor q3, q3, q5 + vtbl.8 d4, {d4}, M_0 + vtbl.8 d5, {d5}, M_0 + vtbl.8 d6, {d6}, M_0 + vtbl.8 d7, {d7}, M_0 + + // a += b + m[blake2b_sigma[r][2*i + 1]]; + // + // M_0 got clobbered above, so we have to reload it if any of the four + // message words this step needs happens to be M_0. Otherwise we don't + // need to reload it here, as it will just get clobbered again below. +.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0 + vld1.8 {M_0}, [sp, :64] +.endif + vadd.u64 q0, q0, q2 + vadd.u64 q1, q1, q3 + vadd.u64 d0, d0, M_\s1 + vadd.u64 d1, d1, M_\s3 + vadd.u64 d2, d2, M_\s5 + vadd.u64 d3, d3, M_\s7 + + // d = ror64(d ^ a, 16); + vld1.8 {M_0}, [ROR16_TABLE, :64] + veor q6, q6, q0 + veor q7, q7, q1 + vtbl.8 d12, {d12}, M_0 + vtbl.8 d13, {d13}, M_0 + vtbl.8 d14, {d14}, M_0 + vtbl.8 d15, {d15}, M_0 + + // c += d; + vadd.u64 q4, q4, q6 + vadd.u64 q5, q5, q7 + + // b = ror64(b ^ c, 63); + // + // This rotation amount isn't a multiple of 8, so it has to be + // implemented using a pair of shifts, which requires temporary + // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards. + veor q8, q2, q4 + veor q9, q3, q5 + vshr.u64 q2, q8, #63 + vshr.u64 q3, q9, #63 + vsli.u64 q2, q8, #1 + vsli.u64 q3, q9, #1 + vld1.8 {q8-q9}, [sp, :256] + + // Mix the diagonals: + // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]), + // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]). + // + // There are two possible ways to do this: use 'vext' instructions to + // shift the rows of the matrix so that the diagonals become columns, + // and undo it afterwards; or just use 64-bit operations on 'd' + // registers instead of 128-bit operations on 'q' registers. We use the + // latter approach, as it performs much better on Cortex-A7. 
+ + // a += b + m[blake2b_sigma[r][2*i + 0]]; + vadd.u64 d0, d0, d5 + vadd.u64 d1, d1, d6 + vadd.u64 d2, d2, d7 + vadd.u64 d3, d3, d4 + vadd.u64 d0, d0, M_\s8 + vadd.u64 d1, d1, M_\s10 + vadd.u64 d2, d2, M_\s12 + vadd.u64 d3, d3, M_\s14 + + // d = ror64(d ^ a, 32); + veor d15, d15, d0 + veor d12, d12, d1 + veor d13, d13, d2 + veor d14, d14, d3 + vrev64.32 d15, d15 + vrev64.32 d12, d12 + vrev64.32 d13, d13 + vrev64.32 d14, d14 + + // c += d; + vadd.u64 d10, d10, d15 + vadd.u64 d11, d11, d12 + vadd.u64 d8, d8, d13 + vadd.u64 d9, d9, d14 + + // b = ror64(b ^ c, 24); + vld1.8 {M_0}, [ROR24_TABLE, :64] + veor d5, d5, d10 + veor d6, d6, d11 + veor d7, d7, d8 + veor d4, d4, d9 + vtbl.8 d5, {d5}, M_0 + vtbl.8 d6, {d6}, M_0 + vtbl.8 d7, {d7}, M_0 + vtbl.8 d4, {d4}, M_0 + + // a += b + m[blake2b_sigma[r][2*i + 1]]; +.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0 + vld1.8 {M_0}, [sp, :64] +.endif + vadd.u64 d0, d0, d5 + vadd.u64 d1, d1, d6 + vadd.u64 d2, d2, d7 + vadd.u64 d3, d3, d4 + vadd.u64 d0, d0, M_\s9 + vadd.u64 d1, d1, M_\s11 + vadd.u64 d2, d2, M_\s13 + vadd.u64 d3, d3, M_\s15 + + // d = ror64(d ^ a, 16); + vld1.8 {M_0}, [ROR16_TABLE, :64] + veor d15, d15, d0 + veor d12, d12, d1 + veor d13, d13, d2 + veor d14, d14, d3 + vtbl.8 d12, {d12}, M_0 + vtbl.8 d13, {d13}, M_0 + vtbl.8 d14, {d14}, M_0 + vtbl.8 d15, {d15}, M_0 + + // c += d; + vadd.u64 d10, d10, d15 + vadd.u64 d11, d11, d12 + vadd.u64 d8, d8, d13 + vadd.u64 d9, d9, d14 + + // b = ror64(b ^ c, 63); + veor d16, d4, d9 + veor d17, d5, d10 + veor d18, d6, d11 + veor d19, d7, d8 + vshr.u64 q2, q8, #63 + vshr.u64 q3, q9, #63 + vsli.u64 q2, q8, #1 + vsli.u64 q3, q9, #1 + // Reloading q8-q9 can be skipped on the final round. +.if ! \final + vld1.8 {q8-q9}, [sp, :256] +.endif +.endm + +// +// void blake2b_compress_neon(struct blake2b_state *state, +// const u8 *block, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2b_state are used: +// u64 h[8]; (inout) +// u64 t[2]; (inout) +// u64 f[2]; (in) +// + .align 5 +ENTRY(blake2b_compress_neon) + push {r4-r10} + + // Allocate a 32-byte stack buffer that is 32-byte aligned. + mov ORIG_SP, sp + sub ip, sp, #32 + bic ip, ip, #31 + mov sp, ip + + adr ROR24_TABLE, .Lror24_table + adr ROR16_TABLE, .Lror16_table + + mov ip, STATE + vld1.64 {q0-q1}, [ip]! // Load h[0..3] + vld1.64 {q2-q3}, [ip]! // Load h[4..7] +.Lnext_block: + adr r10, .Lblake2b_IV + vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1] + vld1.64 {q4-q5}, [r10]! // Load IV[0..3] + vmov r7, r8, d28 // Copy t[0] to (r7, r8) + vld1.64 {q6-q7}, [r10] // Load IV[4..7] + adds r7, r7, INC // Increment counter + bcs .Lslow_inc_ctr + vmov.i32 d28[0], r7 + vst1.64 {d28}, [ip] // Update t[0] +.Linc_ctr_done: + + // Load the next message block and finish initializing the state matrix + // 'v'. Fortunately, there are exactly enough NEON registers to fit the + // entire state matrix in q0-q7 and the entire message block in q8-15. + // + // However, _blake2b_round also needs some extra registers for rotates, + // so we have to spill some registers. It's better to spill the message + // registers than the state registers, as the message doesn't change. + // Therefore we store a copy of the first 32 bytes of the message block + // (q8-q9) in an aligned buffer on the stack so that they can be + // reloaded when needed. (We could just reload directly from the + // message buffer, but it's faster to use aligned loads.) + vld1.8 {q8-q9}, [BLOCK]! + veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1] + vld1.8 {q10-q11}, [BLOCK]! 
+ veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1] + vld1.8 {q12-q13}, [BLOCK]! + vst1.8 {q8-q9}, [sp, :256] + mov ip, STATE + vld1.8 {q14-q15}, [BLOCK]! + + // Execute the rounds. Each round is provided the order in which it + // needs to use the message words. + _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 + _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 + _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 + _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 + _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 + _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 + _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 + _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 + _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 + _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \ + final=1 + + // Fold the final state matrix into the hash chaining value: + // + // for (i = 0; i < 8; i++) + // h[i] ^= v[i] ^ v[i + 8]; + // + vld1.64 {q8-q9}, [ip]! // Load old h[0..3] + veor q0, q0, q4 // v[0..1] ^= v[8..9] + veor q1, q1, q5 // v[2..3] ^= v[10..11] + vld1.64 {q10-q11}, [ip] // Load old h[4..7] + veor q2, q2, q6 // v[4..5] ^= v[12..13] + veor q3, q3, q7 // v[6..7] ^= v[14..15] + veor q0, q0, q8 // v[0..1] ^= h[0..1] + veor q1, q1, q9 // v[2..3] ^= h[2..3] + mov ip, STATE + subs NBLOCKS, NBLOCKS, #1 // nblocks-- + vst1.64 {q0-q1}, [ip]! // Store new h[0..3] + veor q2, q2, q10 // v[4..5] ^= h[4..5] + veor q3, q3, q11 // v[6..7] ^= h[6..7] + vst1.64 {q2-q3}, [ip]! // Store new h[4..7] + + // Advance to the next block, if there is one. + bne .Lnext_block // nblocks != 0? + + mov sp, ORIG_SP + pop {r4-r10} + mov pc, lr + +.Lslow_inc_ctr: + // Handle the case where the counter overflowed its low 32 bits, by + // carrying the overflow bit into the full 128-bit counter. 
+ vmov r9, r10, d29 + adcs r8, r8, #0 + adcs r9, r9, #0 + adc r10, r10, #0 + vmov d28, r7, r8 + vmov d29, r9, r10 + vst1.64 {q14}, [ip] // Update t[0] and t[1] + b .Linc_ctr_done +ENDPROC(blake2b_compress_neon) diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c new file mode 100644 index 000000000000..34d73200e7fa --- /dev/null +++ b/arch/arm/crypto/blake2b-neon-glue.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * BLAKE2b digest algorithm, NEON accelerated + * + * Copyright 2020 Google LLC + */ + +#include +#include +#include + +#include +#include + +#include +#include + +asmlinkage void blake2b_compress_neon(struct blake2b_state *state, + const u8 *block, size_t nblocks, u32 inc); + +static void blake2b_compress_arch(struct blake2b_state *state, + const u8 *block, size_t nblocks, u32 inc) +{ + if (!crypto_simd_usable()) { + blake2b_compress_generic(state, block, nblocks, inc); + return; + } + + do { + const size_t blocks = min_t(size_t, nblocks, + SZ_4K / BLAKE2B_BLOCK_SIZE); + + kernel_neon_begin(); + blake2b_compress_neon(state, block, blocks, inc); + kernel_neon_end(); + + nblocks -= blocks; + block += blocks * BLAKE2B_BLOCK_SIZE; + } while (nblocks); +} + +static int crypto_blake2b_update_neon(struct shash_desc *desc, + const u8 *in, unsigned int inlen) +{ + return crypto_blake2b_update(desc, in, inlen, blake2b_compress_arch); +} + +static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out) +{ + return crypto_blake2b_final(desc, out, blake2b_compress_arch); +} + +#define BLAKE2B_ALG(name, driver_name, digest_size) \ + { \ + .base.cra_name = name, \ + .base.cra_driver_name = driver_name, \ + .base.cra_priority = 200, \ + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ + .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ + .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ + .base.cra_module = THIS_MODULE, \ + .digestsize = digest_size, \ + .setkey = crypto_blake2b_setkey, \ + .init = crypto_blake2b_init, \ + .update = crypto_blake2b_update_neon, \ + .final = crypto_blake2b_final_neon, \ + .descsize = sizeof(struct blake2b_state), \ + } + +static struct shash_alg blake2b_neon_algs[] = { + BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE), + BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE), + BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE), + BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE), +}; + +static int __init blake2b_neon_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_NEON)) + return -ENODEV; + + return crypto_register_shashes(blake2b_neon_algs, + ARRAY_SIZE(blake2b_neon_algs)); +} + +static void __exit blake2b_neon_mod_exit(void) +{ + return crypto_unregister_shashes(blake2b_neon_algs, + ARRAY_SIZE(blake2b_neon_algs)); +} + +module_init(blake2b_neon_mod_init); +module_exit(blake2b_neon_mod_exit); + +MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("blake2b-160"); +MODULE_ALIAS_CRYPTO("blake2b-160-neon"); +MODULE_ALIAS_CRYPTO("blake2b-256"); +MODULE_ALIAS_CRYPTO("blake2b-256-neon"); +MODULE_ALIAS_CRYPTO("blake2b-384"); +MODULE_ALIAS_CRYPTO("blake2b-384-neon"); +MODULE_ALIAS_CRYPTO("blake2b-512"); +MODULE_ALIAS_CRYPTO("blake2b-512-neon"); -- cgit v1.2.3 From 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 31 Dec 2020 17:41:54 +0100 Subject: crypto: x86/aes-ni-xts - use direct calls to and 4-way 
stride The XTS asm helper arrangement is a bit odd: the 8-way stride helper consists of back-to-back calls to the 4-way core transforms, which are called indirectly, based on a boolean that indicates whether we are performing encryption or decryption. Given how costly indirect calls are on x86, let's switch to direct calls, and given how the 8-way stride doesn't really add anything substantial, use a 4-way stride instead, and make the asm core routine deal with any multiple of 4 blocks. Since 512 byte sectors or 4 KB blocks are the typical quantities XTS operates on, increase the stride exported to the glue helper to 512 bytes as well. As a result, the number of indirect calls is reduced from 3 per 64 bytes of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup when operating on 1 KB blocks (measured on a Intel(R) Core(TM) i7-8650U CPU) Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps") Tested-by: Eric Biggers # x86_64 Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 115 ++++++++++++++++++++++--------------- arch/x86/crypto/aesni-intel_glue.c | 25 ++++---- 2 files changed, 84 insertions(+), 56 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index a2710f76862f..84d8a156cdcd 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2842,25 +2842,18 @@ SYM_FUNC_END(aesni_ctr_enc) pxor CTR, IV; /* - * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, bool enc, le128 *iv) + * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) */ -SYM_FUNC_START(aesni_xts_crypt8) +SYM_FUNC_START(aesni_xts_encrypt) FRAME_BEGIN - testb %cl, %cl - movl $0, %ecx - movl $240, %r10d - leaq _aesni_enc4, %r11 - leaq _aesni_dec4, %rax - cmovel %r10d, %ecx - cmoveq %rax, %r11 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK movups (IVP), IV mov 480(KEYP), KLEN - addq %rcx, KEYP +.Lxts_enc_loop4: movdqa IV, STATE1 movdqu 0x00(INP), INC pxor INC, STATE1 @@ -2884,71 +2877,103 @@ SYM_FUNC_START(aesni_xts_crypt8) pxor INC, STATE4 movdqu IV, 0x30(OUTP) - CALL_NOSPEC r11 + call _aesni_enc4 movdqu 0x00(OUTP), INC pxor INC, STATE1 movdqu STATE1, 0x00(OUTP) - _aesni_gf128mul_x_ble() - movdqa IV, STATE1 - movdqu 0x40(INP), INC - pxor INC, STATE1 - movdqu IV, 0x40(OUTP) - movdqu 0x10(OUTP), INC pxor INC, STATE2 movdqu STATE2, 0x10(OUTP) - _aesni_gf128mul_x_ble() - movdqa IV, STATE2 - movdqu 0x50(INP), INC - pxor INC, STATE2 - movdqu IV, 0x50(OUTP) - movdqu 0x20(OUTP), INC pxor INC, STATE3 movdqu STATE3, 0x20(OUTP) - _aesni_gf128mul_x_ble() - movdqa IV, STATE3 - movdqu 0x60(INP), INC - pxor INC, STATE3 - movdqu IV, 0x60(OUTP) - movdqu 0x30(OUTP), INC pxor INC, STATE4 movdqu STATE4, 0x30(OUTP) _aesni_gf128mul_x_ble() - movdqa IV, STATE4 - movdqu 0x70(INP), INC - pxor INC, STATE4 - movdqu IV, 0x70(OUTP) - _aesni_gf128mul_x_ble() + add $64, INP + add $64, OUTP + sub $64, LEN + ja .Lxts_enc_loop4 + movups IV, (IVP) - CALL_NOSPEC r11 + FRAME_END + ret +SYM_FUNC_END(aesni_xts_encrypt) + +/* + * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_decrypt) + FRAME_BEGIN + + movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK + movups (IVP), IV + + mov 480(KEYP), KLEN + add $240, KEYP - movdqu 0x40(OUTP), INC +.Lxts_dec_loop4: + movdqa IV, STATE1 + movdqu 0x00(INP), INC 
pxor INC, STATE1 - movdqu STATE1, 0x40(OUTP) + movdqu IV, 0x00(OUTP) - movdqu 0x50(OUTP), INC + _aesni_gf128mul_x_ble() + movdqa IV, STATE2 + movdqu 0x10(INP), INC + pxor INC, STATE2 + movdqu IV, 0x10(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE3 + movdqu 0x20(INP), INC + pxor INC, STATE3 + movdqu IV, 0x20(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE4 + movdqu 0x30(INP), INC + pxor INC, STATE4 + movdqu IV, 0x30(OUTP) + + call _aesni_dec4 + + movdqu 0x00(OUTP), INC + pxor INC, STATE1 + movdqu STATE1, 0x00(OUTP) + + movdqu 0x10(OUTP), INC pxor INC, STATE2 - movdqu STATE2, 0x50(OUTP) + movdqu STATE2, 0x10(OUTP) - movdqu 0x60(OUTP), INC + movdqu 0x20(OUTP), INC pxor INC, STATE3 - movdqu STATE3, 0x60(OUTP) + movdqu STATE3, 0x20(OUTP) - movdqu 0x70(OUTP), INC + movdqu 0x30(OUTP), INC pxor INC, STATE4 - movdqu STATE4, 0x70(OUTP) + movdqu STATE4, 0x30(OUTP) + + _aesni_gf128mul_x_ble() + + add $64, INP + add $64, OUTP + sub $64, LEN + ja .Lxts_dec_loop4 + + movups IV, (IVP) FRAME_END ret -SYM_FUNC_END(aesni_xts_crypt8) +SYM_FUNC_END(aesni_xts_decrypt) #endif diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 96bdc1584215..84e3ed49b35d 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -101,6 +101,12 @@ asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, #define AVX_GEN2_OPTSIZE 640 #define AVX_GEN4_OPTSIZE 4096 +asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + +asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + #ifdef CONFIG_X86_64 static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, @@ -108,9 +114,6 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, bool enc, le128 *iv); - /* asmlinkage void aesni_gcm_enc() * void *ctx, AES Key schedule. Starts on a 16 byte boundary. * struct gcm_context_data. May be uninitialized. 
@@ -663,14 +666,14 @@ static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec); } -static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) +static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - aesni_xts_crypt8(ctx, dst, src, true, iv); + aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv); } -static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) +static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - aesni_xts_crypt8(ctx, dst, src, false, iv); + aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv); } static const struct common_glue_ctx aesni_enc_xts = { @@ -678,8 +681,8 @@ static const struct common_glue_ctx aesni_enc_xts = { .fpu_blocks_limit = 1, .funcs = { { - .num_blocks = 8, - .fn_u = { .xts = aesni_xts_enc8 } + .num_blocks = 32, + .fn_u = { .xts = aesni_xts_enc32 } }, { .num_blocks = 1, .fn_u = { .xts = aesni_xts_enc } @@ -691,8 +694,8 @@ static const struct common_glue_ctx aesni_dec_xts = { .fpu_blocks_limit = 1, .funcs = { { - .num_blocks = 8, - .fn_u = { .xts = aesni_xts_dec8 } + .num_blocks = 32, + .fn_u = { .xts = aesni_xts_dec32 } }, { .num_blocks = 1, .fn_u = { .xts = aesni_xts_dec } -- cgit v1.2.3 From 2481104fe98d5b016fdd95d649b1235f21e491ba Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 31 Dec 2020 17:41:55 +0100 Subject: crypto: x86/aes-ni-xts - rewrite and drop indirections via glue helper The AES-NI driver implements XTS via the glue helper, which consumes a struct with sets of function pointers which are invoked on chunks of input data of the appropriate size, as annotated in the struct. Let's get rid of this indirection, so that we can perform direct calls to the assembler helpers. Instead, let's adopt the arm64 strategy, i.e., provide a helper which can consume inputs of any size, provided that the penultimate, full block is passed via the last call if ciphertext stealing needs to be applied. This also allows us to enable the XTS mode for i386. 
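As an illustration of the calling convention described above (a minimal sketch, not part of the patch, assuming the kernel's u8 and AES_BLOCK_SIZE definitions and a hypothetical xts_one_call() standing in for aesni_xts_encrypt()/aesni_xts_decrypt()): the caller passes all full blocks first, holding back the penultimate full block when a partial tail exists, and then issues one final call covering that held-back block plus the tail so the asm routine can apply the ciphertext stealing itself.

typedef void (*xts_one_call_t)(const void *ctx, u8 *dst, const u8 *src,
			       unsigned int len, u8 *iv);

static void xts_split_sketch(xts_one_call_t xts_one_call, const void *ctx,
			     u8 *dst, const u8 *src, unsigned int cryptlen,
			     u8 *iv)
{
	unsigned int tail = cryptlen % AES_BLOCK_SIZE;
	unsigned int head = cryptlen - tail;

	if (tail)
		head -= AES_BLOCK_SIZE;	/* hold back one full block for CTS */

	if (head)
		xts_one_call(ctx, dst, src, head, iv);	/* bulk: any number of full blocks */

	if (tail)
		/* penultimate full block plus the tail; the helper steals ciphertext */
		xts_one_call(ctx, dst + head, src + head,
			     AES_BLOCK_SIZE + tail, iv);
}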
Tested-by: Eric Biggers # x86_64 Signed-off-by: Ard Biesheuvel Reported-by: kernel test robot Reported-by: kernel test robot Reported-by: kernel test robot Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 280 ++++++++++++++++++++++++++++++------- arch/x86/crypto/aesni-intel_glue.c | 220 ++++++++++++++++------------- 2 files changed, 356 insertions(+), 144 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 84d8a156cdcd..4e3972570916 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -43,10 +43,6 @@ #ifdef __x86_64__ # constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 -.align 16 -.Lgf128mul_x_ble_mask: - .octa 0x00000000000000010000000000000087 .section .rodata.cst16.POLY, "aM", @progbits, 16 .align 16 POLY: .octa 0xC2000000000000000000000000000001 @@ -146,7 +142,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define CTR %xmm11 #define INC %xmm12 -#define GF128MUL_MASK %xmm10 +#define GF128MUL_MASK %xmm7 #ifdef __x86_64__ #define AREG %rax @@ -2823,6 +2819,14 @@ SYM_FUNC_START(aesni_ctr_enc) ret SYM_FUNC_END(aesni_ctr_enc) +#endif + +.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 +.align 16 +.Lgf128mul_x_ble_mask: + .octa 0x00000000000000010000000000000087 +.previous + /* * _aesni_gf128mul_x_ble: internal ABI * Multiply in GF(2^128) for XTS IVs @@ -2835,11 +2839,11 @@ SYM_FUNC_END(aesni_ctr_enc) * CTR: == temporary value */ #define _aesni_gf128mul_x_ble() \ - pshufd $0x13, IV, CTR; \ + pshufd $0x13, IV, KEY; \ paddq IV, IV; \ - psrad $31, CTR; \ - pand GF128MUL_MASK, CTR; \ - pxor CTR, IV; + psrad $31, KEY; \ + pand GF128MUL_MASK, KEY; \ + pxor KEY, IV; /* * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, @@ -2847,65 +2851,153 @@ SYM_FUNC_END(aesni_ctr_enc) */ SYM_FUNC_START(aesni_xts_encrypt) FRAME_BEGIN - +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK +#else + movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK +#endif movups (IVP), IV mov 480(KEYP), KLEN .Lxts_enc_loop4: + sub $64, LEN + jl .Lxts_enc_1x + movdqa IV, STATE1 - movdqu 0x00(INP), INC - pxor INC, STATE1 + movdqu 0x00(INP), IN + pxor IN, STATE1 movdqu IV, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 - movdqu 0x10(INP), INC - pxor INC, STATE2 + movdqu 0x10(INP), IN + pxor IN, STATE2 movdqu IV, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 - movdqu 0x20(INP), INC - pxor INC, STATE3 + movdqu 0x20(INP), IN + pxor IN, STATE3 movdqu IV, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - movdqu 0x30(INP), INC - pxor INC, STATE4 + movdqu 0x30(INP), IN + pxor IN, STATE4 movdqu IV, 0x30(OUTP) call _aesni_enc4 - movdqu 0x00(OUTP), INC - pxor INC, STATE1 + movdqu 0x00(OUTP), IN + pxor IN, STATE1 movdqu STATE1, 0x00(OUTP) - movdqu 0x10(OUTP), INC - pxor INC, STATE2 + movdqu 0x10(OUTP), IN + pxor IN, STATE2 movdqu STATE2, 0x10(OUTP) - movdqu 0x20(OUTP), INC - pxor INC, STATE3 + movdqu 0x20(OUTP), IN + pxor IN, STATE3 movdqu STATE3, 0x20(OUTP) - movdqu 0x30(OUTP), INC - pxor INC, STATE4 + movdqu 0x30(OUTP), IN + pxor IN, STATE4 movdqu STATE4, 0x30(OUTP) _aesni_gf128mul_x_ble() add $64, INP add $64, OUTP - sub 
$64, LEN - ja .Lxts_enc_loop4 + test LEN, LEN + jnz .Lxts_enc_loop4 +.Lxts_enc_ret_iv: movups IV, (IVP) +.Lxts_enc_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif FRAME_END ret + +.Lxts_enc_1x: + add $64, LEN + jz .Lxts_enc_ret_iv + sub $16, LEN + jl .Lxts_enc_cts4 + +.Lxts_enc_loop1: + movdqu (INP), STATE + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE + _aesni_gf128mul_x_ble() + + test LEN, LEN + jz .Lxts_enc_out + + add $16, INP + sub $16, LEN + jl .Lxts_enc_cts1 + + movdqu STATE, (OUTP) + add $16, OUTP + jmp .Lxts_enc_loop1 + +.Lxts_enc_out: + movdqu STATE, (OUTP) + jmp .Lxts_enc_ret_iv + +.Lxts_enc_cts4: + movdqa STATE4, STATE + sub $16, OUTP + +.Lxts_enc_cts1: +#ifndef __x86_64__ + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 +#endif + add LEN, INP /* rewind input pointer */ + add $16, LEN /* # bytes in final block */ + movups (INP), IN1 + + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + add OUTP, LEN + + movups (T1), %xmm4 + movaps STATE, IN2 + pshufb %xmm4, STATE + movups STATE, (LEN) + + movups (IVP), %xmm0 + pshufb %xmm0, IN1 + pblendvb IN2, IN1 + movaps IN1, STATE + + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE + + movups STATE, (OUTP) + jmp .Lxts_enc_ret SYM_FUNC_END(aesni_xts_encrypt) /* @@ -2914,66 +3006,158 @@ SYM_FUNC_END(aesni_xts_encrypt) */ SYM_FUNC_START(aesni_xts_decrypt) FRAME_BEGIN - +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK +#else + movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK +#endif movups (IVP), IV mov 480(KEYP), KLEN add $240, KEYP + test $15, LEN + jz .Lxts_dec_loop4 + sub $16, LEN + .Lxts_dec_loop4: + sub $64, LEN + jl .Lxts_dec_1x + movdqa IV, STATE1 - movdqu 0x00(INP), INC - pxor INC, STATE1 + movdqu 0x00(INP), IN + pxor IN, STATE1 movdqu IV, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 - movdqu 0x10(INP), INC - pxor INC, STATE2 + movdqu 0x10(INP), IN + pxor IN, STATE2 movdqu IV, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 - movdqu 0x20(INP), INC - pxor INC, STATE3 + movdqu 0x20(INP), IN + pxor IN, STATE3 movdqu IV, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - movdqu 0x30(INP), INC - pxor INC, STATE4 + movdqu 0x30(INP), IN + pxor IN, STATE4 movdqu IV, 0x30(OUTP) call _aesni_dec4 - movdqu 0x00(OUTP), INC - pxor INC, STATE1 + movdqu 0x00(OUTP), IN + pxor IN, STATE1 movdqu STATE1, 0x00(OUTP) - movdqu 0x10(OUTP), INC - pxor INC, STATE2 + movdqu 0x10(OUTP), IN + pxor IN, STATE2 movdqu STATE2, 0x10(OUTP) - movdqu 0x20(OUTP), INC - pxor INC, STATE3 + movdqu 0x20(OUTP), IN + pxor IN, STATE3 movdqu STATE3, 0x20(OUTP) - movdqu 0x30(OUTP), INC - pxor INC, STATE4 + movdqu 0x30(OUTP), IN + pxor IN, STATE4 movdqu STATE4, 0x30(OUTP) _aesni_gf128mul_x_ble() add $64, INP add $64, OUTP - sub $64, LEN - ja .Lxts_dec_loop4 + test LEN, LEN + jnz .Lxts_dec_loop4 +.Lxts_dec_ret_iv: movups IV, (IVP) +.Lxts_dec_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif FRAME_END ret -SYM_FUNC_END(aesni_xts_decrypt) +.Lxts_dec_1x: + add $64, LEN + jz .Lxts_dec_ret_iv + +.Lxts_dec_loop1: + movdqu (INP), STATE + + add $16, INP + sub $16, LEN + jl .Lxts_dec_cts1 + + pxor IV, STATE + call _aesni_dec1 + pxor IV, STATE + _aesni_gf128mul_x_ble() + + test LEN, LEN + jz .Lxts_dec_out + 
+ movdqu STATE, (OUTP) + add $16, OUTP + jmp .Lxts_dec_loop1 + +.Lxts_dec_out: + movdqu STATE, (OUTP) + jmp .Lxts_dec_ret_iv + +.Lxts_dec_cts1: + movdqa IV, STATE4 + _aesni_gf128mul_x_ble() + + pxor IV, STATE + call _aesni_dec1 + pxor IV, STATE + +#ifndef __x86_64__ + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 #endif + add LEN, INP /* rewind input pointer */ + add $16, LEN /* # bytes in final block */ + movups (INP), IN1 + + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + add OUTP, LEN + + movups (T1), %xmm4 + movaps STATE, IN2 + pshufb %xmm4, STATE + movups STATE, (LEN) + + movups (IVP), %xmm0 + pshufb %xmm0, IN1 + pblendvb IN2, IN1 + movaps IN1, STATE + + pxor STATE4, STATE + call _aesni_dec1 + pxor STATE4, STATE + + movups STATE, (OUTP) + jmp .Lxts_dec_ret +SYM_FUNC_END(aesni_xts_decrypt) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 84e3ed49b35d..2116bc2b9507 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -33,9 +33,6 @@ #include #include #include -#ifdef CONFIG_X86_64 -#include -#endif #define AESNI_ALIGN 16 @@ -632,98 +629,6 @@ static int ctr_crypt(struct skcipher_request *req) return err; } -static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - keylen /= 2; - - /* first half of xts-key is for crypt */ - err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, - key, keylen); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, - key + keylen, keylen); -} - - -static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc); -} - -static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec); -} - -static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv); -} - -static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv); -} - -static const struct common_glue_ctx aesni_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = 1, - - .funcs = { { - .num_blocks = 32, - .fn_u = { .xts = aesni_xts_enc32 } - }, { - .num_blocks = 1, - .fn_u = { .xts = aesni_xts_enc } - } } -}; - -static const struct common_glue_ctx aesni_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = 1, - - .funcs = { { - .num_blocks = 32, - .fn_u = { .xts = aesni_xts_dec32 } - }, { - .num_blocks = 1, - .fn_u = { .xts = aesni_xts_dec } - } } -}; - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc, - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx), - false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc, - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx), - true); -} - static int 
rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { @@ -996,6 +901,128 @@ static int helper_rfc4106_decrypt(struct aead_request *req) } #endif +static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + err = xts_verify_key(tfm, key, keylen); + if (err) + return err; + + keylen /= 2; + + /* first half of xts-key is for crypt */ + err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, + key, keylen); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, + key + keylen, keylen); +} + +static int xts_crypt(struct skcipher_request *req, bool encrypt) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int tail = req->cryptlen % AES_BLOCK_SIZE; + struct skcipher_request subreq; + struct skcipher_walk walk; + int err; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + err = skcipher_walk_virt(&walk, req, false); + + if (unlikely(tail > 0 && walk.nbytes < walk.total)) { + int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + + skcipher_walk_abort(&walk); + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + skcipher_request_flags(req), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + blocks * AES_BLOCK_SIZE, req->iv); + req = &subreq; + err = skcipher_walk_virt(&walk, req, false); + } else { + tail = 0; + } + + kernel_fpu_begin(); + + /* calculate first value of T */ + aesni_enc(aes_ctx(ctx->raw_tweak_ctx), walk.iv, walk.iv); + + while (walk.nbytes > 0) { + int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); + + if (encrypt) + aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + nbytes, walk.iv); + else + aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + nbytes, walk.iv); + kernel_fpu_end(); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + + if (walk.nbytes > 0) + kernel_fpu_begin(); + } + + if (unlikely(tail > 0 && !err)) { + struct scatterlist sg_src[2], sg_dst[2]; + struct scatterlist *src, *dst; + + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_fpu_begin(); + if (encrypt) + aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + else + aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + kernel_fpu_end(); + + err = skcipher_walk_done(&walk, 0); + } + return err; +} + +static int xts_encrypt(struct skcipher_request *req) +{ + return xts_crypt(req, true); +} + +static int xts_decrypt(struct skcipher_request *req) +{ + return xts_crypt(req, false); +} + static struct crypto_alg aesni_cipher_alg = { .cra_name = "aes", .cra_driver_name = "aes-aesni", @@ -1082,6 +1109,7 @@ static struct skcipher_alg aesni_skciphers[] = { .setkey = aesni_skcipher_setkey, .encrypt = ctr_crypt, .decrypt = ctr_crypt, +#endif }, { .base = { .cra_name = "__xts(aes)", @@ -1095,10 +1123,10 @@ static struct skcipher_alg 
aesni_skciphers[] = { .min_keysize = 2 * AES_MIN_KEY_SIZE, .max_keysize = 2 * AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, + .walksize = 2 * AES_BLOCK_SIZE, .setkey = xts_aesni_setkey, .encrypt = xts_encrypt, .decrypt = xts_decrypt, -#endif } }; -- cgit v1.2.3 From a13ed1d15b07a04b1f74b2df61ff7a5e47f45dd8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Jan 2021 16:55:46 +0100 Subject: crypto: aesni - prevent misaligned buffers on the stack The GCM mode driver uses 16 byte aligned buffers on the stack to pass the IV to the asm helpers, but unfortunately, the x86 port does not guarantee that the stack pointer is 16 byte aligned upon entry in the first place. Since the compiler is not aware of this, it will not emit the additional stack realignment sequence that is needed, and so the alignment is not guaranteed to be more than 8 bytes. So instead, allocate some padding on the stack, and realign the IV pointer by hand. Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 2116bc2b9507..880f9f8b5153 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -710,7 +710,8 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, struct crypto_aead *tfm = crypto_aead_reqtfm(req); unsigned long auth_tag_len = crypto_aead_authsize(tfm); const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm; - struct gcm_context_data data AESNI_ALIGN_ATTR; + u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); + struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); struct scatter_walk dst_sg_walk = {}; unsigned long left = req->cryptlen; unsigned long len, srclen, dstlen; @@ -759,8 +760,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, } kernel_fpu_begin(); - gcm_tfm->init(aes_ctx, &data, iv, - hash_subkey, assoc, assoclen); + gcm_tfm->init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); if (req->src != req->dst) { while (left) { src = scatterwalk_map(&src_sg_walk); @@ -770,10 +770,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, len = min(srclen, dstlen); if (len) { if (enc) - gcm_tfm->enc_update(aes_ctx, &data, + gcm_tfm->enc_update(aes_ctx, data, dst, src, len); else - gcm_tfm->dec_update(aes_ctx, &data, + gcm_tfm->dec_update(aes_ctx, data, dst, src, len); } left -= len; @@ -791,10 +791,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, len = scatterwalk_clamp(&src_sg_walk, left); if (len) { if (enc) - gcm_tfm->enc_update(aes_ctx, &data, + gcm_tfm->enc_update(aes_ctx, data, src, src, len); else - gcm_tfm->dec_update(aes_ctx, &data, + gcm_tfm->dec_update(aes_ctx, data, src, src, len); } left -= len; @@ -803,7 +803,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, scatterwalk_done(&src_sg_walk, 1, left); } } - gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len); + gcm_tfm->finalize(aes_ctx, data, authTag, auth_tag_len); kernel_fpu_end(); if (!assocmem) @@ -852,7 +852,8 @@ static int helper_rfc4106_encrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); 
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); unsigned int i; __be32 counter = cpu_to_be32(1); @@ -879,7 +880,8 @@ static int helper_rfc4106_decrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); unsigned int i; if (unlikely(req->assoclen != 16 && req->assoclen != 20)) @@ -1149,7 +1151,8 @@ static int generic_gcmaes_encrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); __be32 counter = cpu_to_be32(1); memcpy(iv, req->iv, 12); @@ -1165,7 +1168,8 @@ static int generic_gcmaes_decrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); memcpy(iv, req->iv, 12); *((__be32 *)(iv+12)) = counter; -- cgit v1.2.3 From 30f2c18eb564acdc1c2c31f8cea9c7d38f46c681 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Jan 2021 16:55:47 +0100 Subject: crypto: aesni - drop unused asm prototypes Drop some prototypes that are declared but never called. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 67 -------------------------------------- 1 file changed, 67 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 880f9f8b5153..0f124d72e6b4 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -111,49 +111,6 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -/* asmlinkage void aesni_gcm_enc() - * void *ctx, AES Key schedule. Starts on a 16 byte boundary. - * struct gcm_context_data. May be uninitialized. - * u8 *out, Ciphertext output. Encrypt in-place is allowed. - * const u8 *in, Plaintext input - * unsigned long plaintext_len, Length of data in bytes for encryption. - * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. - * 16-byte aligned pointer. - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. - * u8 *auth_tag, Authenticated Tag output. - * unsigned long auth_tag_len), Authenticated Tag Length in bytes. - * Valid values are 16 (most likely), 12 or 8. - */ -asmlinkage void aesni_gcm_enc(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -/* asmlinkage void aesni_gcm_dec() - * void *ctx, AES Key schedule. Starts on a 16 byte boundary. - * struct gcm_context_data. May be uninitialized. - * u8 *out, Plaintext output. Decrypt in-place is allowed. 
- * const u8 *in, Ciphertext input - * unsigned long ciphertext_len, Length of data in bytes for decryption. - * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. - * 16-byte aligned pointer. - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going - * to be 8 or 12 bytes - * u8 *auth_tag, Authenticated Tag output. - * unsigned long auth_tag_len) Authenticated Tag Length in bytes. - * Valid values are 16 (most likely), 12 or 8. - */ -asmlinkage void aesni_gcm_dec(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - /* Scatter / Gather routines, with args similar to above */ asmlinkage void aesni_gcm_init(void *ctx, struct gcm_context_data *gdata, @@ -218,18 +175,6 @@ asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = { .init = &aesni_gcm_init_avx_gen2, .enc_update = &aesni_gcm_enc_update_avx_gen2, @@ -260,18 +205,6 @@ asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = { .init = &aesni_gcm_init_avx_gen4, .enc_update = &aesni_gcm_enc_update_avx_gen4, -- cgit v1.2.3 From 2694e23ffd210cbbc05cd45bec77dc1c11bb72a2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Jan 2021 16:55:48 +0100 Subject: crypto: aesni - clean up mapping of associated data The gcm(aes-ni) driver is only built for x86_64, which does not make use of highmem. So testing for PageHighMem is pointless and can be omitted. While at it, replace GFP_ATOMIC with the appropriate runtime decided value based on the context. 
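The allocation change amounts to the following pattern (a minimal sketch of the hunk shown in the diff below, wrapped in a hypothetical helper name):

static void *alloc_assoc_buffer(struct aead_request *req, unsigned int assoclen)
{
	gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
		      GFP_KERNEL : GFP_ATOMIC;

	/* assoc can be any length, so it must be allocated on the heap */
	return kmalloc(assoclen, flags);
}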
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 0f124d72e6b4..efef6e6b1d34 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -667,14 +667,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, gcm_tfm = &aesni_gcm_tfm_sse; /* Linearize assoc, if not already linear */ - if (req->src->length >= assoclen && req->src->length && - (!PageHighMem(sg_page(req->src)) || - req->src->offset + req->src->length <= PAGE_SIZE)) { + if (req->src->length >= assoclen && req->src->length) { scatterwalk_start(&assoc_sg_walk, req->src); assoc = scatterwalk_map(&assoc_sg_walk); } else { + gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? + GFP_KERNEL : GFP_ATOMIC; + /* assoc can be any length, so must be on heap */ - assocmem = kmalloc(assoclen, GFP_ATOMIC); + assocmem = kmalloc(assoclen, flags); if (unlikely(!assocmem)) return -ENOMEM; assoc = assocmem; -- cgit v1.2.3 From 83c83e658863e4e57f4defe6cc1bc05f3d968e2a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Jan 2021 16:55:49 +0100 Subject: crypto: aesni - refactor scatterlist processing Currently, the gcm(aes-ni) driver open codes the scatterlist handling that is encapsulated by the skcipher walk API. So let's switch to that instead. Also, move the handling at the end of gcmaes_crypt_by_sg() that is dependent on whether we are encrypting or decrypting into the callers, which always do one or the other. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 139 +++++++++++++++---------------------- 1 file changed, 56 insertions(+), 83 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index efef6e6b1d34..180fa79a0727 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -638,25 +638,18 @@ static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, - u8 *iv, void *aes_ctx) + u8 *iv, void *aes_ctx, u8 *auth_tag, + unsigned long auth_tag_len) { - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm; u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); - struct scatter_walk dst_sg_walk = {}; unsigned long left = req->cryptlen; - unsigned long len, srclen, dstlen; struct scatter_walk assoc_sg_walk; - struct scatter_walk src_sg_walk; - struct scatterlist src_start[2]; - struct scatterlist dst_start[2]; - struct scatterlist *src_sg; - struct scatterlist *dst_sg; - u8 *src, *dst, *assoc; + struct skcipher_walk walk; u8 *assocmem = NULL; - u8 authTag[16]; + u8 *assoc; + int err; if (!enc) left -= auth_tag_len; @@ -683,61 +676,8 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); } - if (left) { - src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen); - scatterwalk_start(&src_sg_walk, src_sg); - if (req->src != req->dst) { - dst_sg = scatterwalk_ffwd(dst_start, req->dst, - req->assoclen); - scatterwalk_start(&dst_sg_walk, dst_sg); - } 
- } - kernel_fpu_begin(); gcm_tfm->init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); - if (req->src != req->dst) { - while (left) { - src = scatterwalk_map(&src_sg_walk); - dst = scatterwalk_map(&dst_sg_walk); - srclen = scatterwalk_clamp(&src_sg_walk, left); - dstlen = scatterwalk_clamp(&dst_sg_walk, left); - len = min(srclen, dstlen); - if (len) { - if (enc) - gcm_tfm->enc_update(aes_ctx, data, - dst, src, len); - else - gcm_tfm->dec_update(aes_ctx, data, - dst, src, len); - } - left -= len; - - scatterwalk_unmap(src); - scatterwalk_unmap(dst); - scatterwalk_advance(&src_sg_walk, len); - scatterwalk_advance(&dst_sg_walk, len); - scatterwalk_done(&src_sg_walk, 0, left); - scatterwalk_done(&dst_sg_walk, 1, left); - } - } else { - while (left) { - dst = src = scatterwalk_map(&src_sg_walk); - len = scatterwalk_clamp(&src_sg_walk, left); - if (len) { - if (enc) - gcm_tfm->enc_update(aes_ctx, data, - src, src, len); - else - gcm_tfm->dec_update(aes_ctx, data, - src, src, len); - } - left -= len; - scatterwalk_unmap(src); - scatterwalk_advance(&src_sg_walk, len); - scatterwalk_done(&src_sg_walk, 1, left); - } - } - gcm_tfm->finalize(aes_ctx, data, authTag, auth_tag_len); kernel_fpu_end(); if (!assocmem) @@ -745,24 +685,25 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, else kfree(assocmem); - if (!enc) { - u8 authTagMsg[16]; + err = enc ? skcipher_walk_aead_encrypt(&walk, req, false) + : skcipher_walk_aead_decrypt(&walk, req, false); - /* Copy out original authTag */ - scatterwalk_map_and_copy(authTagMsg, req->src, - req->assoclen + req->cryptlen - - auth_tag_len, - auth_tag_len, 0); + while (walk.nbytes > 0) { + kernel_fpu_begin(); + (enc ? gcm_tfm->enc_update + : gcm_tfm->dec_update)(aes_ctx, data, walk.dst.virt.addr, + walk.src.virt.addr, walk.nbytes); + kernel_fpu_end(); - /* Compare generated tag with passed in tag. */ - return crypto_memneq(authTagMsg, authTag, auth_tag_len) ? 
- -EBADMSG : 0; + err = skcipher_walk_done(&walk, 0); } - /* Copy in the authTag */ - scatterwalk_map_and_copy(authTag, req->dst, - req->assoclen + req->cryptlen, - auth_tag_len, 1); + if (err) + return err; + + kernel_fpu_begin(); + gcm_tfm->finalize(aes_ctx, data, auth_tag, auth_tag_len); + kernel_fpu_end(); return 0; } @@ -770,15 +711,47 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, - aes_ctx); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 auth_tag[16]; + int err; + + err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx, + auth_tag, auth_tag_len); + if (err) + return err; + + scatterwalk_map_and_copy(auth_tag, req->dst, + req->assoclen + req->cryptlen, + auth_tag_len, 1); + return 0; } static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, - aes_ctx); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 auth_tag_msg[16]; + u8 auth_tag[16]; + int err; + + err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx, + auth_tag, auth_tag_len); + if (err) + return err; + + /* Copy out original auth_tag */ + scatterwalk_map_and_copy(auth_tag_msg, req->src, + req->assoclen + req->cryptlen - auth_tag_len, + auth_tag_len, 0); + + /* Compare generated tag with passed in tag. */ + if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) { + memzero_explicit(auth_tag, sizeof(auth_tag)); + return -EBADMSG; + } + return 0; } static int helper_rfc4106_encrypt(struct aead_request *req) -- cgit v1.2.3 From d6cbf4eaa46794b173c691a71211d882398d7977 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Jan 2021 16:55:50 +0100 Subject: crypto: aesni - replace function pointers with static branches Replace the function pointers in the GCM implementation with static branches, which are based on code patching, which occurs only at module load time. This avoids the severe performance penalty caused by the use of retpolines. In order to retain the ability to switch between different versions of the implementation based on the input size on cores that support AVX and AVX2, use static branches instead of static calls. 
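For reference, the jump-label pattern the patch adopts looks like this (a minimal sketch with hypothetical identifiers; the keys are flipped once at module init, after which the checks compile down to patched direct branches):

#include <linux/jump_label.h>

static __ro_after_init DEFINE_STATIC_KEY_FALSE(example_use_avx);

static void example_select_impl(bool cpu_has_avx)
{
	if (cpu_has_avx)
		static_branch_enable(&example_use_avx);	/* patches all branch sites */
}

static const char *example_impl_name(void)
{
	/* no function pointer, hence no retpoline on the hot path */
	if (static_branch_likely(&example_use_avx))
		return "avx";
	return "sse";
}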
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 98 +++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 180fa79a0727..a548fdbc3073 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -128,24 +129,6 @@ asmlinkage void aesni_gcm_finalize(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -static const struct aesni_gcm_tfm_s { - void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len); - void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); - void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len); - void (*finalize)(void *ctx, struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); -} *aesni_gcm_tfm; - -static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = { - .init = &aesni_gcm_init, - .enc_update = &aesni_gcm_enc_update, - .dec_update = &aesni_gcm_dec_update, - .finalize = &aesni_gcm_finalize, -}; - asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, @@ -175,13 +158,6 @@ asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = { - .init = &aesni_gcm_init_avx_gen2, - .enc_update = &aesni_gcm_enc_update_avx_gen2, - .dec_update = &aesni_gcm_dec_update_avx_gen2, - .finalize = &aesni_gcm_finalize_avx_gen2, -}; - /* * asmlinkage void aesni_gcm_init_avx_gen4() * gcm_data *my_ctx_data, context data @@ -205,12 +181,8 @@ asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = { - .init = &aesni_gcm_init_avx_gen4, - .enc_update = &aesni_gcm_enc_update_avx_gen4, - .dec_update = &aesni_gcm_dec_update_avx_gen4, - .finalize = &aesni_gcm_finalize_avx_gen4, -}; +static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) @@ -641,12 +613,12 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, u8 *iv, void *aes_ctx, u8 *auth_tag, unsigned long auth_tag_len) { - const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm; u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); unsigned long left = req->cryptlen; struct scatter_walk assoc_sg_walk; struct skcipher_walk walk; + bool do_avx, do_avx2; u8 *assocmem = NULL; u8 *assoc; int err; @@ -654,10 +626,8 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, if (!enc) left -= auth_tag_len; - if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4) - gcm_tfm = &aesni_gcm_tfm_avx_gen2; - if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2) - gcm_tfm = &aesni_gcm_tfm_sse; + do_avx = (left >= AVX_GEN2_OPTSIZE); + do_avx2 = (left >= 
AVX_GEN4_OPTSIZE); /* Linearize assoc, if not already linear */ if (req->src->length >= assoclen && req->src->length) { @@ -677,7 +647,14 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, } kernel_fpu_begin(); - gcm_tfm->init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); + if (static_branch_likely(&gcm_use_avx2) && do_avx2) + aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc, + assoclen); + else if (static_branch_likely(&gcm_use_avx) && do_avx) + aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc, + assoclen); + else + aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); kernel_fpu_end(); if (!assocmem) @@ -690,9 +667,35 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, while (walk.nbytes > 0) { kernel_fpu_begin(); - (enc ? gcm_tfm->enc_update - : gcm_tfm->dec_update)(aes_ctx, data, walk.dst.virt.addr, - walk.src.virt.addr, walk.nbytes); + if (static_branch_likely(&gcm_use_avx2) && do_avx2) { + if (enc) + aesni_gcm_enc_update_avx_gen4(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + else + aesni_gcm_dec_update_avx_gen4(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + } else if (static_branch_likely(&gcm_use_avx) && do_avx) { + if (enc) + aesni_gcm_enc_update_avx_gen2(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + else + aesni_gcm_dec_update_avx_gen2(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + } else if (enc) { + aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr, + walk.src.virt.addr, walk.nbytes); + } else { + aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr, + walk.src.virt.addr, walk.nbytes); + } kernel_fpu_end(); err = skcipher_walk_done(&walk, 0); @@ -702,7 +705,14 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, return err; kernel_fpu_begin(); - gcm_tfm->finalize(aes_ctx, data, auth_tag, auth_tag_len); + if (static_branch_likely(&gcm_use_avx2) && do_avx2) + aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag, + auth_tag_len); + else if (static_branch_likely(&gcm_use_avx) && do_avx) + aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, + auth_tag_len); + else + aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len); kernel_fpu_end(); return 0; @@ -1141,14 +1151,14 @@ static int __init aesni_init(void) #ifdef CONFIG_X86_64 if (boot_cpu_has(X86_FEATURE_AVX2)) { pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4; + static_branch_enable(&gcm_use_avx); + static_branch_enable(&gcm_use_avx2); } else if (boot_cpu_has(X86_FEATURE_AVX)) { pr_info("AVX version of gcm_enc/dec engaged.\n"); - aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2; + static_branch_enable(&gcm_use_avx); } else { pr_info("SSE version of gcm_enc/dec engaged.\n"); - aesni_gcm_tfm = &aesni_gcm_tfm_sse; } aesni_ctr_enc_tfm = aesni_ctr_enc; if (boot_cpu_has(X86_FEATURE_AVX)) { -- cgit v1.2.3 From 55a7e88f016873ef1717295d8460416b1ccd05a5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:49 +0100 Subject: crypto: x86/camellia - switch to XTS template Now that the XTS template can wrap accelerated ECB modes, it can be used to implement Camellia in XTS mode as well, which turns out to be at least as fast, and sometimes even faster. 
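In practice this means the accelerated xts(camellia) instance is now composed by the generic XTS template over the driver's ECB code, and users of the kernel crypto API keep requesting it under the same algorithm name (a minimal sketch, assuming the standard skcipher allocation API):

#include <crypto/skcipher.h>

static struct crypto_skcipher *example_get_xts_camellia(void)
{
	/* the XTS template instantiates this over the accelerated ECB driver */
	return crypto_alloc_skcipher("xts(camellia)", 0, 0);
}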
Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 181 ----------------------- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 207 --------------------------- arch/x86/crypto/camellia_aesni_avx2_glue.c | 70 --------- arch/x86/crypto/camellia_aesni_avx_glue.c | 101 +------------ arch/x86/include/asm/crypto/camellia.h | 18 --- 5 files changed, 1 insertion(+), 576 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index ecc0a9a905c4..471c34e6cac2 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -17,7 +17,6 @@ #include #include -#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -593,10 +592,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -/* For XTS mode IV generation */ -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 - /* * pre-SubByte transform * @@ -1111,179 +1106,3 @@ SYM_FUNC_START(camellia_ctr_16way) FRAME_END ret; SYM_FUNC_END(camellia_ctr_16way) - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -.align 8 -SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - * %r8: index for input whitening key - * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 - */ - FRAME_BEGIN - - subq $(16 * 16), %rsp; - movq %rsp, %rax; - - vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14; - - /* load IV */ - vmovdqu (%rcx), %xmm0; - vpxor 0 * 16(%rdx), %xmm0, %xmm15; - vmovdqu %xmm15, 15 * 16(%rax); - vmovdqu %xmm0, 0 * 16(%rsi); - - /* construct IVs */ - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 1 * 16(%rdx), %xmm0, %xmm15; - vmovdqu %xmm15, 14 * 16(%rax); - vmovdqu %xmm0, 1 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 2 * 16(%rdx), %xmm0, %xmm13; - vmovdqu %xmm0, 2 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 3 * 16(%rdx), %xmm0, %xmm12; - vmovdqu %xmm0, 3 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 4 * 16(%rdx), %xmm0, %xmm11; - vmovdqu %xmm0, 4 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 5 * 16(%rdx), %xmm0, %xmm10; - vmovdqu %xmm0, 5 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 6 * 16(%rdx), %xmm0, %xmm9; - vmovdqu %xmm0, 6 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 7 * 16(%rdx), %xmm0, %xmm8; - vmovdqu %xmm0, 7 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 8 * 16(%rdx), %xmm0, %xmm7; - vmovdqu %xmm0, 8 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 9 * 16(%rdx), %xmm0, %xmm6; - vmovdqu %xmm0, 9 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 10 * 16(%rdx), %xmm0, %xmm5; - vmovdqu %xmm0, 10 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 11 * 16(%rdx), %xmm0, %xmm4; - vmovdqu %xmm0, 11 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 12 * 16(%rdx), %xmm0, %xmm3; - vmovdqu %xmm0, 12 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 13 * 16(%rdx), %xmm0, %xmm2; - vmovdqu %xmm0, 13 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 14 * 16(%rdx), %xmm0, %xmm1; - vmovdqu %xmm0, 14 * 16(%rsi); - - 
gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 15 * 16(%rdx), %xmm0, %xmm15; - vmovdqu %xmm15, 0 * 16(%rax); - vmovdqu %xmm0, 15 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vmovdqu %xmm0, (%rcx); - - /* inpack16_pre: */ - vmovq (key_table)(CTX, %r8, 8), %xmm15; - vpshufb .Lpack_bswap, %xmm15, %xmm15; - vpxor 0 * 16(%rax), %xmm15, %xmm0; - vpxor %xmm1, %xmm15, %xmm1; - vpxor %xmm2, %xmm15, %xmm2; - vpxor %xmm3, %xmm15, %xmm3; - vpxor %xmm4, %xmm15, %xmm4; - vpxor %xmm5, %xmm15, %xmm5; - vpxor %xmm6, %xmm15, %xmm6; - vpxor %xmm7, %xmm15, %xmm7; - vpxor %xmm8, %xmm15, %xmm8; - vpxor %xmm9, %xmm15, %xmm9; - vpxor %xmm10, %xmm15, %xmm10; - vpxor %xmm11, %xmm15, %xmm11; - vpxor %xmm12, %xmm15, %xmm12; - vpxor %xmm13, %xmm15, %xmm13; - vpxor 14 * 16(%rax), %xmm15, %xmm14; - vpxor 15 * 16(%rax), %xmm15, %xmm15; - - CALL_NOSPEC r9; - - addq $(16 * 16), %rsp; - - vpxor 0 * 16(%rsi), %xmm7, %xmm7; - vpxor 1 * 16(%rsi), %xmm6, %xmm6; - vpxor 2 * 16(%rsi), %xmm5, %xmm5; - vpxor 3 * 16(%rsi), %xmm4, %xmm4; - vpxor 4 * 16(%rsi), %xmm3, %xmm3; - vpxor 5 * 16(%rsi), %xmm2, %xmm2; - vpxor 6 * 16(%rsi), %xmm1, %xmm1; - vpxor 7 * 16(%rsi), %xmm0, %xmm0; - vpxor 8 * 16(%rsi), %xmm15, %xmm15; - vpxor 9 * 16(%rsi), %xmm14, %xmm14; - vpxor 10 * 16(%rsi), %xmm13, %xmm13; - vpxor 11 * 16(%rsi), %xmm12, %xmm12; - vpxor 12 * 16(%rsi), %xmm11, %xmm11; - vpxor 13 * 16(%rsi), %xmm10, %xmm10; - vpxor 14 * 16(%rsi), %xmm9, %xmm9; - vpxor 15 * 16(%rsi), %xmm8, %xmm8; - write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, - %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, - %xmm8, %rsi); - - FRAME_END - ret; -SYM_FUNC_END(camellia_xts_crypt_16way) - -SYM_FUNC_START(camellia_xts_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - xorl %r8d, %r8d; /* input whitening key, 0 for enc */ - - leaq __camellia_enc_blk16, %r9; - - jmp camellia_xts_crypt_16way; -SYM_FUNC_END(camellia_xts_enc_16way) - -SYM_FUNC_START(camellia_xts_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - - cmpl $16, key_length(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* input whitening key, last for dec */ - - leaq __camellia_dec_blk16, %r9; - - jmp camellia_xts_crypt_16way; -SYM_FUNC_END(camellia_xts_dec_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 0907243c501c..9561dee52de0 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -7,7 +7,6 @@ #include #include -#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -629,12 +628,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -/* For XTS mode */ -.Lxts_gf128mul_and_shl1_mask_0: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 -.Lxts_gf128mul_and_shl1_mask_1: - .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 - /* * pre-SubByte transform * @@ -1201,203 +1194,3 @@ SYM_FUNC_START(camellia_ctr_32way) FRAME_END ret; SYM_FUNC_END(camellia_ctr_32way) - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ - vpsrad $31, iv, tmp0; \ - vpaddq iv, iv, tmp1; \ - vpsllq $2, iv, iv; \ - vpshufd 
$0x13, tmp0, tmp0; \ - vpsrad $31, tmp1, tmp1; \ - vpand mask2, tmp0, tmp0; \ - vpshufd $0x13, tmp1, tmp1; \ - vpxor tmp0, iv, iv; \ - vpand mask1, tmp1, tmp1; \ - vpxor tmp1, iv, iv; - -.align 8 -SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - * %r8: index for input whitening key - * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 - */ - FRAME_BEGIN - - vzeroupper; - - subq $(16 * 32), %rsp; - movq %rsp, %rax; - - vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; - - /* load IV and construct second IV */ - vmovdqu (%rcx), %xmm0; - vmovdqa %xmm0, %xmm15; - gf128mul_x_ble(%xmm0, %xmm12, %xmm13); - vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; - vinserti128 $1, %xmm0, %ymm15, %ymm0; - vpxor 0 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 15 * 32(%rax); - vmovdqu %ymm0, 0 * 32(%rsi); - - /* construct IVs */ - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 1 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 14 * 32(%rax); - vmovdqu %ymm0, 1 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 2 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 13 * 32(%rax); - vmovdqu %ymm0, 2 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 3 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 12 * 32(%rax); - vmovdqu %ymm0, 3 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 4 * 32(%rdx), %ymm0, %ymm11; - vmovdqu %ymm0, 4 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 5 * 32(%rdx), %ymm0, %ymm10; - vmovdqu %ymm0, 5 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 6 * 32(%rdx), %ymm0, %ymm9; - vmovdqu %ymm0, 6 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 7 * 32(%rdx), %ymm0, %ymm8; - vmovdqu %ymm0, 7 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 8 * 32(%rdx), %ymm0, %ymm7; - vmovdqu %ymm0, 8 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 9 * 32(%rdx), %ymm0, %ymm6; - vmovdqu %ymm0, 9 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 10 * 32(%rdx), %ymm0, %ymm5; - vmovdqu %ymm0, 10 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 11 * 32(%rdx), %ymm0, %ymm4; - vmovdqu %ymm0, 11 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 12 * 32(%rdx), %ymm0, %ymm3; - vmovdqu %ymm0, 12 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 13 * 32(%rdx), %ymm0, %ymm2; - vmovdqu %ymm0, 13 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 14 * 32(%rdx), %ymm0, %ymm1; - vmovdqu %ymm0, 14 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 15 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 0 * 32(%rax); - vmovdqu %ymm0, 15 * 32(%rsi); - - vextracti128 $1, %ymm0, %xmm0; - gf128mul_x_ble(%xmm0, %xmm12, %xmm15); - vmovdqu %xmm0, (%rcx); - - /* inpack32_pre: */ - vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; - vpshufb .Lpack_bswap, %ymm15, %ymm15; - vpxor 0 * 32(%rax), %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, %ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, 
%ymm15, %ymm10; - vpxor %ymm11, %ymm15, %ymm11; - vpxor 12 * 32(%rax), %ymm15, %ymm12; - vpxor 13 * 32(%rax), %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - CALL_NOSPEC r9; - - addq $(16 * 32), %rsp; - - vpxor 0 * 32(%rsi), %ymm7, %ymm7; - vpxor 1 * 32(%rsi), %ymm6, %ymm6; - vpxor 2 * 32(%rsi), %ymm5, %ymm5; - vpxor 3 * 32(%rsi), %ymm4, %ymm4; - vpxor 4 * 32(%rsi), %ymm3, %ymm3; - vpxor 5 * 32(%rsi), %ymm2, %ymm2; - vpxor 6 * 32(%rsi), %ymm1, %ymm1; - vpxor 7 * 32(%rsi), %ymm0, %ymm0; - vpxor 8 * 32(%rsi), %ymm15, %ymm15; - vpxor 9 * 32(%rsi), %ymm14, %ymm14; - vpxor 10 * 32(%rsi), %ymm13, %ymm13; - vpxor 11 * 32(%rsi), %ymm12, %ymm12; - vpxor 12 * 32(%rsi), %ymm11, %ymm11; - vpxor 13 * 32(%rsi), %ymm10, %ymm10; - vpxor 14 * 32(%rsi), %ymm9, %ymm9; - vpxor 15 * 32(%rsi), %ymm8, %ymm8; - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(camellia_xts_crypt_32way) - -SYM_FUNC_START(camellia_xts_enc_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - - xorl %r8d, %r8d; /* input whitening key, 0 for enc */ - - leaq __camellia_enc_blk32, %r9; - - jmp camellia_xts_crypt_32way; -SYM_FUNC_END(camellia_xts_enc_32way) - -SYM_FUNC_START(camellia_xts_dec_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - - cmpl $16, key_length(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* input whitening key, last for dec */ - - leaq __camellia_dec_blk32, %r9; - - jmp camellia_xts_crypt_32way; -SYM_FUNC_END(camellia_xts_dec_32way) diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index ccda647422d6..d956d0473668 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -26,11 +25,6 @@ asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - static const struct common_glue_ctx camellia_enc = { .num_funcs = 4, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, @@ -69,22 +63,6 @@ static const struct common_glue_ctx camellia_ctr = { } } }; -static const struct common_glue_ctx camellia_enc_xts = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_enc_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_enc_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_enc } - } } -}; - static const struct common_glue_ctx camellia_dec = { .num_funcs = 4, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, @@ -123,22 +101,6 @@ static const struct common_glue_ctx camellia_dec_cbc = { } } }; -static const struct common_glue_ctx camellia_dec_xts = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = 
camellia_xts_dec_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_dec_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_dec } - } } -}; - static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -170,24 +132,6 @@ static int ctr_crypt(struct skcipher_request *req) return glue_ctr_req_128bit(&camellia_ctr, req); } -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); -} - static struct skcipher_alg camellia_algs[] = { { .base.cra_name = "__ecb(camellia)", @@ -231,20 +175,6 @@ static struct skcipher_alg camellia_algs[] = { .setkey = camellia_setkey, .encrypt = ctr_crypt, .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(camellia)", - .base.cra_driver_name = "__xts-camellia-aesni-avx2", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct camellia_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE, - .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = xts_camellia_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 4e5de6ef206e..44614f8a452c 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -31,26 +30,6 @@ asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); EXPORT_SYMBOL_GPL(camellia_ctr_16way); -asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); - -asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); - -void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_enc_blk); -} -EXPORT_SYMBOL_GPL(camellia_xts_enc); - -void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_dec_blk); -} -EXPORT_SYMBOL_GPL(camellia_xts_dec); - static const struct common_glue_ctx camellia_enc = { .num_funcs = 3, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, @@ -83,19 +62,6 @@ static const struct common_glue_ctx camellia_ctr = { } } }; -static const struct common_glue_ctx camellia_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_enc_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_enc } - } } -}; - static const struct common_glue_ctx camellia_dec = { .num_funcs = 3, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, @@ -128,19 +94,6 @@ static const struct 
common_glue_ctx camellia_dec_cbc = { } } }; -static const struct common_glue_ctx camellia_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_dec_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_dec } - } } -}; - static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -172,44 +125,6 @@ static int ctr_crypt(struct skcipher_request *req) return glue_ctr_req_128bit(&camellia_ctr, req); } -int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} -EXPORT_SYMBOL_GPL(xts_camellia_setkey); - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); -} - static struct skcipher_alg camellia_algs[] = { { .base.cra_name = "__ecb(camellia)", @@ -253,21 +168,7 @@ static struct skcipher_alg camellia_algs[] = { .setkey = camellia_setkey, .encrypt = ctr_crypt, .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(camellia)", - .base.cra_driver_name = "__xts-camellia-aesni", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct camellia_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE, - .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = xts_camellia_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, + } }; static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index f6d91861cb14..0e5f82adbaf9 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h @@ -19,18 +19,10 @@ struct camellia_ctx { u32 key_length; }; -struct camellia_xts_ctx { - struct camellia_ctx tweak_ctx; - struct camellia_ctx crypt_ctx; -}; - extern int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, unsigned int key_len); -extern int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen); - /* regular block cipher functions */ asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, bool xor); @@ -49,11 +41,6 @@ asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void 
camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src) { __camellia_enc_blk(ctx, dst, src, false); @@ -83,9 +70,4 @@ extern void camellia_crypt_ctr(const void *ctx, u8 *dst, const u8 *src, extern void camellia_crypt_ctr_2way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -extern void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -extern void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - #endif /* ASM_X86_CAMELLIA_H */ -- cgit v1.2.3 From 2cc0fedb8124ac7a75d132988f1e11f5de30c61f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:50 +0100 Subject: crypto: x86/cast6 - switch to XTS template Now that the XTS template can wrap accelerated ECB modes, it can be used to implement CAST6 in XTS mode as well, which turns out to be at least as fast, and sometimes even faster Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 56 ------------------ arch/x86/crypto/cast6_avx_glue.c | 98 ------------------------------- 2 files changed, 154 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 932a3ce32a88..0c1ea836215a 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -212,8 +212,6 @@ .section .rodata.cst16, "aM", @progbits, 16 .align 16 -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .Lbswap_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 .Lbswap128_mask: @@ -440,57 +438,3 @@ SYM_FUNC_START(cast6_ctr_8way) FRAME_END ret; SYM_FUNC_END(cast6_ctr_8way) - -SYM_FUNC_START(cast6_xts_enc_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - pushq %r15; - - movq %rdi, CTX - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); - - call __cast6_enc_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - popq %r15; - FRAME_END - ret; -SYM_FUNC_END(cast6_xts_enc_8way) - -SYM_FUNC_START(cast6_xts_dec_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - pushq %r15; - - movq %rdi, CTX - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); - - call __cast6_dec_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - popq %r15; - FRAME_END - ret; -SYM_FUNC_END(cast6_xts_dec_8way) diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 48e0f37796fa..5a21d3e9041c 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #define CAST6_PARALLEL_BLOCKS 8 @@ -27,27 +26,12 @@ asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void cast6_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void 
cast6_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return cast6_setkey(&tfm->base, key, keylen); } -static void cast6_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_encrypt); -} - -static void cast6_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_decrypt); -} - static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; @@ -87,19 +71,6 @@ static const struct common_glue_ctx cast6_ctr = { } } }; -static const struct common_glue_ctx cast6_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = cast6_xts_enc_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = cast6_xts_enc } - } } -}; - static const struct common_glue_ctx cast6_dec = { .num_funcs = 2, .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, @@ -126,19 +97,6 @@ static const struct common_glue_ctx cast6_dec_cbc = { } } }; -static const struct common_glue_ctx cast6_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = cast6_xts_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = cast6_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { return glue_ecb_req_128bit(&cast6_enc, req); @@ -164,48 +122,6 @@ static int ctr_crypt(struct skcipher_request *req) return glue_ctr_req_128bit(&cast6_ctr, req); } -struct cast6_xts_ctx { - struct cast6_ctx tweak_ctx; - struct cast6_ctx crypt_ctx; -}; - -static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&cast6_enc_xts, req, __cast6_encrypt, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&cast6_dec_xts, req, __cast6_encrypt, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); -} - static struct skcipher_alg cast6_algs[] = { { .base.cra_name = "__ecb(cast6)", @@ -249,20 +165,6 @@ static struct skcipher_alg cast6_algs[] = { .setkey = cast6_setkey_skcipher, .encrypt = ctr_crypt, .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(cast6)", - .base.cra_driver_name = "__xts-cast6-avx", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = CAST6_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct cast6_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * CAST6_MIN_KEY_SIZE, - .max_keysize = 2 * CAST6_MAX_KEY_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = xts_cast6_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; -- cgit 
v1.2.3 From 9ec0af8aa6038163e7cd01dea3b8e085712d19fc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:51 +0100 Subject: crypto: x86/serpent- switch to XTS template Now that the XTS template can wrap accelerated ECB modes, it can be used to implement Serpent in XTS mode as well, which turns out to be at least as fast, and sometimes even faster Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 48 ------------- arch/x86/crypto/serpent-avx2-asm_64.S | 62 ----------------- arch/x86/crypto/serpent_avx2_glue.c | 72 -------------------- arch/x86/crypto/serpent_avx_glue.c | 101 ---------------------------- arch/x86/include/asm/crypto/serpent-avx.h | 21 ------ 5 files changed, 304 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index ba9e4c1e7f5c..6b41f46bcc76 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -18,10 +18,6 @@ .align 16 .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .text @@ -735,47 +731,3 @@ SYM_FUNC_START(serpent_ctr_8way_avx) FRAME_END ret; SYM_FUNC_END(serpent_ctr_8way_avx) - -SYM_FUNC_START(serpent_xts_enc_8way_avx) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); - - call __serpent_enc_blk8_avx; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_enc_8way_avx) - -SYM_FUNC_START(serpent_xts_dec_8way_avx) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); - - call __serpent_dec_blk8_avx; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_dec_8way_avx) diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index c9648aeae705..a510a949f02f 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -20,16 +20,6 @@ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask_0: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 - -.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask_1: - .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 - .text #define CTX %rdi @@ -759,55 +749,3 @@ SYM_FUNC_START(serpent_ctr_16way) FRAME_END ret; SYM_FUNC_END(serpent_ctr_16way) - -SYM_FUNC_START(serpent_xts_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - vzeroupper; - - load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, 
RA2, RB2, RC2, - RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, - .Lxts_gf128mul_and_shl1_mask_0, - .Lxts_gf128mul_and_shl1_mask_1); - - call __serpent_enc_blk16; - - store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_enc_16way) - -SYM_FUNC_START(serpent_xts_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - vzeroupper; - - load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, - .Lxts_gf128mul_and_shl1_mask_0, - .Lxts_gf128mul_and_shl1_mask_1); - - call __serpent_dec_blk16; - - store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_dec_16way) diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index f973ace44ad3..9cdf2c078e21 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -25,11 +24,6 @@ asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -68,22 +62,6 @@ static const struct common_glue_ctx serpent_ctr = { } } }; -static const struct common_glue_ctx serpent_enc_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = serpent_xts_enc_16way } - }, { - .num_blocks = 8, - .fn_u = { .xts = serpent_xts_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_enc } - } } -}; - static const struct common_glue_ctx serpent_dec = { .num_funcs = 3, .fpu_blocks_limit = 8, @@ -116,22 +94,6 @@ static const struct common_glue_ctx serpent_dec_cbc = { } } }; -static const struct common_glue_ctx serpent_dec_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = serpent_xts_dec_16way } - }, { - .num_blocks = 8, - .fn_u = { .xts = serpent_xts_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { return glue_ecb_req_128bit(&serpent_enc, req); @@ -157,26 +119,6 @@ static int ctr_crypt(struct skcipher_request *req) return glue_ctr_req_128bit(&serpent_ctr, req); } -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_enc_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_dec_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, true); -} - static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "__ecb(serpent)", @@ -220,20 +162,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, 
.encrypt = ctr_crypt, .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(serpent)", - .base.cra_driver_name = "__xts-serpent-avx2", - .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = SERPENT_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct serpent_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * SERPENT_MIN_KEY_SIZE, - .max_keysize = 2 * SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 7806d1cbe854..b17a08b57a91 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -36,14 +35,6 @@ asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src, le128 *iv); EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); -asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx); - -asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx); - void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; @@ -58,44 +49,12 @@ void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) } EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); -void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_encrypt); -} -EXPORT_SYMBOL_GPL(serpent_xts_enc); - -void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_decrypt); -} -EXPORT_SYMBOL_GPL(serpent_xts_dec); - static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} -EXPORT_SYMBOL_GPL(xts_serpent_setkey); - static const struct common_glue_ctx serpent_enc = { .num_funcs = 2, .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, @@ -122,19 +81,6 @@ static const struct common_glue_ctx serpent_ctr = { } } }; -static const struct common_glue_ctx serpent_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = serpent_xts_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_enc } - } } -}; - static const struct common_glue_ctx serpent_dec = { .num_funcs = 2, .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, @@ -161,19 +107,6 @@ static const struct common_glue_ctx serpent_dec_cbc = { } } }; -static const struct common_glue_ctx serpent_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = serpent_xts_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_dec } - } } -}; - static 
int ecb_encrypt(struct skcipher_request *req) { return glue_ecb_req_128bit(&serpent_enc, req); @@ -199,26 +132,6 @@ static int ctr_crypt(struct skcipher_request *req) return glue_ctr_req_128bit(&serpent_ctr, req); } -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_enc_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_dec_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, true); -} - static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "__ecb(serpent)", @@ -262,20 +175,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = ctr_crypt, .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(serpent)", - .base.cra_driver_name = "__xts-serpent-avx", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = SERPENT_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct serpent_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * SERPENT_MIN_KEY_SIZE, - .max_keysize = 2 * SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 251c2c89d7cf..23f3361a0e72 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -10,11 +10,6 @@ struct crypto_skcipher; #define SERPENT_PARALLEL_BLOCKS 8 -struct serpent_xts_ctx { - struct serpent_ctx tweak_ctx; - struct serpent_ctx crypt_ctx; -}; - asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, @@ -22,21 +17,5 @@ asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); - -extern void __serpent_crypt_ctr(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -extern void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -extern void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv); - -extern int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen); #endif -- cgit v1.2.3 From da4df93a94a5aa7c5a599959d79ee99cdbe4c6b7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:52 +0100 Subject: crypto: x86/twofish - switch to XTS template Now that the XTS template can wrap accelerated ECB modes, it can be used to implement Twofish in XTS mode as well, which turns out to be at least as fast, and sometimes even faster Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 53 ---------------- arch/x86/crypto/twofish_avx_glue.c | 98 ----------------------------- 2 files changed, 151 
deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index a5151393bb2f..84e61ef03638 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -19,11 +19,6 @@ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 - .text /* structure of crypto context */ @@ -406,51 +401,3 @@ SYM_FUNC_START(twofish_ctr_8way) FRAME_END ret; SYM_FUNC_END(twofish_ctr_8way) - -SYM_FUNC_START(twofish_xts_enc_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); - - call __twofish_enc_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); - - FRAME_END - ret; -SYM_FUNC_END(twofish_xts_enc_8way) - -SYM_FUNC_START(twofish_xts_dec_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2, - RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); - - call __twofish_dec_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - FRAME_END - ret; -SYM_FUNC_END(twofish_xts_dec_8way) diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 2dbc8ce3730e..7b539bbb108f 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -29,11 +28,6 @@ asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void twofish_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void twofish_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -45,40 +39,6 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) __twofish_enc_blk_3way(ctx, dst, src, false); } -static void twofish_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_enc_blk); -} - -static void twofish_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_dec_blk); -} - -struct twofish_xts_ctx { - struct twofish_ctx tweak_ctx; - struct twofish_ctx crypt_ctx; -}; - -static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 
2); -} - static const struct common_glue_ctx twofish_enc = { .num_funcs = 3, .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, @@ -111,19 +71,6 @@ static const struct common_glue_ctx twofish_ctr = { } } }; -static const struct common_glue_ctx twofish_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = twofish_xts_enc_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = twofish_xts_enc } - } } -}; - static const struct common_glue_ctx twofish_dec = { .num_funcs = 3, .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, @@ -156,19 +103,6 @@ static const struct common_glue_ctx twofish_dec_cbc = { } } }; -static const struct common_glue_ctx twofish_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = twofish_xts_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = twofish_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { return glue_ecb_req_128bit(&twofish_enc, req); @@ -194,24 +128,6 @@ static int ctr_crypt(struct skcipher_request *req) return glue_ctr_req_128bit(&twofish_ctr, req); } -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&twofish_enc_xts, req, twofish_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&twofish_dec_xts, req, twofish_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); -} - static struct skcipher_alg twofish_algs[] = { { .base.cra_name = "__ecb(twofish)", @@ -255,20 +171,6 @@ static struct skcipher_alg twofish_algs[] = { .setkey = twofish_setkey_skcipher, .encrypt = ctr_crypt, .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(twofish)", - .base.cra_driver_name = "__xts-twofish-avx", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = TF_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct twofish_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * TF_MIN_KEY_SIZE, - .max_keysize = 2 * TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = xts_twofish_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; -- cgit v1.2.3 From 31d49c448ab8556ce8d340eb28da2484e5b5629c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:53 +0100 Subject: crypto: x86/glue-helper - drop XTS helper routines The glue helper's XTS routines are no longer used, so drop them. 
Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/glue_helper-asm-avx.S | 59 ------------ arch/x86/crypto/glue_helper-asm-avx2.S | 78 --------------- arch/x86/crypto/glue_helper.c | 154 ------------------------------ arch/x86/include/asm/crypto/glue_helper.h | 12 --- 4 files changed, 303 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S index d08fc575ef7f..a94511432803 100644 --- a/arch/x86/crypto/glue_helper-asm-avx.S +++ b/arch/x86/crypto/glue_helper-asm-avx.S @@ -79,62 +79,3 @@ vpxor (6*16)(src), x6, x6; \ vpxor (7*16)(src), x7, x7; \ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \ - t1, xts_gf128mul_and_shl1_mask) \ - vmovdqa xts_gf128mul_and_shl1_mask, t0; \ - \ - /* load IV */ \ - vmovdqu (iv), tiv; \ - vpxor (0*16)(src), tiv, x0; \ - vmovdqu tiv, (0*16)(dst); \ - \ - /* construct and store IVs, also xor with source */ \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (1*16)(src), tiv, x1; \ - vmovdqu tiv, (1*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (2*16)(src), tiv, x2; \ - vmovdqu tiv, (2*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (3*16)(src), tiv, x3; \ - vmovdqu tiv, (3*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (4*16)(src), tiv, x4; \ - vmovdqu tiv, (4*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (5*16)(src), tiv, x5; \ - vmovdqu tiv, (5*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (6*16)(src), tiv, x6; \ - vmovdqu tiv, (6*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (7*16)(src), tiv, x7; \ - vmovdqu tiv, (7*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vmovdqu tiv, (iv); - -#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*16)(dst), x0, x0; \ - vpxor (1*16)(dst), x1, x1; \ - vpxor (2*16)(dst), x2, x2; \ - vpxor (3*16)(dst), x3, x3; \ - vpxor (4*16)(dst), x4, x4; \ - vpxor (5*16)(dst), x5, x5; \ - vpxor (6*16)(dst), x6, x6; \ - vpxor (7*16)(dst), x7, x7; \ - store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S index d84508c85c13..456bface1e5d 100644 --- a/arch/x86/crypto/glue_helper-asm-avx2.S +++ b/arch/x86/crypto/glue_helper-asm-avx2.S @@ -95,81 +95,3 @@ vpxor (6*32)(src), x6, x6; \ vpxor (7*32)(src), x7, x7; \ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ - vpsrad $31, iv, tmp0; \ - vpaddq iv, iv, tmp1; \ - vpsllq $2, iv, iv; \ - vpshufd $0x13, tmp0, tmp0; \ - vpsrad $31, tmp1, tmp1; \ - vpand mask2, tmp0, tmp0; \ - vpshufd $0x13, tmp1, tmp1; \ - vpxor tmp0, iv, iv; \ - vpand mask1, tmp1, tmp1; \ - vpxor tmp1, iv, iv; - -#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \ - tivx, t0, t0x, t1, t1x, t2, t2x, t3, \ - xts_gf128mul_and_shl1_mask_0, \ - xts_gf128mul_and_shl1_mask_1) \ - vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \ - \ - /* load IV and construct second IV */ \ - vmovdqu (iv), tivx; \ - vmovdqa tivx, t0x; \ - gf128mul_x_ble(tivx, t1x, t2x); \ - vbroadcasti128 
xts_gf128mul_and_shl1_mask_1, t2; \ - vinserti128 $1, tivx, t0, tiv; \ - vpxor (0*32)(src), tiv, x0; \ - vmovdqu tiv, (0*32)(dst); \ - \ - /* construct and store IVs, also xor with source */ \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (1*32)(src), tiv, x1; \ - vmovdqu tiv, (1*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (2*32)(src), tiv, x2; \ - vmovdqu tiv, (2*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (3*32)(src), tiv, x3; \ - vmovdqu tiv, (3*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (4*32)(src), tiv, x4; \ - vmovdqu tiv, (4*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (5*32)(src), tiv, x5; \ - vmovdqu tiv, (5*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (6*32)(src), tiv, x6; \ - vmovdqu tiv, (6*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (7*32)(src), tiv, x7; \ - vmovdqu tiv, (7*32)(dst); \ - \ - vextracti128 $1, tiv, tivx; \ - gf128mul_x_ble(tivx, t1x, t2x); \ - vmovdqu tivx, (iv); - -#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*32)(dst), x0, x0; \ - vpxor (1*32)(dst), x1, x1; \ - vpxor (2*32)(dst), x2, x2; \ - vpxor (3*32)(dst), x3, x3; \ - vpxor (4*32)(dst), x4, x4; \ - vpxor (5*32)(dst), x5, x5; \ - vpxor (6*32)(dst), x6, x6; \ - vpxor (7*32)(dst), x7, x7; \ - store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index d3d91a0abf88..786ffda1caf4 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -12,10 +12,8 @@ #include #include -#include #include #include -#include #include int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, @@ -226,156 +224,4 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, } EXPORT_SYMBOL_GPL(glue_ctr_req_128bit); -static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx, - void *ctx, - struct skcipher_walk *walk) -{ - const unsigned int bsize = 128 / 8; - unsigned int nbytes = walk->nbytes; - u128 *src = walk->src.virt.addr; - u128 *dst = walk->dst.virt.addr; - unsigned int num_blocks, func_bytes; - unsigned int i; - - /* Process multi-block batch */ - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes >= func_bytes) { - do { - gctx->funcs[i].fn_u.xts(ctx, (u8 *)dst, - (const u8 *)src, - walk->iv); - - src += num_blocks; - dst += num_blocks; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; - } - } - -done: - return nbytes; -} - -int glue_xts_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req, - common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx, bool decrypt) -{ - const bool cts = (req->cryptlen % XTS_BLOCK_SIZE); - const unsigned int bsize = 128 / 8; - struct skcipher_request subreq; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes, tail; - int err; - - if (req->cryptlen < XTS_BLOCK_SIZE) - return -EINVAL; - - if (unlikely(cts)) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - - tail = req->cryptlen % XTS_BLOCK_SIZE + XTS_BLOCK_SIZE; - - skcipher_request_set_tfm(&subreq, tfm); - skcipher_request_set_callback(&subreq, - crypto_skcipher_get_flags(tfm), - NULL, NULL); - skcipher_request_set_crypt(&subreq, req->src, req->dst, - req->cryptlen - tail, req->iv); - req = &subreq; - } - - err = skcipher_walk_virt(&walk, req, false); - nbytes = 
walk.nbytes; - if (err) - return err; - - /* set minimum length to bsize, for tweak_fn */ - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, - nbytes < bsize ? bsize : nbytes); - - /* calculate first value of T */ - tweak_fn(tweak_ctx, walk.iv, walk.iv); - - while (nbytes) { - nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk); - - err = skcipher_walk_done(&walk, nbytes); - nbytes = walk.nbytes; - } - - if (unlikely(cts)) { - u8 *next_tweak, *final_tweak = req->iv; - struct scatterlist *src, *dst; - struct scatterlist s[2], d[2]; - le128 b[2]; - - dst = src = scatterwalk_ffwd(s, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(d, req->dst, req->cryptlen); - - if (decrypt) { - next_tweak = memcpy(b, req->iv, XTS_BLOCK_SIZE); - gf128mul_x_ble(b, b); - } else { - next_tweak = req->iv; - } - - skcipher_request_set_crypt(&subreq, src, dst, XTS_BLOCK_SIZE, - next_tweak); - - err = skcipher_walk_virt(&walk, req, false) ?: - skcipher_walk_done(&walk, - __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); - if (err) - goto out; - - scatterwalk_map_and_copy(b, dst, 0, XTS_BLOCK_SIZE, 0); - memcpy(b + 1, b, tail - XTS_BLOCK_SIZE); - scatterwalk_map_and_copy(b, src, XTS_BLOCK_SIZE, - tail - XTS_BLOCK_SIZE, 0); - scatterwalk_map_and_copy(b, dst, 0, tail, 1); - - skcipher_request_set_crypt(&subreq, dst, dst, XTS_BLOCK_SIZE, - final_tweak); - - err = skcipher_walk_virt(&walk, req, false) ?: - skcipher_walk_done(&walk, - __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); - } - -out: - glue_fpu_end(fpu_enabled); - - return err; -} -EXPORT_SYMBOL_GPL(glue_xts_req_128bit); - -void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, const u8 *src, - le128 *iv, common_glue_func_t fn) -{ - le128 ivblk = *iv; - - /* generate next IV */ - gf128mul_x_ble(iv, &ivblk); - - /* CC <- T xor C */ - u128_xor((u128 *)dst, (const u128 *)src, (u128 *)&ivblk); - - /* PP <- D(Key2,CC) */ - fn(ctx, dst, dst); - - /* P <- T xor PP */ - u128_xor((u128 *)dst, (u128 *)dst, (u128 *)&ivblk); -} -EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); - MODULE_LICENSE("GPL"); diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 777c0f63418c..62680775d189 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -15,8 +15,6 @@ typedef void (*common_glue_func_t)(const void *ctx, u8 *dst, const u8 *src); typedef void (*common_glue_cbc_func_t)(const void *ctx, u8 *dst, const u8 *src); typedef void (*common_glue_ctr_func_t)(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -typedef void (*common_glue_xts_func_t)(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); struct common_glue_func_entry { unsigned int num_blocks; /* number of blocks that @fn will process */ @@ -24,7 +22,6 @@ struct common_glue_func_entry { common_glue_func_t ecb; common_glue_cbc_func_t cbc; common_glue_ctr_func_t ctr; - common_glue_xts_func_t xts; } fn_u; }; @@ -106,13 +103,4 @@ extern int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, extern int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, struct skcipher_request *req); -extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req, - common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx, bool decrypt); - -extern void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, - const u8 *src, le128 *iv, - common_glue_func_t fn); - #endif /* _CRYPTO_GLUE_HELPER_H */ -- cgit v1.2.3 From 
a1f91ecf812ac333ee2897f3eb2d8f4f6b4ce942 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:54 +0100 Subject: crypto: x86/camellia - drop CTR mode implementation Camellia in CTR mode is never used by the kernel directly, and is highly unlikely to be relied upon by dm-crypt or algif_skcipher. So let's drop the accelerated CTR mode implementation, and instead, rely on the CTR template and the bare cipher. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 117 ---------------------- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 144 --------------------------- arch/x86/crypto/camellia_aesni_avx2_glue.c | 41 -------- arch/x86/crypto/camellia_aesni_avx_glue.c | 40 -------- arch/x86/crypto/camellia_glue.c | 68 ------------- arch/x86/include/asm/crypto/camellia.h | 6 -- 6 files changed, 416 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 471c34e6cac2..e2a0e0f4bf9d 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -588,10 +588,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .long 0x80808080 .long 0x80808080 -/* For CTR-mode IV byteswap */ -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - /* * pre-SubByte transform * @@ -993,116 +989,3 @@ SYM_FUNC_START(camellia_cbc_dec_16way) FRAME_END ret; SYM_FUNC_END(camellia_cbc_dec_16way) - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -SYM_FUNC_START(camellia_ctr_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - subq $(16 * 16), %rsp; - movq %rsp, %rax; - - vmovdqa .Lbswap128_mask, %xmm14; - - /* load IV and byteswap */ - vmovdqu (%rcx), %xmm0; - vpshufb %xmm14, %xmm0, %xmm15; - vmovdqu %xmm15, 15 * 16(%rax); - - vpcmpeqd %xmm15, %xmm15, %xmm15; - vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ - - /* construct IVs */ - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm13; - vmovdqu %xmm13, 14 * 16(%rax); - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm13; - vmovdqu %xmm13, 13 * 16(%rax); - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm12; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm11; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm10; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm9; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm8; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm7; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm6; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm5; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm4; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm3; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm2; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm1; - inc_le128(%xmm0, %xmm15, %xmm13); - vmovdqa %xmm0, %xmm13; - vpshufb %xmm14, %xmm0, %xmm0; - inc_le128(%xmm13, %xmm15, %xmm14); - vmovdqu %xmm13, (%rcx); - - /* inpack16_pre: */ - vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap, %xmm15, %xmm15; - vpxor %xmm0, %xmm15, %xmm0; - vpxor %xmm1, %xmm15, %xmm1; - vpxor %xmm2, 
%xmm15, %xmm2; - vpxor %xmm3, %xmm15, %xmm3; - vpxor %xmm4, %xmm15, %xmm4; - vpxor %xmm5, %xmm15, %xmm5; - vpxor %xmm6, %xmm15, %xmm6; - vpxor %xmm7, %xmm15, %xmm7; - vpxor %xmm8, %xmm15, %xmm8; - vpxor %xmm9, %xmm15, %xmm9; - vpxor %xmm10, %xmm15, %xmm10; - vpxor %xmm11, %xmm15, %xmm11; - vpxor %xmm12, %xmm15, %xmm12; - vpxor 13 * 16(%rax), %xmm15, %xmm13; - vpxor 14 * 16(%rax), %xmm15, %xmm14; - vpxor 15 * 16(%rax), %xmm15, %xmm15; - - call __camellia_enc_blk16; - - addq $(16 * 16), %rsp; - - vpxor 0 * 16(%rdx), %xmm7, %xmm7; - vpxor 1 * 16(%rdx), %xmm6, %xmm6; - vpxor 2 * 16(%rdx), %xmm5, %xmm5; - vpxor 3 * 16(%rdx), %xmm4, %xmm4; - vpxor 4 * 16(%rdx), %xmm3, %xmm3; - vpxor 5 * 16(%rdx), %xmm2, %xmm2; - vpxor 6 * 16(%rdx), %xmm1, %xmm1; - vpxor 7 * 16(%rdx), %xmm0, %xmm0; - vpxor 8 * 16(%rdx), %xmm15, %xmm15; - vpxor 9 * 16(%rdx), %xmm14, %xmm14; - vpxor 10 * 16(%rdx), %xmm13, %xmm13; - vpxor 11 * 16(%rdx), %xmm12, %xmm12; - vpxor 12 * 16(%rdx), %xmm11, %xmm11; - vpxor 13 * 16(%rdx), %xmm10, %xmm10; - vpxor 14 * 16(%rdx), %xmm9, %xmm9; - vpxor 15 * 16(%rdx), %xmm8, %xmm8; - write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, - %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, - %xmm8, %rsi); - - FRAME_END - ret; -SYM_FUNC_END(camellia_ctr_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 9561dee52de0..782e9712a1ec 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -624,10 +624,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .section .rodata.cst16, "aM", @progbits, 16 .align 16 -/* For CTR-mode IV byteswap */ -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - /* * pre-SubByte transform * @@ -1054,143 +1050,3 @@ SYM_FUNC_START(camellia_cbc_dec_32way) FRAME_END ret; SYM_FUNC_END(camellia_cbc_dec_32way) - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ - vpcmpeqq minus_one, x, tmp1; \ - vpcmpeqq minus_two, x, tmp2; \ - vpsubq minus_two, x, x; \ - vpor tmp2, tmp1, tmp1; \ - vpslldq $8, tmp1, tmp1; \ - vpsubq tmp1, x, x; - -SYM_FUNC_START(camellia_ctr_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - vzeroupper; - - movq %rsp, %r10; - cmpq %rsi, %rdx; - je .Lctr_use_stack; - - /* dst can be used as temporary storage, src is not overwritten. 
*/ - movq %rsi, %rax; - jmp .Lctr_continue; - -.Lctr_use_stack: - subq $(16 * 32), %rsp; - movq %rsp, %rax; - -.Lctr_continue: - vpcmpeqd %ymm15, %ymm15, %ymm15; - vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ - vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */ - - /* load IV and byteswap */ - vmovdqu (%rcx), %xmm0; - vmovdqa %xmm0, %xmm1; - inc_le128(%xmm0, %xmm15, %xmm14); - vbroadcasti128 .Lbswap128_mask, %ymm14; - vinserti128 $1, %xmm0, %ymm1, %ymm0; - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 15 * 32(%rax); - - /* construct IVs */ - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */ - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 14 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 13 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 12 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 11 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm10; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm9; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm8; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm7; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm6; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm5; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm4; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm3; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm2; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm1; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vextracti128 $1, %ymm0, %xmm13; - vpshufb %ymm14, %ymm0, %ymm0; - inc_le128(%xmm13, %xmm15, %xmm14); - vmovdqu %xmm13, (%rcx); - - /* inpack32_pre: */ - vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap, %ymm15, %ymm15; - vpxor %ymm0, %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, %ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, %ymm15, %ymm10; - vpxor 11 * 32(%rax), %ymm15, %ymm11; - vpxor 12 * 32(%rax), %ymm15, %ymm12; - vpxor 13 * 32(%rax), %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - call __camellia_enc_blk32; - - movq %r10, %rsp; - - vpxor 0 * 32(%rdx), %ymm7, %ymm7; - vpxor 1 * 32(%rdx), %ymm6, %ymm6; - vpxor 2 * 32(%rdx), %ymm5, %ymm5; - vpxor 3 * 32(%rdx), %ymm4, %ymm4; - vpxor 4 * 32(%rdx), %ymm3, %ymm3; - vpxor 5 * 32(%rdx), %ymm2, %ymm2; - vpxor 6 * 32(%rdx), %ymm1, %ymm1; - vpxor 7 * 32(%rdx), %ymm0, %ymm0; - vpxor 8 * 32(%rdx), %ymm15, %ymm15; - vpxor 9 * 32(%rdx), %ymm14, %ymm14; - vpxor 10 * 32(%rdx), %ymm13, %ymm13; - vpxor 11 * 32(%rdx), %ymm12, %ymm12; - vpxor 12 * 32(%rdx), %ymm11, %ymm11; - vpxor 13 * 32(%rdx), %ymm10, %ymm10; - vpxor 14 * 32(%rdx), %ymm9, %ymm9; - vpxor 15 * 32(%rdx), %ymm8, %ymm8; - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroupper; - - FRAME_END - 
ret; -SYM_FUNC_END(camellia_ctr_32way) diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index d956d0473668..8f25a2a6222e 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -22,8 +22,6 @@ asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static const struct common_glue_ctx camellia_enc = { .num_funcs = 4, @@ -44,25 +42,6 @@ static const struct common_glue_ctx camellia_enc = { } } }; -static const struct common_glue_ctx camellia_ctr = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ctr = camellia_ctr_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = camellia_ctr_16way } - }, { - .num_blocks = 2, - .fn_u = { .ctr = camellia_crypt_ctr_2way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = camellia_crypt_ctr } - } } -}; - static const struct common_glue_ctx camellia_dec = { .num_funcs = 4, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, @@ -127,11 +106,6 @@ static int cbc_decrypt(struct skcipher_request *req) return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); } -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&camellia_ctr, req); -} - static struct skcipher_alg camellia_algs[] = { { .base.cra_name = "__ecb(camellia)", @@ -160,21 +134,6 @@ static struct skcipher_alg camellia_algs[] = { .setkey = camellia_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(camellia)", - .base.cra_driver_name = "__ctr-camellia-aesni-avx2", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct camellia_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .chunksize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 44614f8a452c..22a89cdfedfb 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -26,10 +26,6 @@ EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); -asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(camellia_ctr_16way); - static const struct common_glue_ctx camellia_enc = { .num_funcs = 3, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, @@ -46,22 +42,6 @@ static const struct common_glue_ctx camellia_enc = { } } }; -static const struct common_glue_ctx camellia_ctr = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = camellia_ctr_16way } - }, { - .num_blocks = 2, - .fn_u = { .ctr = camellia_crypt_ctr_2way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = camellia_crypt_ctr } - } } -}; - static const struct common_glue_ctx camellia_dec = { 
.num_funcs = 3, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, @@ -120,11 +100,6 @@ static int cbc_decrypt(struct skcipher_request *req) return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); } -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&camellia_ctr, req); -} - static struct skcipher_alg camellia_algs[] = { { .base.cra_name = "__ecb(camellia)", @@ -153,21 +128,6 @@ static struct skcipher_alg camellia_algs[] = { .setkey = camellia_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(camellia)", - .base.cra_driver_name = "__ctr-camellia-aesni", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct camellia_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .chunksize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 242c056e5fa8..fefeedf2b33d 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1274,42 +1274,6 @@ void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s) } EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way); -void camellia_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) - *dst = *src; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); -} -EXPORT_SYMBOL_GPL(camellia_crypt_ctr); - -void camellia_crypt_ctr_2way(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblks[2]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - } - - le128_to_be128(&ctrblks[0], iv); - le128_inc(iv); - le128_to_be128(&ctrblks[1], iv); - le128_inc(iv); - - camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); -} -EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way); - static const struct common_glue_ctx camellia_enc = { .num_funcs = 2, .fpu_blocks_limit = -1, @@ -1323,19 +1287,6 @@ static const struct common_glue_ctx camellia_enc = { } } }; -static const struct common_glue_ctx camellia_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .ctr = camellia_crypt_ctr_2way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = camellia_crypt_ctr } - } } -}; - static const struct common_glue_ctx camellia_dec = { .num_funcs = 2, .fpu_blocks_limit = -1, @@ -1382,11 +1333,6 @@ static int cbc_decrypt(struct skcipher_request *req) return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); } -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&camellia_ctr, req); -} - static struct crypto_alg camellia_cipher_alg = { .cra_name = "camellia", .cra_driver_name = "camellia-asm", @@ -1433,20 +1379,6 @@ static struct skcipher_alg camellia_skcipher_algs[] = { .setkey = camellia_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(camellia)", - .base.cra_driver_name = "ctr-camellia-asm", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct camellia_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = 
CAMELLIA_BLOCK_SIZE, - .chunksize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index 0e5f82adbaf9..1dcea79e8f8e 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h @@ -38,8 +38,6 @@ asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src) { @@ -65,9 +63,5 @@ static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst, /* glue helpers */ extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src); -extern void camellia_crypt_ctr(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -extern void camellia_crypt_ctr_2way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); #endif /* ASM_X86_CAMELLIA_H */ -- cgit v1.2.3 From 2e9440ae6eab492572463d8cb266381264867723 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:55 +0100 Subject: crypto: x86/serpent - drop CTR mode implementation Serpent in CTR mode is never used by the kernel directly, and is highly unlikely to be relied upon by dm-crypt or algif_skcipher. So let's drop the accelerated CTR mode implementation, and instead, rely on the CTR template and the bare cipher. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 20 --------- arch/x86/crypto/serpent-avx2-asm_64.S | 25 ----------- arch/x86/crypto/serpent_avx2_glue.c | 38 ---------------- arch/x86/crypto/serpent_avx_glue.c | 51 ---------------------- arch/x86/crypto/serpent_sse2_glue.c | 67 ----------------------------- 5 files changed, 201 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 6b41f46bcc76..b7ee24df7fba 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -711,23 +711,3 @@ SYM_FUNC_START(serpent_cbc_dec_8way_avx) FRAME_END ret; SYM_FUNC_END(serpent_cbc_dec_8way_avx) - -SYM_FUNC_START(serpent_ctr_8way_avx) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK1, RK2); - - call __serpent_enc_blk8_avx; - - store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - FRAME_END - ret; -SYM_FUNC_END(serpent_ctr_8way_avx) diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index a510a949f02f..9161b6e441f3 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -724,28 +724,3 @@ SYM_FUNC_START(serpent_cbc_dec_16way) FRAME_END ret; SYM_FUNC_END(serpent_cbc_dec_16way) - -SYM_FUNC_START(serpent_ctr_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - vzeroupper; - - load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, - tp); - - call 
__serpent_enc_blk16; - - store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(serpent_ctr_16way) diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 9cdf2c078e21..28e542c6512a 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -22,8 +22,6 @@ asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -46,22 +44,6 @@ static const struct common_glue_ctx serpent_enc = { } } }; -static const struct common_glue_ctx serpent_ctr = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ctr = serpent_ctr_16way } - }, { - .num_blocks = 8, - .fn_u = { .ctr = serpent_ctr_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ctr = __serpent_crypt_ctr } - } } -}; - static const struct common_glue_ctx serpent_dec = { .num_funcs = 3, .fpu_blocks_limit = 8, @@ -114,11 +96,6 @@ static int cbc_decrypt(struct skcipher_request *req) return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); } -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&serpent_ctr, req); -} - static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "__ecb(serpent)", @@ -147,21 +124,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(serpent)", - .base.cra_driver_name = "__ctr-serpent-avx2", - .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct serpent_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .chunksize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index b17a08b57a91..aa4605baf9d4 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -31,24 +31,6 @@ asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); -asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); - -void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, src, (u128 *)&ctrblk); -} -EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); - static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -68,19 +50,6 @@ static const struct common_glue_ctx serpent_enc = { } } }; -static const struct common_glue_ctx serpent_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = serpent_ctr_8way_avx } - }, { - .num_blocks 
= 1, - .fn_u = { .ctr = __serpent_crypt_ctr } - } } -}; - static const struct common_glue_ctx serpent_dec = { .num_funcs = 2, .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, @@ -127,11 +96,6 @@ static int cbc_decrypt(struct skcipher_request *req) return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); } -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&serpent_ctr, req); -} - static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "__ecb(serpent)", @@ -160,21 +124,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(serpent)", - .base.cra_driver_name = "__ctr-serpent-avx", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct serpent_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .chunksize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 4fed8d26b91a..9acb3bf28feb 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -10,8 +10,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten */ #include @@ -47,38 +45,6 @@ static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s) u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); } -static void serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, src, (u128 *)&ctrblk); -} - -static void serpent_crypt_ctr_xway(const void *ctx, u8 *d, const u8 *s, - le128 *iv) -{ - be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - unsigned int i; - - for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - le128_to_be128(&ctrblks[i], iv); - le128_inc(iv); - } - - serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); -} - static const struct common_glue_ctx serpent_enc = { .num_funcs = 2, .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, @@ -92,19 +58,6 @@ static const struct common_glue_ctx serpent_enc = { } } }; -static const struct common_glue_ctx serpent_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = serpent_crypt_ctr_xway } - }, { - .num_blocks = 1, - .fn_u = { .ctr = serpent_crypt_ctr } - } } -}; - static const struct common_glue_ctx serpent_dec = { .num_funcs = 2, .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, @@ -152,11 +105,6 @@ static int cbc_decrypt(struct skcipher_request *req) return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); } -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&serpent_ctr, req); -} - static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "__ecb(serpent)", @@ -185,21 +133,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - 
.base.cra_name = "__ctr(serpent)", - .base.cra_driver_name = "__ctr-serpent-sse2", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct serpent_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .chunksize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; -- cgit v1.2.3 From e2d60e2f597a5b2a0a8724989742784bb83ada5d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:56 +0100 Subject: crypto: x86/cast5 - drop CTR mode implementation CAST5 in CTR mode is never used by the kernel directly, and is highly unlikely to be relied upon by dm-crypt or algif_skcipher. So let's drop the accelerated CTR mode implementation, and instead, rely on the CTR template and the bare cipher. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/cast5_avx_glue.c | 103 --------------------------------------- 1 file changed, 103 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 384ccb00f9e1..e0d1c7903b29 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -23,8 +23,6 @@ asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); -asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src, - __be64 *iv); static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -214,92 +212,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static void ctr_crypt_final(struct skcipher_walk *walk, struct cast5_ctx *ctx) -{ - u8 *ctrblk = walk->iv; - u8 keystream[CAST5_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - __cast5_encrypt(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, CAST5_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct skcipher_walk *walk, - struct cast5_ctx *ctx) -{ - const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - - /* Process multi-block batch */ - if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { - do { - cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src, - (__be64 *)walk->iv); - - src += CAST5_PARALLEL_BLOCKS; - dst += CAST5_PARALLEL_BLOCKS; - nbytes -= bsize * CAST5_PARALLEL_BLOCKS; - } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - u64 ctrblk; - - if (dst != src) - *dst = *src; - - ctrblk = *(u64 *)walk->iv; - be64_add_cpu((__be64 *)walk->iv, 1); - - __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - *dst ^= ctrblk; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - return nbytes; -} - -static int ctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - bool fpu_enabled = false; - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { - fpu_enabled = 
cast5_fpu_begin(fpu_enabled, &walk, nbytes); - nbytes = __ctr_crypt(&walk, ctx); - err = skcipher_walk_done(&walk, nbytes); - } - - cast5_fpu_end(fpu_enabled); - - if (walk.nbytes) { - ctr_crypt_final(&walk, ctx); - err = skcipher_walk_done(&walk, 0); - } - - return err; -} - static struct skcipher_alg cast5_algs[] = { { .base.cra_name = "__ecb(cast5)", @@ -328,21 +240,6 @@ static struct skcipher_alg cast5_algs[] = { .setkey = cast5_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(cast5)", - .base.cra_driver_name = "__ctr-cast5-avx", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct cast5_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAST5_MIN_KEY_SIZE, - .max_keysize = CAST5_MAX_KEY_SIZE, - .ivsize = CAST5_BLOCK_SIZE, - .chunksize = CAST5_BLOCK_SIZE, - .setkey = cast5_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; -- cgit v1.2.3 From 7a6623cc6867b5f24f750a7c16b996b0cbbc63b5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:57 +0100 Subject: crypto: x86/cast6 - drop CTR mode implementation CAST6 in CTR mode is never used by the kernel directly, and is highly unlikely to be relied upon by dm-crypt or algif_skcipher. So let's drop the accelerated CTR mode implementation, and instead, rely on the CTR template and the bare cipher. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 28 ------------------ arch/x86/crypto/cast6_avx_glue.c | 48 ------------------------------- 2 files changed, 76 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 0c1ea836215a..fbddcecc3e3f 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -410,31 +410,3 @@ SYM_FUNC_START(cast6_cbc_dec_8way) FRAME_END ret; SYM_FUNC_END(cast6_cbc_dec_8way) - -SYM_FUNC_START(cast6_ctr_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - pushq %r12; - pushq %r15 - - movq %rdi, CTX; - movq %rsi, %r11; - movq %rdx, %r12; - - load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RX, RKR, RKM); - - call __cast6_enc_blk8; - - store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - popq %r15; - popq %r12; - FRAME_END - ret; -SYM_FUNC_END(cast6_ctr_8way) diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 5a21d3e9041c..790efcb6df3b 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -23,8 +23,6 @@ asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -32,19 +30,6 @@ static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, return cast6_setkey(&tfm->base, key, keylen); } -static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - __cast6_encrypt(ctx, (u8 
*)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, src, (u128 *)&ctrblk); -} - static const struct common_glue_ctx cast6_enc = { .num_funcs = 2, .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, @@ -58,19 +43,6 @@ static const struct common_glue_ctx cast6_enc = { } } }; -static const struct common_glue_ctx cast6_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ctr = cast6_ctr_8way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = cast6_crypt_ctr } - } } -}; - static const struct common_glue_ctx cast6_dec = { .num_funcs = 2, .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, @@ -117,11 +89,6 @@ static int cbc_decrypt(struct skcipher_request *req) return glue_cbc_decrypt_req_128bit(&cast6_dec_cbc, req); } -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&cast6_ctr, req); -} - static struct skcipher_alg cast6_algs[] = { { .base.cra_name = "__ecb(cast6)", @@ -150,21 +117,6 @@ static struct skcipher_alg cast6_algs[] = { .setkey = cast6_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(cast6)", - .base.cra_driver_name = "__ctr-cast6-avx", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct cast6_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAST6_MIN_KEY_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .chunksize = CAST6_BLOCK_SIZE, - .setkey = cast6_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; -- cgit v1.2.3 From f43dcaf2c97eae986378f12c46b27fe21f8a885b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:58 +0100 Subject: crypto: x86/twofish - drop CTR mode implementation Twofish in CTR mode is never used by the kernel directly, and is highly unlikely to be relied upon by dm-crypt or algif_skcipher. So let's drop the accelerated CTR mode implementation, and instead, rely on the CTR template and the bare cipher. 
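For callers nothing changes: a request for "ctr(twofish)" is now satisfied by the generic CTR template wrapped around the bare twofish cipher instead of the dropped asm driver. As a minimal sketch (illustration only, not part of this patch; the function and buffer names below are made up for the example), a kernel user would still drive it through the ordinary skcipher API:

#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>

static int ctr_twofish_demo(const u8 *key, unsigned int keylen, u8 *iv,
			    u8 *buf, unsigned int len)
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	/* resolved to the ctr template + twofish cipher now that the driver is gone */
	tfm = crypto_alloc_skcipher("ctr(twofish)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	/* encrypt buf in place; len need not be a multiple of the block size */
	sg_init_one(&sg, buf, len);
	skcipher_request_set_callback(req, 0, crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);
	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return err;
}
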
Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 27 ---------- arch/x86/crypto/twofish_avx_glue.c | 38 -------------- arch/x86/crypto/twofish_glue_3way.c | 78 ----------------------------- arch/x86/include/asm/crypto/twofish.h | 4 -- 4 files changed, 147 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 84e61ef03638..37e63b3c664e 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -374,30 +374,3 @@ SYM_FUNC_START(twofish_cbc_dec_8way) FRAME_END ret; SYM_FUNC_END(twofish_cbc_dec_8way) - -SYM_FUNC_START(twofish_ctr_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - pushq %r12; - - movq %rsi, %r11; - movq %rdx, %r12; - - load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RX0, RX1, RY0); - - call __twofish_enc_blk8; - - store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); - - popq %r12; - - FRAME_END - ret; -SYM_FUNC_END(twofish_ctr_8way) diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 7b539bbb108f..13f810b61034 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -25,8 +25,6 @@ asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -55,22 +53,6 @@ static const struct common_glue_ctx twofish_enc = { } } }; -static const struct common_glue_ctx twofish_ctr = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ctr = twofish_ctr_8way } - }, { - .num_blocks = 3, - .fn_u = { .ctr = twofish_enc_blk_ctr_3way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = twofish_enc_blk_ctr } - } } -}; - static const struct common_glue_ctx twofish_dec = { .num_funcs = 3, .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, @@ -123,11 +105,6 @@ static int cbc_decrypt(struct skcipher_request *req) return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); } -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&twofish_ctr, req); -} - static struct skcipher_alg twofish_algs[] = { { .base.cra_name = "__ecb(twofish)", @@ -156,21 +133,6 @@ static struct skcipher_alg twofish_algs[] = { .setkey = twofish_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(twofish)", - .base.cra_driver_name = "__ctr-twofish-avx", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct twofish_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .chunksize = TF_BLOCK_SIZE, - .setkey = twofish_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 768af6075479..88252370db0a 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ 
b/arch/x86/crypto/twofish_glue_3way.c @@ -30,12 +30,6 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) __twofish_enc_blk_3way(ctx, dst, src, false); } -static inline void twofish_enc_blk_xor_3way(const void *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_3way(ctx, dst, src, true); -} - void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s) { u128 ivs[2]; @@ -52,46 +46,6 @@ void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s) } EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -void twofish_enc_blk_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) - *dst = *src; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, dst, (u128 *)&ctrblk); -} -EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); - -void twofish_enc_blk_ctr_3way(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblks[3]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - } - - le128_to_be128(&ctrblks[0], iv); - le128_inc(iv); - le128_to_be128(&ctrblks[1], iv); - le128_inc(iv); - le128_to_be128(&ctrblks[2], iv); - le128_inc(iv); - - twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); -} -EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way); - static const struct common_glue_ctx twofish_enc = { .num_funcs = 2, .fpu_blocks_limit = -1, @@ -105,19 +59,6 @@ static const struct common_glue_ctx twofish_enc = { } } }; -static const struct common_glue_ctx twofish_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .ctr = twofish_enc_blk_ctr_3way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = twofish_enc_blk_ctr } - } } -}; - static const struct common_glue_ctx twofish_dec = { .num_funcs = 2, .fpu_blocks_limit = -1, @@ -164,11 +105,6 @@ static int cbc_decrypt(struct skcipher_request *req) return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); } -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&twofish_ctr, req); -} - static struct skcipher_alg tf_skciphers[] = { { .base.cra_name = "ecb(twofish)", @@ -195,20 +131,6 @@ static struct skcipher_alg tf_skciphers[] = { .setkey = twofish_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(twofish)", - .base.cra_driver_name = "ctr-twofish-3way", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct twofish_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .chunksize = TF_BLOCK_SIZE, - .setkey = twofish_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index 2c377a8042e1..12df400e6d53 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -17,9 +17,5 @@ asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src); /* helpers from twofish_x86_64-3way module */ extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src); -extern void twofish_enc_blk_ctr(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -extern void twofish_enc_blk_ctr_3way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); #endif /* ASM_X86_TWOFISH_H */ -- cgit v1.2.3 From 
89b7ba5c8b9b20df043bc7b1d60065589f4103c3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:47:59 +0100 Subject: crypto: x86/glue-helper - drop CTR helper routines The glue helper's CTR routines are no longer used, so drop them. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/glue_helper-asm-avx.S | 45 ------------------- arch/x86/crypto/glue_helper-asm-avx2.S | 58 ------------------------- arch/x86/crypto/glue_helper.c | 72 ------------------------------- arch/x86/include/asm/crypto/glue_helper.h | 32 -------------- 4 files changed, 207 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S index a94511432803..3da385271227 100644 --- a/arch/x86/crypto/glue_helper-asm-avx.S +++ b/arch/x86/crypto/glue_helper-asm-avx.S @@ -34,48 +34,3 @@ vpxor (5*16)(src), x6, x6; \ vpxor (6*16)(src), x7, x7; \ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \ - vpcmpeqd t0, t0, t0; \ - vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \ - vmovdqa bswap, t1; \ - \ - /* load IV and byteswap */ \ - vmovdqu (iv), x7; \ - vpshufb t1, x7, x0; \ - \ - /* construct IVs */ \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x1; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x2; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x3; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x4; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x5; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x6; \ - inc_le128(x7, t0, t2); \ - vmovdqa x7, t2; \ - vpshufb t1, x7, x7; \ - inc_le128(t2, t0, t1); \ - vmovdqu t2, (iv); - -#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*16)(src), x0, x0; \ - vpxor (1*16)(src), x1, x1; \ - vpxor (2*16)(src), x2, x2; \ - vpxor (3*16)(src), x3, x3; \ - vpxor (4*16)(src), x4, x4; \ - vpxor (5*16)(src), x5, x5; \ - vpxor (6*16)(src), x6, x6; \ - vpxor (7*16)(src), x7, x7; \ - store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S index 456bface1e5d..c77e9049431f 100644 --- a/arch/x86/crypto/glue_helper-asm-avx2.S +++ b/arch/x86/crypto/glue_helper-asm-avx2.S @@ -37,61 +37,3 @@ vpxor (5*32+16)(src), x6, x6; \ vpxor (6*32+16)(src), x7, x7; \ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ - vpcmpeqq minus_one, x, tmp1; \ - vpcmpeqq minus_two, x, tmp2; \ - vpsubq minus_two, x, x; \ - vpor tmp2, tmp1, tmp1; \ - vpslldq $8, tmp1, tmp1; \ - vpsubq tmp1, x, x; - -#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \ - t1x, t2, t2x, t3, t3x, t4, t5) \ - vpcmpeqd t0, t0, t0; \ - vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \ - vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\ - \ - /* load IV and byteswap */ \ - vmovdqu (iv), t2x; \ - vmovdqa t2x, t3x; \ - inc_le128(t2x, t0x, t1x); \ - vbroadcasti128 bswap, t1; \ - vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \ - vpshufb t1, t2, x0; \ - \ - /* construct IVs */ \ - add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \ - vpshufb t1, t2, x1; \ - add2_le128(t2, t0, t4, 
t3, t5); \ - vpshufb t1, t2, x2; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x3; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x4; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x5; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x6; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x7; \ - vextracti128 $1, t2, t2x; \ - inc_le128(t2x, t0x, t3x); \ - vmovdqu t2x, (iv); - -#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*32)(src), x0, x0; \ - vpxor (1*32)(src), x1, x1; \ - vpxor (2*32)(src), x2, x2; \ - vpxor (3*32)(src), x3, x3; \ - vpxor (4*32)(src), x4, x4; \ - vpxor (5*32)(src), x5, x5; \ - vpxor (6*32)(src), x6, x6; \ - vpxor (7*32)(src), x7, x7; \ - store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 786ffda1caf4..895d34150c3f 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -6,8 +6,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten */ #include @@ -154,74 +152,4 @@ done: } EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit); -int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= bsize) { - const u128 *src = walk.src.virt.addr; - u128 *dst = walk.dst.virt.addr; - unsigned int func_bytes, num_blocks; - unsigned int i; - le128 ctrblk; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); - - be128_to_le128(&ctrblk, (be128 *)walk.iv); - - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes < func_bytes) - continue; - - /* Process multi-block batch */ - do { - gctx->funcs[i].fn_u.ctr(ctx, (u8 *)dst, - (const u8 *)src, - &ctrblk); - src += num_blocks; - dst += num_blocks; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - break; - } - - le128_to_be128((be128 *)walk.iv, &ctrblk); - err = skcipher_walk_done(&walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - - if (nbytes) { - le128 ctrblk; - u128 tmp; - - be128_to_le128(&ctrblk, (be128 *)walk.iv); - memcpy(&tmp, walk.src.virt.addr, nbytes); - gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, (u8 *)&tmp, - (const u8 *)&tmp, - &ctrblk); - memcpy(walk.dst.virt.addr, &tmp, nbytes); - le128_to_be128((be128 *)walk.iv, &ctrblk); - - err = skcipher_walk_done(&walk, 0); - } - - return err; -} -EXPORT_SYMBOL_GPL(glue_ctr_req_128bit); - MODULE_LICENSE("GPL"); diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 62680775d189..23e09efd2aa6 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -9,19 +9,15 @@ #include #include #include -#include typedef void (*common_glue_func_t)(const void *ctx, u8 *dst, const u8 *src); typedef void (*common_glue_cbc_func_t)(const void *ctx, u8 *dst, const u8 *src); -typedef void (*common_glue_ctr_func_t)(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); struct common_glue_func_entry { unsigned int num_blocks; /* number of blocks that @fn 
will process */ union { common_glue_func_t ecb; common_glue_cbc_func_t cbc; - common_glue_ctr_func_t ctr; } fn_u; }; @@ -66,31 +62,6 @@ static inline void glue_fpu_end(bool fpu_enabled) kernel_fpu_end(); } -static inline void le128_to_be128(be128 *dst, const le128 *src) -{ - dst->a = cpu_to_be64(le64_to_cpu(src->a)); - dst->b = cpu_to_be64(le64_to_cpu(src->b)); -} - -static inline void be128_to_le128(le128 *dst, const be128 *src) -{ - dst->a = cpu_to_le64(be64_to_cpu(src->a)); - dst->b = cpu_to_le64(be64_to_cpu(src->b)); -} - -static inline void le128_inc(le128 *i) -{ - u64 a = le64_to_cpu(i->a); - u64 b = le64_to_cpu(i->b); - - b++; - if (!b) - a++; - - i->a = cpu_to_le64(a); - i->b = cpu_to_le64(b); -} - extern int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, struct skcipher_request *req); @@ -100,7 +71,4 @@ extern int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn, extern int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, struct skcipher_request *req); -extern int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req); - #endif /* _CRYPTO_GLUE_HELPER_H */ -- cgit v1.2.3 From 768db5fee3bb338174cd078878d3c4ff815a7fcf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:00 +0100 Subject: crypto: x86/des - drop CTR mode implementation DES or Triple DES in counter mode is never used in the kernel, so there is no point in keeping an accelerated implementation around. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/des3_ede_glue.c | 104 ---------------------------------------- 1 file changed, 104 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 89830e531350..e7cb68a3db3b 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -6,8 +6,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 
2007 - Joy Latten */ #include @@ -253,94 +251,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, - struct skcipher_walk *walk) -{ - u8 *ctrblk = walk->iv; - u8 keystream[DES3_EDE_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - des3_ede_enc_blk(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct des3_ede_x86_ctx *ctx, - struct skcipher_walk *walk) -{ - unsigned int bsize = DES3_EDE_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - __be64 *src = (__be64 *)walk->src.virt.addr; - __be64 *dst = (__be64 *)walk->dst.virt.addr; - u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); - __be64 ctrblocks[3]; - - /* Process four block batch */ - if (nbytes >= bsize * 3) { - do { - /* create ctrblks for parallel encrypt */ - ctrblocks[0] = cpu_to_be64(ctrblk++); - ctrblocks[1] = cpu_to_be64(ctrblk++); - ctrblocks[2] = cpu_to_be64(ctrblk++); - - des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks, - (u8 *)ctrblocks); - - dst[0] = src[0] ^ ctrblocks[0]; - dst[1] = src[1] ^ ctrblocks[1]; - dst[2] = src[2] ^ ctrblocks[2]; - - src += 3; - dst += 3; - } while ((nbytes -= bsize * 3) >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - ctrblocks[0] = cpu_to_be64(ctrblk++); - - des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - - dst[0] = src[0] ^ ctrblocks[0]; - - src += 1; - dst += 1; - } while ((nbytes -= bsize) >= bsize); - -done: - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); - return nbytes; -} - -static int ctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) { - nbytes = __ctr_crypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - if (nbytes) { - ctr_crypt_final(ctx, &walk); - err = skcipher_walk_done(&walk, 0); - } - - return err; -} - static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { @@ -428,20 +338,6 @@ static struct skcipher_alg des3_ede_skciphers[] = { .setkey = des3_ede_x86_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(des3_ede)", - .base.cra_driver_name = "ctr-des3_ede-asm", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .ivsize = DES3_EDE_BLOCK_SIZE, - .chunksize = DES3_EDE_BLOCK_SIZE, - .setkey = des3_ede_x86_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; -- cgit v1.2.3 From c0a64926c53e05fc6f69c7d632967606defe5f61 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:01 +0100 Subject: crypto: x86/blowfish - drop CTR mode implementation Blowfish in counter mode is never used in the kernel, so there is no point in keeping an accelerated implementation around. 
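Per block, the generic "ctr" template that takes over does exactly what the removed helpers did by hand: encrypt the counter block, XOR the resulting keystream into the data, and increment the counter. A simplified restatement follows (illustration only, not the actual crypto/ctr.c code; the helper name is made up, and the header providing the bare-cipher API differs between kernel versions):

#include <crypto/algapi.h>
#include <linux/crypto.h>

static void ctr_one_block(struct crypto_cipher *cipher, u8 *ctrblk,
			  u8 *dst, const u8 *src, unsigned int bsize)
{
	u8 keystream[MAX_CIPHER_BLOCKSIZE];	/* >= 8 bytes needed for blowfish */

	/* E_K(counter) -> keystream */
	crypto_cipher_encrypt_one(cipher, keystream, ctrblk);
	/* dst = src ^ keystream; also covers a short final block */
	crypto_xor_cpy(dst, src, keystream, bsize);
	/* big-endian increment of the counter block */
	crypto_inc(ctrblk, bsize);
}
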
Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish_glue.c | 107 ---------------------------------------- 1 file changed, 107 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index cedfdba69ce3..a880e0b1c255 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -6,8 +6,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten */ #include @@ -247,97 +245,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static void ctr_crypt_final(struct bf_ctx *ctx, struct skcipher_walk *walk) -{ - u8 *ctrblk = walk->iv; - u8 keystream[BF_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, BF_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct bf_ctx *ctx, struct skcipher_walk *walk) -{ - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); - __be64 ctrblocks[4]; - - /* Process four block batch */ - if (nbytes >= bsize * 4) { - do { - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - } - - /* create ctrblks for parallel encrypt */ - ctrblocks[0] = cpu_to_be64(ctrblk++); - ctrblocks[1] = cpu_to_be64(ctrblk++); - ctrblocks[2] = cpu_to_be64(ctrblk++); - ctrblocks[3] = cpu_to_be64(ctrblk++); - - blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += 4; - dst += 4; - } while ((nbytes -= bsize * 4) >= bsize * 4); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - ctrblocks[0] = cpu_to_be64(ctrblk++); - - blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks); - - src += 1; - dst += 1; - } while ((nbytes -= bsize) >= bsize); - -done: - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); - return nbytes; -} - -static int ctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { - nbytes = __ctr_crypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - if (nbytes) { - ctr_crypt_final(ctx, &walk); - err = skcipher_walk_done(&walk, 0); - } - - return err; -} - static struct crypto_alg bf_cipher_alg = { .cra_name = "blowfish", .cra_driver_name = "blowfish-asm", @@ -384,20 +291,6 @@ static struct skcipher_alg bf_skcipher_algs[] = { .setkey = blowfish_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(blowfish)", - .base.cra_driver_name = "ctr-blowfish-asm", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct bf_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .chunksize = BF_BLOCK_SIZE, - .setkey = blowfish_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; -- cgit v1.2.3 From 827ee47228a6bfa446ddb81999adf400ae901106 
Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:02 +0100 Subject: crypto: x86 - add some helper macros for ECB and CBC modes The x86 glue helper module is starting to show its age: - It relies heavily on function pointers to invoke asm helper functions that operate on fixed input sizes that are relatively small. This means the performance is severely impacted by retpolines. - It goes to great lengths to amortize the cost of kernel_fpu_begin()/end() over as much work as possible, which is no longer necessary now that FPU save/restore is done lazily, and doing so may cause unbounded scheduling blackouts due to the fact that enabling the FPU in kernel mode disables preemption. - The CBC mode decryption helper makes backward strides through the input, in order to avoid a single block size memcpy() between chunks. Consuming the input in this manner is highly likely to defeat any hardware prefetchers, so it is better to go through the data linearly, and perform the extra memcpy() where needed (which is turned into direct loads and stores by the compiler anyway). Note that benchmarks won't show this effect, given that the memory they use is always cache hot. - It implements blockwise XOR in terms of le128 pointers, which imply an alignment that is not guaranteed by the API, violating the C standard. GCC does not seem to be smart enough to elide the indirect calls when the function pointers are passed as arguments to static inline helper routines modeled after the existing ones. So instead, let's create some CPP macros that encapsulate the core of the ECB and CBC processing, so we can wire them up for existing users of the glue helper module, i.e., Camellia, Serpent, Twofish and CAST6. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/ecb_cbc_helpers.h | 76 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 arch/x86/crypto/ecb_cbc_helpers.h (limited to 'arch') diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h new file mode 100644 index 000000000000..eaa15c7b29d6 --- /dev/null +++ b/arch/x86/crypto/ecb_cbc_helpers.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _CRYPTO_ECB_CBC_HELPER_H +#define _CRYPTO_ECB_CBC_HELPER_H + +#include +#include + +/* + * Mode helpers to instantiate parameterized skcipher ECB/CBC modes without + * having to rely on indirect calls and retpolines. 
+ */ + +#define ECB_WALK_START(req, bsize, fpu_blocks) do { \ + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \ + const int __bsize = (bsize); \ + struct skcipher_walk walk; \ + int err = skcipher_walk_virt(&walk, (req), false); \ + while (walk.nbytes > 0) { \ + unsigned int nbytes = walk.nbytes; \ + bool do_fpu = (fpu_blocks) != -1 && \ + nbytes >= (fpu_blocks) * __bsize; \ + const u8 *src = walk.src.virt.addr; \ + u8 *dst = walk.dst.virt.addr; \ + u8 __maybe_unused buf[(bsize)]; \ + if (do_fpu) kernel_fpu_begin() + +#define CBC_WALK_START(req, bsize, fpu_blocks) \ + ECB_WALK_START(req, bsize, fpu_blocks) + +#define ECB_WALK_ADVANCE(blocks) do { \ + dst += (blocks) * __bsize; \ + src += (blocks) * __bsize; \ + nbytes -= (blocks) * __bsize; \ +} while (0) + +#define ECB_BLOCK(blocks, func) do { \ + while (nbytes >= (blocks) * __bsize) { \ + (func)(ctx, dst, src); \ + ECB_WALK_ADVANCE(blocks); \ + } \ +} while (0) + +#define CBC_ENC_BLOCK(func) do { \ + const u8 *__iv = walk.iv; \ + while (nbytes >= __bsize) { \ + crypto_xor_cpy(dst, src, __iv, __bsize); \ + (func)(ctx, dst, dst); \ + __iv = dst; \ + ECB_WALK_ADVANCE(1); \ + } \ + memcpy(walk.iv, __iv, __bsize); \ +} while (0) + +#define CBC_DEC_BLOCK(blocks, func) do { \ + while (nbytes >= (blocks) * __bsize) { \ + const u8 *__iv = src + ((blocks) - 1) * __bsize; \ + if (dst == src) \ + __iv = memcpy(buf, __iv, __bsize); \ + (func)(ctx, dst, src); \ + crypto_xor(dst, walk.iv, __bsize); \ + memcpy(walk.iv, __iv, __bsize); \ + ECB_WALK_ADVANCE(blocks); \ + } \ +} while (0) + +#define ECB_WALK_END() \ + if (do_fpu) kernel_fpu_end(); \ + err = skcipher_walk_done(&walk, nbytes); \ + } \ + return err; \ +} while (0) + +#define CBC_WALK_END() ECB_WALK_END() + +#endif -- cgit v1.2.3 From 407d409a8102a5ba042215aed7b2ef2d6e6c67a8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:03 +0100 Subject: crypto: x86/camellia - drop dependency on glue helper Replace the glue helper dependency with implementations of ECB and CBC based on the new CPP macros, which avoid the need for indirect calls. 
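To make the effect concrete, here is a rough hand expansion (illustration only, written out by hand rather than by the preprocessor, with the 16-way and 2-way inner loops elided) of what the macro-based ecb_encrypt() in the AVX2 hunk below boils down to: a plain skcipher walk with direct calls into the asm helpers.

static int ecb_encrypt_expanded(struct skcipher_request *req)
{
	void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
	struct skcipher_walk walk;
	int err = skcipher_walk_virt(&walk, req, false);

	while (walk.nbytes > 0) {
		unsigned int nbytes = walk.nbytes;
		const u8 *src = walk.src.virt.addr;
		u8 *dst = walk.dst.virt.addr;
		/* only enter FPU context when a SIMD-sized batch is available */
		bool do_fpu = nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS *
				       CAMELLIA_BLOCK_SIZE;

		if (do_fpu)
			kernel_fpu_begin();

		/* widest batch first; every call is direct, so no retpolines */
		while (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS *
				 CAMELLIA_BLOCK_SIZE) {
			camellia_ecb_enc_32way(ctx, dst, src);
			dst += CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * CAMELLIA_BLOCK_SIZE;
			src += CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * CAMELLIA_BLOCK_SIZE;
			nbytes -= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * CAMELLIA_BLOCK_SIZE;
		}

		/* ... 16-way and 2-way batches are handled the same way ... */

		while (nbytes >= CAMELLIA_BLOCK_SIZE) {
			camellia_enc_blk(ctx, dst, src);
			dst += CAMELLIA_BLOCK_SIZE;
			src += CAMELLIA_BLOCK_SIZE;
			nbytes -= CAMELLIA_BLOCK_SIZE;
		}

		if (do_fpu)
			kernel_fpu_end();
		err = skcipher_walk_done(&walk, nbytes);
	}
	return err;
}
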
Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 85 ++++++++---------------------- arch/x86/crypto/camellia_aesni_avx_glue.c | 73 +++++++------------------ arch/x86/crypto/camellia_glue.c | 75 +++++++++----------------- 3 files changed, 67 insertions(+), 166 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 8f25a2a6222e..ef5c0f094584 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -6,7 +6,6 @@ */ #include -#include #include #include #include @@ -14,6 +13,8 @@ #include #include +#include "ecb_cbc_helpers.h" + #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 @@ -23,63 +24,6 @@ asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src); -static const struct common_glue_ctx camellia_enc = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_enc_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_enc_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_enc_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_enc_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_dec_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_dec_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_cbc = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .cbc = camellia_cbc_dec_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = camellia_cbc_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .cbc = camellia_decrypt_cbc_2way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = camellia_dec_blk } - } } -}; - static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -88,22 +32,39 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_enc, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_enc_32way); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way); + ECB_BLOCK(2, camellia_enc_blk_2way); + ECB_BLOCK(1, camellia_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_dec, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_dec_32way); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way); + ECB_BLOCK(2, camellia_dec_blk_2way); + ECB_BLOCK(1, camellia_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - 
return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(camellia_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_cbc_dec_32way); + CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way); + CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); + CBC_DEC_BLOCK(1, camellia_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg camellia_algs[] = { diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 22a89cdfedfb..68fed0a79889 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -6,7 +6,6 @@ */ #include -#include #include #include #include @@ -14,6 +13,8 @@ #include #include +#include "ecb_cbc_helpers.h" + #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 /* 16-way parallel cipher functions (avx/aes-ni) */ @@ -26,54 +27,6 @@ EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); -static const struct common_glue_ctx camellia_enc = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_enc_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_enc_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_enc_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_dec_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_cbc = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = camellia_cbc_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .cbc = camellia_decrypt_cbc_2way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = camellia_dec_blk } - } } -}; - static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -82,22 +35,36 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_enc, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way); + ECB_BLOCK(2, camellia_enc_blk_2way); + ECB_BLOCK(1, camellia_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_dec, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way); + ECB_BLOCK(2, camellia_dec_blk_2way); + ECB_BLOCK(1, camellia_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(camellia_enc_blk); + CBC_WALK_END(); } static int 
cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way); + CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); + CBC_DEC_BLOCK(1, camellia_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg camellia_algs[] = { diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index fefeedf2b33d..0bc00ce68484 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -15,7 +15,8 @@ #include #include #include -#include + +#include "ecb_cbc_helpers.h" /* regular block cipher functions */ asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, @@ -1262,75 +1263,47 @@ static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, return camellia_setkey(&tfm->base, key, key_len); } -void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s) +void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src) { - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - u128 iv = *src; - - camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); + u8 buf[CAMELLIA_BLOCK_SIZE]; + const u8 *iv = src; - u128_xor(&dst[1], &dst[1], &iv); + if (dst == src) + iv = memcpy(buf, iv, sizeof(buf)); + camellia_dec_blk_2way(ctx, dst, src); + crypto_xor(dst + CAMELLIA_BLOCK_SIZE, iv, CAMELLIA_BLOCK_SIZE); } EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way); -static const struct common_glue_ctx camellia_enc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .ecb = camellia_enc_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_enc_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .ecb = camellia_dec_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .cbc = camellia_decrypt_cbc_2way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = camellia_dec_blk } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_enc, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + ECB_BLOCK(2, camellia_enc_blk_2way); + ECB_BLOCK(1, camellia_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_dec, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + ECB_BLOCK(2, camellia_dec_blk_2way); + ECB_BLOCK(1, camellia_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(camellia_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); + CBC_DEC_BLOCK(1, camellia_dec_blk); + CBC_WALK_END(); } static struct crypto_alg camellia_cipher_alg = { -- cgit v1.2.3 From 9ad58b46f814edd5b8b288b66f94cf57c97eaea3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:04 +0100 Subject: crypto: x86/serpent - drop dependency on glue helper Replace the glue 
helper dependency with implementations of ECB and CBC based on the new CPP macros, which avoid the need for indirect calls. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_avx2_glue.c | 73 +++++++++------------------------ arch/x86/crypto/serpent_avx_glue.c | 61 ++++++++-------------------- arch/x86/crypto/serpent_sse2_glue.c | 81 +++++++++++-------------------------- 3 files changed, 61 insertions(+), 154 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 28e542c6512a..261c9ac2d762 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -12,9 +12,10 @@ #include #include #include -#include #include +#include "ecb_cbc_helpers.h" + #define SERPENT_AVX2_PARALLEL_BLOCKS 16 /* 16-way AVX2 parallel cipher functions */ @@ -28,72 +29,38 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -static const struct common_glue_ctx serpent_enc = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = serpent_ecb_enc_16way } - }, { - .num_blocks = 8, - .fn_u = { .ecb = serpent_ecb_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_encrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = serpent_ecb_dec_16way } - }, { - .num_blocks = 8, - .fn_u = { .ecb = serpent_ecb_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_cbc = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .cbc = serpent_cbc_dec_16way } - }, { - .num_blocks = 8, - .fn_u = { .cbc = serpent_cbc_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __serpent_decrypt } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_enc, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_enc_16way); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); + ECB_BLOCK(1, __serpent_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_dec, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_dec_16way); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); + ECB_BLOCK(1, __serpent_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__serpent_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_cbc_dec_16way); + CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); + CBC_DEC_BLOCK(1, __serpent_decrypt); + CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index aa4605baf9d4..5fe01d2a5b1d 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ 
b/arch/x86/crypto/serpent_avx_glue.c @@ -15,9 +15,10 @@ #include #include #include -#include #include +#include "ecb_cbc_helpers.h" + /* 8-way parallel cipher functions */ asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, const u8 *src); @@ -37,63 +38,35 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -static const struct common_glue_ctx serpent_enc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_ecb_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_encrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_ecb_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = serpent_cbc_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __serpent_decrypt } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_enc, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); + ECB_BLOCK(1, __serpent_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_dec, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); + ECB_BLOCK(1, __serpent_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__serpent_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); + CBC_DEC_BLOCK(1, __serpent_decrypt); + CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 9acb3bf28feb..e28d60949c16 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -21,7 +21,8 @@ #include #include #include -#include + +#include "ecb_cbc_helpers.h" static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -29,80 +30,46 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s) +static void serpent_decrypt_cbc_xway(const void *ctx, u8 *dst, const u8 *src) { - u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - unsigned int j; - - for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) - ivs[j] = src[j]; + u8 buf[SERPENT_PARALLEL_BLOCKS - 1][SERPENT_BLOCK_SIZE]; + const u8 *s = src; - serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) - 
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); + if (dst == src) + s = memcpy(buf, src, sizeof(buf)); + serpent_dec_blk_xway(ctx, dst, src); + crypto_xor(dst + SERPENT_BLOCK_SIZE, s, sizeof(buf)); } -static const struct common_glue_ctx serpent_enc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_enc_blk_xway } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_encrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_dec_blk_xway } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = serpent_decrypt_cbc_xway } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __serpent_decrypt } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_enc, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_enc_blk_xway); + ECB_BLOCK(1, __serpent_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_dec, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_dec_blk_xway); + ECB_BLOCK(1, __serpent_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__serpent_encrypt, - req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__serpent_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_decrypt_cbc_xway); + CBC_DEC_BLOCK(1, __serpent_decrypt); + CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { -- cgit v1.2.3 From 674d40abac42d502e226da6045fad61d7206e5fb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:05 +0100 Subject: crypto: x86/cast5 - drop dependency on glue helper Replace the glue helper dependency with implementations of ECB and CBC based on the new CPP macros, which avoid the need for indirect calls. 
Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/cast5_avx_glue.c | 184 ++++----------------------------------- 1 file changed, 17 insertions(+), 167 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index e0d1c7903b29..3976a87f92ad 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -6,7 +6,6 @@ * */ -#include #include #include #include @@ -15,6 +14,8 @@ #include #include +#include "ecb_cbc_helpers.h" + #define CAST5_PARALLEL_BLOCKS 16 asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst, @@ -30,186 +31,35 @@ static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, return cast5_setkey(&tfm->base, key, keylen); } -static inline bool cast5_fpu_begin(bool fpu_enabled, struct skcipher_walk *walk, - unsigned int nbytes) -{ - return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS, - walk, fpu_enabled, nbytes); -} - -static inline void cast5_fpu_end(bool fpu_enabled) -{ - return glue_fpu_end(fpu_enabled); -} - -static int ecb_crypt(struct skcipher_request *req, bool enc) -{ - bool fpu_enabled = false; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes; - void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src); - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - u8 *wsrc = walk.src.virt.addr; - u8 *wdst = walk.dst.virt.addr; - - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); - - /* Process multi-block batch */ - if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { - fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way; - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize * CAST5_PARALLEL_BLOCKS; - wdst += bsize * CAST5_PARALLEL_BLOCKS; - nbytes -= bsize * CAST5_PARALLEL_BLOCKS; - } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - fn = (enc) ? 
__cast5_encrypt : __cast5_decrypt; - - /* Handle leftovers */ - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = skcipher_walk_done(&walk, nbytes); - } - - cast5_fpu_end(fpu_enabled); - return err; -} - static int ecb_encrypt(struct skcipher_request *req) { - return ecb_crypt(req, true); + ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); + ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_enc_16way); + ECB_BLOCK(1, __cast5_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return ecb_crypt(req, false); + ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); + ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_dec_16way); + ECB_BLOCK(1, __cast5_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - const unsigned int bsize = CAST5_BLOCK_SIZE; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - u64 *src = (u64 *)walk.src.virt.addr; - u64 *dst = (u64 *)walk.dst.virt.addr; - u64 *iv = (u64 *)walk.iv; - - do { - *dst = *src ^ *iv; - __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - src++; - dst++; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u64 *)walk.iv = *iv; - err = skcipher_walk_done(&walk, nbytes); - } - - return err; -} - -static unsigned int __cbc_decrypt(struct cast5_ctx *ctx, - struct skcipher_walk *walk) -{ - const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 last_iv; - - /* Start of the last block. 
*/ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process multi-block batch */ - if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1); - src -= CAST5_PARALLEL_BLOCKS - 1; - dst -= CAST5_PARALLEL_BLOCKS - 1; - - cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - } - - /* Handle leftovers */ - for (;;) { - __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } - -done: - *dst ^= *(u64 *)walk->iv; - *(u64 *)walk->iv = last_iv; - - return nbytes; + CBC_WALK_START(req, CAST5_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__cast5_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - bool fpu_enabled = false; - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); - nbytes = __cbc_decrypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - cast5_fpu_end(fpu_enabled); - return err; + CBC_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_cbc_dec_16way); + CBC_DEC_BLOCK(1, __cast5_decrypt); + CBC_WALK_END(); } static struct skcipher_alg cast5_algs[] = { -- cgit v1.2.3 From ea55cfc3f920c95ee8d01ddc51e586b09a1194ee Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:06 +0100 Subject: crypto: x86/cast6 - drop dependency on glue helper Replace the glue helper dependency with implementations of ECB and CBC based on the new CPP macros, which avoid the need for indirect calls. 
Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/cast6_avx_glue.c | 61 +++++++++++----------------------------- 1 file changed, 17 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 790efcb6df3b..7e2aea372349 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -15,7 +15,8 @@ #include #include #include -#include + +#include "ecb_cbc_helpers.h" #define CAST6_PARALLEL_BLOCKS 8 @@ -30,63 +31,35 @@ static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, return cast6_setkey(&tfm->base, key, keylen); } -static const struct common_glue_ctx cast6_enc = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = cast6_ecb_enc_8way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __cast6_encrypt } - } } -}; - -static const struct common_glue_ctx cast6_dec = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = cast6_ecb_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __cast6_decrypt } - } } -}; - -static const struct common_glue_ctx cast6_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .cbc = cast6_cbc_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __cast6_decrypt } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&cast6_enc, req); + ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); + ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_enc_8way); + ECB_BLOCK(1, __cast6_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&cast6_dec, req); + ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); + ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_dec_8way); + ECB_BLOCK(1, __cast6_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__cast6_encrypt, req); + CBC_WALK_START(req, CAST6_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__cast6_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&cast6_dec_cbc, req); + CBC_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_cbc_dec_8way); + CBC_DEC_BLOCK(1, __cast6_decrypt); + CBC_WALK_END(); } static struct skcipher_alg cast6_algs[] = { -- cgit v1.2.3 From 165f357334cc92435aa9b5c9161567e0d0ab8f2a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:07 +0100 Subject: crypto: x86/twofish - drop dependency on glue helper Replace the glue helper dependency with implementations of ECB and CBC based on the new CPP macros, which avoid the need for indirect calls. 
Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_avx_glue.c | 73 ++++++++++----------------------- arch/x86/crypto/twofish_glue_3way.c | 80 +++++++++++-------------------------- 2 files changed, 44 insertions(+), 109 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 13f810b61034..6ce198f808a5 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -15,9 +15,10 @@ #include #include #include -#include #include +#include "ecb_cbc_helpers.h" + #define TWOFISH_PARALLEL_BLOCKS 8 /* 8-way parallel cipher functions */ @@ -37,72 +38,38 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) __twofish_enc_blk_3way(ctx, dst, src, false); } -static const struct common_glue_ctx twofish_enc = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = twofish_ecb_enc_8way } - }, { - .num_blocks = 3, - .fn_u = { .ecb = twofish_enc_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_enc_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = twofish_ecb_dec_8way } - }, { - .num_blocks = 3, - .fn_u = { .ecb = twofish_dec_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_dec_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec_cbc = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .cbc = twofish_cbc_dec_8way } - }, { - .num_blocks = 3, - .fn_u = { .cbc = twofish_dec_blk_cbc_3way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = twofish_dec_blk } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_enc, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS); + ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_enc_8way); + ECB_BLOCK(3, twofish_enc_blk_3way); + ECB_BLOCK(1, twofish_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_dec, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS); + ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_dec_8way); + ECB_BLOCK(3, twofish_dec_blk_3way); + ECB_BLOCK(1, twofish_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); + CBC_WALK_START(req, TF_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(twofish_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); + CBC_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_cbc_dec_8way); + CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way); + CBC_DEC_BLOCK(1, twofish_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg twofish_algs[] = { diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 88252370db0a..d1fdefa5195a 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -5,17 +5,16 @@ * Copyright (c) 2011 Jussi Kivilinna */ -#include #include #include -#include -#include #include #include #include #include #include +#include "ecb_cbc_helpers.h" 
+ EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way); EXPORT_SYMBOL_GPL(twofish_dec_blk_3way); @@ -30,79 +29,48 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) __twofish_enc_blk_3way(ctx, dst, src, false); } -void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s) +void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src) { - u128 ivs[2]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - ivs[0] = src[0]; - ivs[1] = src[1]; + u8 buf[2][TF_BLOCK_SIZE]; + const u8 *s = src; - twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); + if (dst == src) + s = memcpy(buf, src, sizeof(buf)); + twofish_dec_blk_3way(ctx, dst, src); + crypto_xor(dst + TF_BLOCK_SIZE, s, sizeof(buf)); - u128_xor(&dst[1], &dst[1], &ivs[0]); - u128_xor(&dst[2], &dst[2], &ivs[1]); } EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -static const struct common_glue_ctx twofish_enc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .ecb = twofish_enc_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_enc_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .ecb = twofish_dec_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_dec_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .cbc = twofish_dec_blk_cbc_3way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = twofish_dec_blk } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_enc, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, -1); + ECB_BLOCK(3, twofish_enc_blk_3way); + ECB_BLOCK(1, twofish_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_dec, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, -1); + ECB_BLOCK(3, twofish_dec_blk_3way); + ECB_BLOCK(1, twofish_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); + CBC_WALK_START(req, TF_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(twofish_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); + CBC_WALK_START(req, TF_BLOCK_SIZE, -1); + CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way); + CBC_DEC_BLOCK(1, twofish_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg tf_skciphers[] = { -- cgit v1.2.3 From 64ca771cd6bf48bd01f630ad1440ab151d1d19d5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:08 +0100 Subject: crypto: x86 - remove glue helper module All dependencies on the x86 glue helper module have been replaced by local instantiations of the new ECB/CBC preprocessor helper macros, so the glue helper module can be retired. 
Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 - arch/x86/crypto/glue_helper.c | 155 ------------------------------ arch/x86/include/asm/crypto/glue_helper.h | 74 -------------- 3 files changed, 231 deletions(-) delete mode 100644 arch/x86/crypto/glue_helper.c delete mode 100644 arch/x86/include/asm/crypto/glue_helper.h (limited to 'arch') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index a31de0c6ccde..b28e36b7c96b 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -4,8 +4,6 @@ OBJECT_FILES_NON_STANDARD := y -obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o - obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c deleted file mode 100644 index 895d34150c3f..000000000000 --- a/arch/x86/crypto/glue_helper.c +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Shared glue code for 128bit block ciphers - * - * Copyright © 2012-2013 Jussi Kivilinna - * - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu - */ - -#include -#include -#include -#include -#include - -int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - const u8 *src = walk.src.virt.addr; - u8 *dst = walk.dst.virt.addr; - unsigned int func_bytes; - unsigned int i; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); - for (i = 0; i < gctx->num_funcs; i++) { - func_bytes = bsize * gctx->funcs[i].num_blocks; - - if (nbytes < func_bytes) - continue; - - /* Process multi-block batch */ - do { - gctx->funcs[i].fn_u.ecb(ctx, dst, src); - src += func_bytes; - dst += func_bytes; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - break; - } - err = skcipher_walk_done(&walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - return err; -} -EXPORT_SYMBOL_GPL(glue_ecb_req_128bit); - -int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - const u128 *src = (u128 *)walk.src.virt.addr; - u128 *dst = (u128 *)walk.dst.virt.addr; - u128 *iv = (u128 *)walk.iv; - - do { - u128_xor(dst, src, iv); - fn(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - src++; - dst++; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u128 *)walk.iv = *iv; - err = skcipher_walk_done(&walk, nbytes); - } - return err; -} -EXPORT_SYMBOL_GPL(glue_cbc_encrypt_req_128bit); - -int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) 
{ - const u128 *src = walk.src.virt.addr; - u128 *dst = walk.dst.virt.addr; - unsigned int func_bytes, num_blocks; - unsigned int i; - u128 last_iv; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes < func_bytes) - continue; - - /* Process multi-block batch */ - do { - src -= num_blocks - 1; - dst -= num_blocks - 1; - - gctx->funcs[i].fn_u.cbc(ctx, (u8 *)dst, - (const u8 *)src); - - nbytes -= func_bytes; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, --src); - dst--; - } while (nbytes >= func_bytes); - } -done: - u128_xor(dst, dst, (u128 *)walk.iv); - *(u128 *)walk.iv = last_iv; - err = skcipher_walk_done(&walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - return err; -} -EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h deleted file mode 100644 index 23e09efd2aa6..000000000000 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Shared glue code for 128bit block ciphers - */ - -#ifndef _CRYPTO_GLUE_HELPER_H -#define _CRYPTO_GLUE_HELPER_H - -#include -#include -#include - -typedef void (*common_glue_func_t)(const void *ctx, u8 *dst, const u8 *src); -typedef void (*common_glue_cbc_func_t)(const void *ctx, u8 *dst, const u8 *src); - -struct common_glue_func_entry { - unsigned int num_blocks; /* number of blocks that @fn will process */ - union { - common_glue_func_t ecb; - common_glue_cbc_func_t cbc; - } fn_u; -}; - -struct common_glue_ctx { - unsigned int num_funcs; - int fpu_blocks_limit; /* -1 means fpu not needed at all */ - - /* - * First funcs entry must have largest num_blocks and last funcs entry - * must have num_blocks == 1! - */ - struct common_glue_func_entry funcs[]; -}; - -static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, - struct skcipher_walk *walk, - bool fpu_enabled, unsigned int nbytes) -{ - if (likely(fpu_blocks_limit < 0)) - return false; - - if (fpu_enabled) - return true; - - /* - * Vector-registers are only used when chunk to be processed is large - * enough, so do not enable FPU until it is necessary. 
- */ - if (nbytes < bsize * (unsigned int)fpu_blocks_limit) - return false; - - /* prevent sleeping if FPU is in use */ - skcipher_walk_atomise(walk); - - kernel_fpu_begin(); - return true; -} - -static inline void glue_fpu_end(bool fpu_enabled) -{ - if (fpu_enabled) - kernel_fpu_end(); -} - -extern int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req); - -extern int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn, - struct skcipher_request *req); - -extern int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req); - -#endif /* _CRYPTO_GLUE_HELPER_H */ -- cgit v1.2.3 From a04ea6f7ffa27d5825b56cb1591ad0992910992c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2021 17:48:09 +0100 Subject: crypto: x86 - use local headers for x86 specific shared declarations The Camellia, Serpent and Twofish related header files only contain declarations that are shared between different implementations of the respective algorithms residing under arch/x86/crypto, and none of their contents should be used elsewhere. So move the header files into the same location, and use local #includes instead. Acked-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia.h | 67 ++++++++++++++++++++++++++++++ arch/x86/crypto/camellia_aesni_avx2_glue.c | 2 +- arch/x86/crypto/camellia_aesni_avx_glue.c | 2 +- arch/x86/crypto/camellia_glue.c | 2 +- arch/x86/crypto/serpent-avx.h | 21 ++++++++++ arch/x86/crypto/serpent-sse2.h | 60 ++++++++++++++++++++++++++ arch/x86/crypto/serpent_avx2_glue.c | 2 +- arch/x86/crypto/serpent_avx_glue.c | 2 +- arch/x86/crypto/serpent_sse2_glue.c | 2 +- arch/x86/crypto/twofish.h | 21 ++++++++++ arch/x86/crypto/twofish_avx_glue.c | 2 +- arch/x86/crypto/twofish_glue_3way.c | 2 +- arch/x86/include/asm/crypto/camellia.h | 67 ------------------------------ arch/x86/include/asm/crypto/serpent-avx.h | 21 ---------- arch/x86/include/asm/crypto/serpent-sse2.h | 60 -------------------------- arch/x86/include/asm/crypto/twofish.h | 21 ---------- 16 files changed, 177 insertions(+), 177 deletions(-) create mode 100644 arch/x86/crypto/camellia.h create mode 100644 arch/x86/crypto/serpent-avx.h create mode 100644 arch/x86/crypto/serpent-sse2.h create mode 100644 arch/x86/crypto/twofish.h delete mode 100644 arch/x86/include/asm/crypto/camellia.h delete mode 100644 arch/x86/include/asm/crypto/serpent-avx.h delete mode 100644 arch/x86/include/asm/crypto/serpent-sse2.h delete mode 100644 arch/x86/include/asm/crypto/twofish.h (limited to 'arch') diff --git a/arch/x86/crypto/camellia.h b/arch/x86/crypto/camellia.h new file mode 100644 index 000000000000..1dcea79e8f8e --- /dev/null +++ b/arch/x86/crypto/camellia.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_X86_CAMELLIA_H +#define ASM_X86_CAMELLIA_H + +#include +#include +#include + +#define CAMELLIA_MIN_KEY_SIZE 16 +#define CAMELLIA_MAX_KEY_SIZE 32 +#define CAMELLIA_BLOCK_SIZE 16 +#define CAMELLIA_TABLE_BYTE_LEN 272 +#define CAMELLIA_PARALLEL_BLOCKS 2 + +struct crypto_skcipher; + +struct camellia_ctx { + u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; + u32 key_length; +}; + +extern int __camellia_setkey(struct camellia_ctx *cctx, + const unsigned char *key, + unsigned int key_len); + +/* regular block cipher functions */ +asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src); + +/* 
2-way parallel cipher functions */ +asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src); + +/* 16-way parallel cipher functions (avx/aes-ni) */ +asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); + +asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); + +static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src) +{ + __camellia_enc_blk(ctx, dst, src, false); +} + +static inline void camellia_enc_blk_xor(const void *ctx, u8 *dst, const u8 *src) +{ + __camellia_enc_blk(ctx, dst, src, true); +} + +static inline void camellia_enc_blk_2way(const void *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk_2way(ctx, dst, src, false); +} + +static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk_2way(ctx, dst, src, true); +} + +/* glue helpers */ +extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src); + +#endif /* ASM_X86_CAMELLIA_H */ diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index ef5c0f094584..e7e4d64e9577 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -5,7 +5,6 @@ * Copyright © 2013 Jussi Kivilinna */ -#include #include #include #include @@ -13,6 +12,7 @@ #include #include +#include "camellia.h" #include "ecb_cbc_helpers.h" #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 68fed0a79889..c7ccf63e741e 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -5,7 +5,6 @@ * Copyright © 2012-2013 Jussi Kivilinna */ -#include #include #include #include @@ -13,6 +12,7 @@ #include #include +#include "camellia.h" #include "ecb_cbc_helpers.h" #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 0bc00ce68484..66c435ba9d3d 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -14,8 +14,8 @@ #include #include #include -#include +#include "camellia.h" #include "ecb_cbc_helpers.h" /* regular block cipher functions */ diff --git a/arch/x86/crypto/serpent-avx.h b/arch/x86/crypto/serpent-avx.h new file mode 100644 index 000000000000..23f3361a0e72 --- /dev/null +++ b/arch/x86/crypto/serpent-avx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_X86_SERPENT_AVX_H +#define ASM_X86_SERPENT_AVX_H + +#include +#include +#include + +struct crypto_skcipher; + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, + const u8 *src); + +asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, + const u8 *src); + +#endif diff --git a/arch/x86/crypto/serpent-sse2.h b/arch/x86/crypto/serpent-sse2.h new file mode 100644 index 000000000000..860ca248914b --- /dev/null +++ b/arch/x86/crypto/serpent-sse2.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_X86_SERPENT_SSE2_H +#define ASM_X86_SERPENT_SSE2_H + +#include +#include + +#ifdef CONFIG_X86_32 + +#define SERPENT_PARALLEL_BLOCKS 4 + +asmlinkage void 
__serpent_enc_blk_4way(const struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_4way(const struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx, + u8 *dst, const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src) +{ + serpent_dec_blk_4way(ctx, dst, src); +} + +#else + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way(const struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way(const struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx, + u8 *dst, const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src) +{ + serpent_dec_blk_8way(ctx, dst, src); +} + +#endif + +#endif diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 261c9ac2d762..ccf0b5fa4933 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -12,8 +12,8 @@ #include #include #include -#include +#include "serpent-avx.h" #include "ecb_cbc_helpers.h" #define SERPENT_AVX2_PARALLEL_BLOCKS 16 diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 5fe01d2a5b1d..6c248e1ea4ef 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -15,8 +15,8 @@ #include #include #include -#include +#include "serpent-avx.h" #include "ecb_cbc_helpers.h" /* 8-way parallel cipher functions */ diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index e28d60949c16..d78f37e9b2cf 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -20,8 +20,8 @@ #include #include #include -#include +#include "serpent-sse2.h" #include "ecb_cbc_helpers.h" static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, diff --git a/arch/x86/crypto/twofish.h b/arch/x86/crypto/twofish.h new file mode 100644 index 000000000000..12df400e6d53 --- /dev/null +++ b/arch/x86/crypto/twofish.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_X86_TWOFISH_H +#define ASM_X86_TWOFISH_H + +#include +#include +#include + +/* regular block cipher functions from twofish_x86_64 module */ +asmlinkage void twofish_enc_blk(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void twofish_dec_blk(const void *ctx, u8 *dst, const u8 *src); + +/* 3-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src); + +/* helpers from twofish_x86_64-3way module */ +extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src); + +#endif /* ASM_X86_TWOFISH_H */ diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 6ce198f808a5..3eb3440b477a 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -15,8 +15,8 @@ #include 
#include #include -#include +#include "twofish.h" #include "ecb_cbc_helpers.h" #define TWOFISH_PARALLEL_BLOCKS 8 diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index d1fdefa5195a..03725696397c 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -5,7 +5,6 @@ * Copyright (c) 2011 Jussi Kivilinna */ -#include #include #include #include @@ -13,6 +12,7 @@ #include #include +#include "twofish.h" #include "ecb_cbc_helpers.h" EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way); diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h deleted file mode 100644 index 1dcea79e8f8e..000000000000 --- a/arch/x86/include/asm/crypto/camellia.h +++ /dev/null @@ -1,67 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_CAMELLIA_H -#define ASM_X86_CAMELLIA_H - -#include -#include -#include - -#define CAMELLIA_MIN_KEY_SIZE 16 -#define CAMELLIA_MAX_KEY_SIZE 32 -#define CAMELLIA_BLOCK_SIZE 16 -#define CAMELLIA_TABLE_BYTE_LEN 272 -#define CAMELLIA_PARALLEL_BLOCKS 2 - -struct crypto_skcipher; - -struct camellia_ctx { - u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; - u32 key_length; -}; - -extern int __camellia_setkey(struct camellia_ctx *cctx, - const unsigned char *key, - unsigned int key_len); - -/* regular block cipher functions */ -asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, - bool xor); -asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src); - -/* 2-way parallel cipher functions */ -asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src, - bool xor); -asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src); - -/* 16-way parallel cipher functions (avx/aes-ni) */ -asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); - -asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); - -static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src) -{ - __camellia_enc_blk(ctx, dst, src, false); -} - -static inline void camellia_enc_blk_xor(const void *ctx, u8 *dst, const u8 *src) -{ - __camellia_enc_blk(ctx, dst, src, true); -} - -static inline void camellia_enc_blk_2way(const void *ctx, u8 *dst, - const u8 *src) -{ - __camellia_enc_blk_2way(ctx, dst, src, false); -} - -static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst, - const u8 *src) -{ - __camellia_enc_blk_2way(ctx, dst, src, true); -} - -/* glue helpers */ -extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src); - -#endif /* ASM_X86_CAMELLIA_H */ diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h deleted file mode 100644 index 23f3361a0e72..000000000000 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_SERPENT_AVX_H -#define ASM_X86_SERPENT_AVX_H - -#include -#include -#include - -struct crypto_skcipher; - -#define SERPENT_PARALLEL_BLOCKS 8 - -asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, - const u8 *src); -asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, - const u8 *src); - -asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, - const u8 *src); - -#endif diff --git a/arch/x86/include/asm/crypto/serpent-sse2.h b/arch/x86/include/asm/crypto/serpent-sse2.h deleted file 
mode 100644 index 860ca248914b..000000000000 --- a/arch/x86/include/asm/crypto/serpent-sse2.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_SERPENT_SSE2_H -#define ASM_X86_SERPENT_SSE2_H - -#include -#include - -#ifdef CONFIG_X86_32 - -#define SERPENT_PARALLEL_BLOCKS 4 - -asmlinkage void __serpent_enc_blk_4way(const struct serpent_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_4way(const struct serpent_ctx *ctx, u8 *dst, - const u8 *src); - -static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src) -{ - __serpent_enc_blk_4way(ctx, dst, src, false); -} - -static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx, - u8 *dst, const u8 *src) -{ - __serpent_enc_blk_4way(ctx, dst, src, true); -} - -static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src) -{ - serpent_dec_blk_4way(ctx, dst, src); -} - -#else - -#define SERPENT_PARALLEL_BLOCKS 8 - -asmlinkage void __serpent_enc_blk_8way(const struct serpent_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_8way(const struct serpent_ctx *ctx, u8 *dst, - const u8 *src); - -static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src) -{ - __serpent_enc_blk_8way(ctx, dst, src, false); -} - -static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx, - u8 *dst, const u8 *src) -{ - __serpent_enc_blk_8way(ctx, dst, src, true); -} - -static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src) -{ - serpent_dec_blk_8way(ctx, dst, src); -} - -#endif - -#endif diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h deleted file mode 100644 index 12df400e6d53..000000000000 --- a/arch/x86/include/asm/crypto/twofish.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_TWOFISH_H -#define ASM_X86_TWOFISH_H - -#include -#include -#include - -/* regular block cipher functions from twofish_x86_64 module */ -asmlinkage void twofish_enc_blk(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void twofish_dec_blk(const void *ctx, u8 *dst, const u8 *src); - -/* 3-way parallel cipher functions */ -asmlinkage void __twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src, - bool xor); -asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src); - -/* helpers from twofish_x86_64-3way module */ -extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src); - -#endif /* ASM_X86_TWOFISH_H */ -- cgit v1.2.3 From 0df07d8117c3576f1603b05b84089742a118d10a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 14 Jan 2021 19:10:10 +0100 Subject: crypto: arm64/sha - add missing module aliases The accelerated, instruction based implementations of SHA1, SHA2 and SHA3 are autoloaded based on CPU capabilities, given that the code is modest in size, and widely used, which means that resolving the algo name, loading all compatible modules and picking the one with the highest priority is taken to be suboptimal. However, if these algorithms are requested before this CPU feature based matching and autoloading occurs, these modules are not even considered, and we end up with suboptimal performance. So add the missing module aliases for the various SHA implementations. 
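
[Editorial note] For context, the crypto API autoloads implementations roughly by issuing request_module("crypto-%s", name) when an algorithm is first requested, and MODULE_ALIAS_CRYPTO() records the matching alias so modprobe can locate the module before the CPU-feature based matching ever runs. A minimal hedged sketch of the pattern (the mydrv naming and comment wording are illustrative, not taken from the patch):

#include <linux/module.h>
#include <linux/crypto.h>

/*
 * MODULE_ALIAS_CRYPTO("sha256") records both "sha256" and the collision-safe
 * "crypto-sha256" alias, so a request for the sha256 algorithm can trigger
 * autoloading of this module (mydrv) even when it has not been loaded yet.
 */
MODULE_ALIAS_CRYPTO("sha256");
MODULE_DESCRIPTION("mydrv: placeholder SHA-256 driver used only as an example");
MODULE_LICENSE("GPL v2");
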
Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/sha1-ce-glue.c | 1 + arch/arm64/crypto/sha2-ce-glue.c | 2 ++ arch/arm64/crypto/sha3-ce-glue.c | 4 ++++ arch/arm64/crypto/sha512-ce-glue.c | 2 ++ 4 files changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index c93121bcfdeb..c1362861765f 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -19,6 +19,7 @@ MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha1"); struct sha1_ce_state { struct sha1_state sst; diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 31ba3da5e61b..ded3a6488f81 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -19,6 +19,8 @@ MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha224"); +MODULE_ALIAS_CRYPTO("sha256"); struct sha256_ce_state { struct sha256_state sst; diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index e5a2936f0886..7288d3046354 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -23,6 +23,10 @@ MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha3-224"); +MODULE_ALIAS_CRYPTO("sha3-256"); +MODULE_ALIAS_CRYPTO("sha3-384"); +MODULE_ALIAS_CRYPTO("sha3-512"); asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks, int md_len); diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index faa83f6cf376..a6b1adf31c56 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -23,6 +23,8 @@ MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha384"); +MODULE_ALIAS_CRYPTO("sha512"); asmlinkage void sha512_ce_transform(struct sha512_state *sst, u8 const *src, int blocks); -- cgit v1.2.3 From 64a49b85953cafeaba2b4c2c13d089b3ed41cca6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 16 Jan 2021 17:48:09 +0100 Subject: crypto: aesni - replace CTR function pointer with static call Indirect calls are very expensive on x86, so use a static call to set the system-wide AES-NI/CTR asm helper. 
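Reduced to a sketch, the pattern looks as follows (the helper names below are made up; the patch itself wires up aesni_ctr_enc and aesni_ctr_enc_avx_tfm):

#include <linux/init.h>
#include <linux/static_call.h>
#include <linux/types.h>

static void ctr_helper_generic(const void *ctx, u8 *out, const u8 *in,
			       unsigned int len, u8 *iv)
{
	/* baseline implementation would go here */
}

static void ctr_helper_avx(const void *ctx, u8 *out, const u8 *in,
			   unsigned int len, u8 *iv)
{
	/* AVX "by8" implementation would go here */
}

/* Default target; static_call() is patched into a direct call, so the fast
 * path takes no retpoline-penalized indirect branch.
 */
DEFINE_STATIC_CALL(ctr_helper, ctr_helper_generic);

static void ctr_crypt_blocks(const void *ctx, u8 *dst, const u8 *src,
			     unsigned int len, u8 *iv)
{
	static_call(ctr_helper)(ctx, dst, src, len, iv);
}

static void __init ctr_select_impl(bool have_avx)
{
	/* one-time retargeting at init, mirroring the AVX feature check */
	if (have_avx)
		static_call_update(ctr_helper, ctr_helper_avx);
}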
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index a548fdbc3073..d96685457196 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -34,6 +34,7 @@ #include #include #include +#include #define AESNI_ALIGN 16 @@ -107,10 +108,9 @@ asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, #ifdef CONFIG_X86_64 -static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); /* Scatter / Gather routines, with args similar to above */ asmlinkage void aesni_gcm_init(void *ctx, @@ -520,8 +520,10 @@ static int ctr_crypt(struct skcipher_request *req) kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, walk.iv); + static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, + walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, + walk.iv); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } @@ -1160,10 +1162,9 @@ static int __init aesni_init(void) } else { pr_info("SSE version of gcm_enc/dec engaged.\n"); } - aesni_ctr_enc_tfm = aesni_ctr_enc; if (boot_cpu_has(X86_FEATURE_AVX)) { /* optimize performance of ctr mode encryption transform */ - aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; + static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); pr_info("AES CTR mode by8 optimization enabled\n"); } #endif -- cgit v1.2.3 From 65d1e3c415f6e380f6168faf333a59ec235eac5d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 16 Jan 2021 17:48:10 +0100 Subject: crypto: aesni - release FPU during skcipher walk API calls Taking ownership of the FPU in kernel mode disables preemption, and this may result in excessive scheduling blackouts if the size of the data being processed on the FPU is unbounded. Given that taking and releasing the FPU is cheap these days on x86, we can limit the impact of this issue easily for skcipher implementations, by moving the FPU begin/end calls inside the skcipher walk processing loop. Considering that skcipher walks operate on at most one page at a time, doing so fully mitigates this issue. This also permits the skcipher walk logic to use non-atomic kmalloc() calls etc so we can change the 'atomic' bool argument in the calls to skcipher_walk_virt() to false as well. 
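Condensed, each mode handler ends up with the shape sketched below (not the exact driver code; the call into the AES-NI asm helper is elided):

#include <crypto/aes.h>
#include <crypto/internal/skcipher.h>
#include <asm/fpu/api.h>

static int ecb_crypt_sketch(struct skcipher_request *req)
{
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	/* 'false': the walk may now sleep, since the FPU is not held here */
	err = skcipher_walk_virt(&walk, req, false);

	while ((nbytes = walk.nbytes)) {
		kernel_fpu_begin();
		/* process the full AES blocks of this chunk with the asm helper */
		kernel_fpu_end();
		/* hand any partial tail block back to the walk */
		err = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE);
	}
	return err;
}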
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 73 +++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index d96685457196..2144e54a6c89 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -283,16 +283,16 @@ static int ecb_encrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -305,16 +305,16 @@ static int ecb_decrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -327,16 +327,16 @@ static int cbc_encrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -349,16 +349,16 @@ static int cbc_decrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -476,21 +476,6 @@ static int cts_cbc_decrypt(struct skcipher_request *req) } #ifdef CONFIG_X86_64 -static void ctr_crypt_final(struct crypto_aes_ctx *ctx, - struct skcipher_walk *walk) -{ - u8 *ctrblk = walk->iv; - u8 keystream[AES_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - aesni_enc(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, AES_BLOCK_SIZE); -} - static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv) { @@ -512,27 +497,33 @@ static int ctr_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + u8 keystream[AES_BLOCK_SIZE]; struct skcipher_walk walk; unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); - while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, - walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, - walk.iv); - 
nbytes &= AES_BLOCK_SIZE - 1; + while ((nbytes = walk.nbytes) > 0) { + kernel_fpu_begin(); + if (nbytes & AES_BLOCK_MASK) + static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, + walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, + walk.iv); + nbytes &= ~AES_BLOCK_MASK; + + if (walk.nbytes == walk.total && nbytes > 0) { + aesni_enc(ctx, keystream, walk.iv); + crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - nbytes, + walk.src.virt.addr + walk.nbytes - nbytes, + keystream, nbytes); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + nbytes = 0; + } + kernel_fpu_end(); err = skcipher_walk_done(&walk, nbytes); } - if (walk.nbytes) { - ctr_crypt_final(ctx, &walk); - err = skcipher_walk_done(&walk, 0); - } - kernel_fpu_end(); - return err; } -- cgit v1.2.3 From 578f23d359bf7c988b1c9026d4711de7112b0c1c Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 2 Feb 2021 11:17:30 +0800 Subject: crypto: powerpc/sha256 - remove unneeded semicolon Eliminate the following coccicheck warning: ./arch/powerpc/crypto/sha256-spe-glue.c:132:2-3: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Herbert Xu --- arch/powerpc/crypto/sha256-spe-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c index a6e650a97d8f..ffedea7e4bef 100644 --- a/arch/powerpc/crypto/sha256-spe-glue.c +++ b/arch/powerpc/crypto/sha256-spe-glue.c @@ -129,7 +129,7 @@ static int ppc_spe_sha256_update(struct shash_desc *desc, const u8 *data, src += bytes; len -= bytes; - }; + } memcpy((char *)sctx->buf, src, len); return 0; -- cgit v1.2.3 From 5a69e1b73d5460953b8198ab03e9e1c86c5aeb11 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Feb 2021 12:36:19 +0100 Subject: crypto: arm64/sha1-ce - simplify NEON yield Instead of calling into kernel_neon_end() and kernel_neon_begin() (and potentially into schedule()) from the assembler code when running in task mode and a reschedule is pending, perform only the preempt count check in assembler, but simply return early in this case, and let the C code deal with the consequences. This reverts commit 7df8d164753e6e6f229b72767595072bc6a71f48. 
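Condensed, the new division of labour is: the asm routine only checks the preempt count (via cond_yield) and returns the number of blocks it did not process, while the C glue loops, dropping and re-taking the NEON unit in between so the scheduler gets a chance to run. A sketch of the glue-side loop, assuming the patched prototype (sha1_ce_state is the driver-private state type from the glue file):

#include <linux/linkage.h>
#include <asm/neon.h>
#include <crypto/sha1.h>

/* patched prototype: returns how many blocks were NOT processed */
asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
				 int blocks);

static void sha1_do_blocks(struct sha1_ce_state *sst, u8 const *src,
			   int blocks)
{
	while (blocks) {
		int rem;

		kernel_neon_begin();
		rem = sha1_ce_transform(sst, src, blocks);
		kernel_neon_end();	/* preemption point */

		src += (blocks - rem) * SHA1_BLOCK_SIZE;
		blocks = rem;
	}
}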
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/sha1-ce-core.S | 47 +++++++++++++++------------------------- arch/arm64/crypto/sha1-ce-glue.c | 22 ++++++++++--------- 2 files changed, 29 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index 92d0d2753e81..8c02bbc2684e 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -62,40 +62,34 @@ .endm /* - * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - * int blocks) + * int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + * int blocks) */ SYM_FUNC_START(sha1_ce_transform) - frame_push 3 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - /* load round constants */ -0: loadrc k0.4s, 0x5a827999, w6 + loadrc k0.4s, 0x5a827999, w6 loadrc k1.4s, 0x6ed9eba1, w6 loadrc k2.4s, 0x8f1bbcdc, w6 loadrc k3.4s, 0xca62c1d6, w6 /* load state */ - ld1 {dgav.4s}, [x19] - ldr dgb, [x19, #16] + ld1 {dgav.4s}, [x0] + ldr dgb, [x0, #16] /* load sha1_ce_state::finalize */ ldr_l w4, sha1_ce_offsetof_finalize, x4 - ldr w4, [x19, x4] + ldr w4, [x0, x4] /* load input */ -1: ld1 {v8.4s-v11.4s}, [x20], #64 - sub w21, w21, #1 +0: ld1 {v8.4s-v11.4s}, [x1], #64 + sub w2, w2, #1 CPU_LE( rev32 v8.16b, v8.16b ) CPU_LE( rev32 v9.16b, v9.16b ) CPU_LE( rev32 v10.16b, v10.16b ) CPU_LE( rev32 v11.16b, v11.16b ) -2: add t0.4s, v8.4s, k0.4s +1: add t0.4s, v8.4s, k0.4s mov dg0v.16b, dgav.16b add_update c, ev, k0, 8, 9, 10, 11, dgb @@ -126,25 +120,18 @@ CPU_LE( rev32 v11.16b, v11.16b ) add dgbv.2s, dgbv.2s, dg1v.2s add dgav.4s, dgav.4s, dg0v.4s - cbz w21, 3f - - if_will_cond_yield_neon - st1 {dgav.4s}, [x19] - str dgb, [x19, #16] - do_cond_yield_neon + cbz w2, 2f + cond_yield 3f, x5 b 0b - endif_yield_neon - - b 1b /* * Final block: add padding and total bit count. * Skip if the input size was not a round multiple of the block size, * the padding is handled by the C code in that case. 
*/ -3: cbz x4, 4f +2: cbz x4, 3f ldr_l w4, sha1_ce_offsetof_count, x4 - ldr x4, [x19, x4] + ldr x4, [x0, x4] movi v9.2d, #0 mov x8, #0x80000000 movi v10.2d, #0 @@ -153,11 +140,11 @@ CPU_LE( rev32 v11.16b, v11.16b ) mov x4, #0 mov v11.d[0], xzr mov v11.d[1], x7 - b 2b + b 1b /* store new state */ -4: st1 {dgav.4s}, [x19] - str dgb, [x19, #16] - frame_pop +3: st1 {dgav.4s}, [x0] + str dgb, [x0, #16] + mov w0, w2 ret SYM_FUNC_END(sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index c1362861765f..71fa4f1122d7 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -29,14 +29,22 @@ struct sha1_ce_state { extern const u32 sha1_ce_offsetof_count; extern const u32 sha1_ce_offsetof_finalize; -asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - int blocks); +asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + int blocks); static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src, int blocks) { - sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src, - blocks); + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha1_ce_transform(container_of(sst, struct sha1_ce_state, + sst), src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA1_BLOCK_SIZE; + blocks = rem; + } } const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); @@ -51,9 +59,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, return crypto_sha1_update(desc, data, len); sctx->finalize = 0; - kernel_neon_begin(); sha1_base_do_update(desc, data, len, __sha1_ce_transform); - kernel_neon_end(); return 0; } @@ -73,11 +79,9 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, */ sctx->finalize = finalize; - kernel_neon_begin(); sha1_base_do_update(desc, data, len, __sha1_ce_transform); if (!finalize) sha1_base_do_finalize(desc, __sha1_ce_transform); - kernel_neon_end(); return sha1_base_finish(desc, out); } @@ -89,9 +93,7 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out) return crypto_sha1_finup(desc, NULL, 0, out); sctx->finalize = 0; - kernel_neon_begin(); sha1_base_do_finalize(desc, __sha1_ce_transform); - kernel_neon_end(); return sha1_base_finish(desc, out); } -- cgit v1.2.3 From b2eadbf40e8f82279f145aa841727b2e01f7dc1d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Feb 2021 12:36:20 +0100 Subject: crypto: arm64/sha2-ce - simplify NEON yield Instead of calling into kernel_neon_end() and kernel_neon_begin() (and potentially into schedule()) from the assembler code when running in task mode and a reschedule is pending, perform only the preempt count check in assembler, but simply return early in this case, and let the C code deal with the consequences. This reverts commit d82f37ab5e2426287013eba38b1212e8b71e5be3. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/sha2-ce-core.S | 38 +++++++++++++------------------------- arch/arm64/crypto/sha2-ce-glue.c | 22 ++++++++++++---------- 2 files changed, 25 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S index 3f9d0f326987..6cdea7d56059 100644 --- a/arch/arm64/crypto/sha2-ce-core.S +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -76,36 +76,30 @@ */ .text SYM_FUNC_START(sha2_ce_transform) - frame_push 3 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - /* load round constants */ -0: adr_l x8, .Lsha2_rcon + adr_l x8, .Lsha2_rcon ld1 { v0.4s- v3.4s}, [x8], #64 ld1 { v4.4s- v7.4s}, [x8], #64 ld1 { v8.4s-v11.4s}, [x8], #64 ld1 {v12.4s-v15.4s}, [x8] /* load state */ - ld1 {dgav.4s, dgbv.4s}, [x19] + ld1 {dgav.4s, dgbv.4s}, [x0] /* load sha256_ce_state::finalize */ ldr_l w4, sha256_ce_offsetof_finalize, x4 - ldr w4, [x19, x4] + ldr w4, [x0, x4] /* load input */ -1: ld1 {v16.4s-v19.4s}, [x20], #64 - sub w21, w21, #1 +0: ld1 {v16.4s-v19.4s}, [x1], #64 + sub w2, w2, #1 CPU_LE( rev32 v16.16b, v16.16b ) CPU_LE( rev32 v17.16b, v17.16b ) CPU_LE( rev32 v18.16b, v18.16b ) CPU_LE( rev32 v19.16b, v19.16b ) -2: add t0.4s, v16.4s, v0.4s +1: add t0.4s, v16.4s, v0.4s mov dg0v.16b, dgav.16b mov dg1v.16b, dgbv.16b @@ -134,24 +128,18 @@ CPU_LE( rev32 v19.16b, v19.16b ) add dgbv.4s, dgbv.4s, dg1v.4s /* handled all input blocks? */ - cbz w21, 3f - - if_will_cond_yield_neon - st1 {dgav.4s, dgbv.4s}, [x19] - do_cond_yield_neon + cbz w2, 2f + cond_yield 3f, x5 b 0b - endif_yield_neon - - b 1b /* * Final block: add padding and total bit count. * Skip if the input size was not a round multiple of the block size, * the padding is handled by the C code in that case. 
*/ -3: cbz x4, 4f +2: cbz x4, 3f ldr_l w4, sha256_ce_offsetof_count, x4 - ldr x4, [x19, x4] + ldr x4, [x0, x4] movi v17.2d, #0 mov x8, #0x80000000 movi v18.2d, #0 @@ -160,10 +148,10 @@ CPU_LE( rev32 v19.16b, v19.16b ) mov x4, #0 mov v19.d[0], xzr mov v19.d[1], x7 - b 2b + b 1b /* store new state */ -4: st1 {dgav.4s, dgbv.4s}, [x19] - frame_pop +3: st1 {dgav.4s, dgbv.4s}, [x0] + mov w0, w2 ret SYM_FUNC_END(sha2_ce_transform) diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index ded3a6488f81..c57a6119fefc 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -30,14 +30,22 @@ struct sha256_ce_state { extern const u32 sha256_ce_offsetof_count; extern const u32 sha256_ce_offsetof_finalize; -asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, - int blocks); +asmlinkage int sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, + int blocks); static void __sha2_ce_transform(struct sha256_state *sst, u8 const *src, int blocks) { - sha2_ce_transform(container_of(sst, struct sha256_ce_state, sst), src, - blocks); + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha2_ce_transform(container_of(sst, struct sha256_ce_state, + sst), src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA256_BLOCK_SIZE; + blocks = rem; + } } const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, @@ -63,9 +71,7 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data, __sha256_block_data_order); sctx->finalize = 0; - kernel_neon_begin(); sha256_base_do_update(desc, data, len, __sha2_ce_transform); - kernel_neon_end(); return 0; } @@ -90,11 +96,9 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, */ sctx->finalize = finalize; - kernel_neon_begin(); sha256_base_do_update(desc, data, len, __sha2_ce_transform); if (!finalize) sha256_base_do_finalize(desc, __sha2_ce_transform); - kernel_neon_end(); return sha256_base_finish(desc, out); } @@ -108,9 +112,7 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out) } sctx->finalize = 0; - kernel_neon_begin(); sha256_base_do_finalize(desc, __sha2_ce_transform); - kernel_neon_end(); return sha256_base_finish(desc, out); } -- cgit v1.2.3 From 9ecc9f31d0a43d538d80f51debfb25d75da44892 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Feb 2021 12:36:21 +0100 Subject: crypto: arm64/sha3-ce - simplify NEON yield Instead of calling into kernel_neon_end() and kernel_neon_begin() (and potentially into schedule()) from the assembler code when running in task mode and a reschedule is pending, perform only the preempt count check in assembler, but simply return early in this case, and let the C code deal with the consequences. This reverts commit 7edc86cb1c18b4c274672232117586ea2bef1d9a. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/sha3-ce-core.S | 81 +++++++++++++++------------------------- arch/arm64/crypto/sha3-ce-glue.c | 14 ++++--- 2 files changed, 39 insertions(+), 56 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S index 1cfb768df350..6f5208414fe3 100644 --- a/arch/arm64/crypto/sha3-ce-core.S +++ b/arch/arm64/crypto/sha3-ce-core.S @@ -37,20 +37,13 @@ .endm /* - * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) + * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) */ .text SYM_FUNC_START(sha3_ce_transform) - frame_push 4 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - -0: /* load state */ - add x8, x19, #32 - ld1 { v0.1d- v3.1d}, [x19] + /* load state */ + add x8, x0, #32 + ld1 { v0.1d- v3.1d}, [x0] ld1 { v4.1d- v7.1d}, [x8], #32 ld1 { v8.1d-v11.1d}, [x8], #32 ld1 {v12.1d-v15.1d}, [x8], #32 @@ -58,13 +51,13 @@ SYM_FUNC_START(sha3_ce_transform) ld1 {v20.1d-v23.1d}, [x8], #32 ld1 {v24.1d}, [x8] -1: sub w21, w21, #1 +0: sub w2, w2, #1 mov w8, #24 adr_l x9, .Lsha3_rcon /* load input */ - ld1 {v25.8b-v28.8b}, [x20], #32 - ld1 {v29.8b-v31.8b}, [x20], #24 + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v31.8b}, [x1], #24 eor v0.8b, v0.8b, v25.8b eor v1.8b, v1.8b, v26.8b eor v2.8b, v2.8b, v27.8b @@ -73,10 +66,10 @@ SYM_FUNC_START(sha3_ce_transform) eor v5.8b, v5.8b, v30.8b eor v6.8b, v6.8b, v31.8b - tbnz x22, #6, 3f // SHA3-512 + tbnz x3, #6, 2f // SHA3-512 - ld1 {v25.8b-v28.8b}, [x20], #32 - ld1 {v29.8b-v30.8b}, [x20], #16 + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v30.8b}, [x1], #16 eor v7.8b, v7.8b, v25.8b eor v8.8b, v8.8b, v26.8b eor v9.8b, v9.8b, v27.8b @@ -84,34 +77,34 @@ SYM_FUNC_START(sha3_ce_transform) eor v11.8b, v11.8b, v29.8b eor v12.8b, v12.8b, v30.8b - tbnz x22, #4, 2f // SHA3-384 or SHA3-224 + tbnz x3, #4, 1f // SHA3-384 or SHA3-224 // SHA3-256 - ld1 {v25.8b-v28.8b}, [x20], #32 + ld1 {v25.8b-v28.8b}, [x1], #32 eor v13.8b, v13.8b, v25.8b eor v14.8b, v14.8b, v26.8b eor v15.8b, v15.8b, v27.8b eor v16.8b, v16.8b, v28.8b - b 4f + b 3f -2: tbz x22, #2, 4f // bit 2 cleared? SHA-384 +1: tbz x3, #2, 3f // bit 2 cleared? 
SHA-384 // SHA3-224 - ld1 {v25.8b-v28.8b}, [x20], #32 - ld1 {v29.8b}, [x20], #8 + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b}, [x1], #8 eor v13.8b, v13.8b, v25.8b eor v14.8b, v14.8b, v26.8b eor v15.8b, v15.8b, v27.8b eor v16.8b, v16.8b, v28.8b eor v17.8b, v17.8b, v29.8b - b 4f + b 3f // SHA3-512 -3: ld1 {v25.8b-v26.8b}, [x20], #16 +2: ld1 {v25.8b-v26.8b}, [x1], #16 eor v7.8b, v7.8b, v25.8b eor v8.8b, v8.8b, v26.8b -4: sub w8, w8, #1 +3: sub w8, w8, #1 eor3 v29.16b, v4.16b, v9.16b, v14.16b eor3 v26.16b, v1.16b, v6.16b, v11.16b @@ -190,33 +183,19 @@ SYM_FUNC_START(sha3_ce_transform) eor v0.16b, v0.16b, v31.16b - cbnz w8, 4b - cbz w21, 5f - - if_will_cond_yield_neon - add x8, x19, #32 - st1 { v0.1d- v3.1d}, [x19] - st1 { v4.1d- v7.1d}, [x8], #32 - st1 { v8.1d-v11.1d}, [x8], #32 - st1 {v12.1d-v15.1d}, [x8], #32 - st1 {v16.1d-v19.1d}, [x8], #32 - st1 {v20.1d-v23.1d}, [x8], #32 - st1 {v24.1d}, [x8] - do_cond_yield_neon - b 0b - endif_yield_neon - - b 1b + cbnz w8, 3b + cond_yield 3f, x8 + cbnz w2, 0b /* save state */ -5: st1 { v0.1d- v3.1d}, [x19], #32 - st1 { v4.1d- v7.1d}, [x19], #32 - st1 { v8.1d-v11.1d}, [x19], #32 - st1 {v12.1d-v15.1d}, [x19], #32 - st1 {v16.1d-v19.1d}, [x19], #32 - st1 {v20.1d-v23.1d}, [x19], #32 - st1 {v24.1d}, [x19] - frame_pop +3: st1 { v0.1d- v3.1d}, [x0], #32 + st1 { v4.1d- v7.1d}, [x0], #32 + st1 { v8.1d-v11.1d}, [x0], #32 + st1 {v12.1d-v15.1d}, [x0], #32 + st1 {v16.1d-v19.1d}, [x0], #32 + st1 {v20.1d-v23.1d}, [x0], #32 + st1 {v24.1d}, [x0] + mov w0, w2 ret SYM_FUNC_END(sha3_ce_transform) diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index 7288d3046354..8c65cecf560a 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -28,8 +28,8 @@ MODULE_ALIAS_CRYPTO("sha3-256"); MODULE_ALIAS_CRYPTO("sha3-384"); MODULE_ALIAS_CRYPTO("sha3-512"); -asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks, - int md_len); +asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks, + int md_len); static int sha3_update(struct shash_desc *desc, const u8 *data, unsigned int len) @@ -59,11 +59,15 @@ static int sha3_update(struct shash_desc *desc, const u8 *data, blocks = len / sctx->rsiz; len %= sctx->rsiz; - if (blocks) { + while (blocks) { + int rem; + kernel_neon_begin(); - sha3_ce_transform(sctx->st, data, blocks, digest_size); + rem = sha3_ce_transform(sctx->st, data, blocks, + digest_size); kernel_neon_end(); - data += blocks * sctx->rsiz; + data += (blocks - rem) * sctx->rsiz; + blocks = rem; } } -- cgit v1.2.3 From 5f6cb2e6176815cf631593eb7a94a2725d8528e5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Feb 2021 12:36:22 +0100 Subject: crypto: arm64/sha512-ce - simplify NEON yield Instead of calling into kernel_neon_end() and kernel_neon_begin() (and potentially into schedule()) from the assembler code when running in task mode and a reschedule is pending, perform only the preempt count check in assembler, but simply return early in this case, and let the C code deal with the consequences. This reverts commit 6caf7adc5e458f77f550b6c6ca8effa152d61b4a. 
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/sha512-ce-core.S | 29 ++++++--------------- arch/arm64/crypto/sha512-ce-glue.c | 53 +++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S index cde606c0323e..d6e7f6c95fa6 100644 --- a/arch/arm64/crypto/sha512-ce-core.S +++ b/arch/arm64/crypto/sha512-ce-core.S @@ -107,23 +107,17 @@ */ .text SYM_FUNC_START(sha512_ce_transform) - frame_push 3 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - /* load state */ -0: ld1 {v8.2d-v11.2d}, [x19] + ld1 {v8.2d-v11.2d}, [x0] /* load first 4 round constants */ adr_l x3, .Lsha512_rcon ld1 {v20.2d-v23.2d}, [x3], #64 /* load input */ -1: ld1 {v12.2d-v15.2d}, [x20], #64 - ld1 {v16.2d-v19.2d}, [x20], #64 - sub w21, w21, #1 +0: ld1 {v12.2d-v15.2d}, [x1], #64 + ld1 {v16.2d-v19.2d}, [x1], #64 + sub w2, w2, #1 CPU_LE( rev64 v12.16b, v12.16b ) CPU_LE( rev64 v13.16b, v13.16b ) @@ -201,19 +195,12 @@ CPU_LE( rev64 v19.16b, v19.16b ) add v10.2d, v10.2d, v2.2d add v11.2d, v11.2d, v3.2d + cond_yield 3f, x4 /* handled all input blocks? */ - cbz w21, 3f - - if_will_cond_yield_neon - st1 {v8.2d-v11.2d}, [x19] - do_cond_yield_neon - b 0b - endif_yield_neon - - b 1b + cbnz w2, 0b /* store new state */ -3: st1 {v8.2d-v11.2d}, [x19] - frame_pop +3: st1 {v8.2d-v11.2d}, [x0] + mov w0, w2 ret SYM_FUNC_END(sha512_ce_transform) diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index a6b1adf31c56..e62a094a9d52 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -26,11 +26,25 @@ MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("sha384"); MODULE_ALIAS_CRYPTO("sha512"); -asmlinkage void sha512_ce_transform(struct sha512_state *sst, u8 const *src, - int blocks); +asmlinkage int sha512_ce_transform(struct sha512_state *sst, u8 const *src, + int blocks); asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks); +static void __sha512_ce_transform(struct sha512_state *sst, u8 const *src, + int blocks) +{ + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha512_ce_transform(sst, src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA512_BLOCK_SIZE; + blocks = rem; + } +} + static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src, int blocks) { @@ -40,45 +54,30 @@ static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src, static int sha512_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!crypto_simd_usable()) - return sha512_base_do_update(desc, data, len, - __sha512_block_data_order); - - kernel_neon_begin(); - sha512_base_do_update(desc, data, len, sha512_ce_transform); - kernel_neon_end(); + sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform + : __sha512_block_data_order; + sha512_base_do_update(desc, data, len, fn); return 0; } static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) { - if (len) - sha512_base_do_update(desc, data, len, - __sha512_block_data_order); - sha512_base_do_finalize(desc, __sha512_block_data_order); - return sha512_base_finish(desc, out); - } + sha512_block_fn *fn = crypto_simd_usable() ? 
__sha512_ce_transform + : __sha512_block_data_order; - kernel_neon_begin(); - sha512_base_do_update(desc, data, len, sha512_ce_transform); - sha512_base_do_finalize(desc, sha512_ce_transform); - kernel_neon_end(); + sha512_base_do_update(desc, data, len, fn); + sha512_base_do_finalize(desc, fn); return sha512_base_finish(desc, out); } static int sha512_ce_final(struct shash_desc *desc, u8 *out) { - if (!crypto_simd_usable()) { - sha512_base_do_finalize(desc, __sha512_block_data_order); - return sha512_base_finish(desc, out); - } + sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform + : __sha512_block_data_order; - kernel_neon_begin(); - sha512_base_do_finalize(desc, sha512_ce_transform); - kernel_neon_end(); + sha512_base_do_finalize(desc, fn); return sha512_base_finish(desc, out); } -- cgit v1.2.3 From f5943ef456f8961ed1266a5713b8faf73019405b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Feb 2021 12:36:23 +0100 Subject: crypto: arm64/aes-neonbs - remove NEON yield calls There is no need for elaborate yield handling in the bit-sliced NEON implementation of AES, given that skciphers are naturally bounded by the size of the chunks returned by the skcipher_walk API. So remove the yield calls from the asm code. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-neonbs-core.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index 63a52ad9a75c..a3405b8c344b 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -613,7 +613,6 @@ SYM_FUNC_END(aesbs_decrypt8) st1 {\o7\().16b}, [x19], #16 cbz x23, 1f - cond_yield_neon b 99b 1: frame_pop @@ -715,7 +714,6 @@ SYM_FUNC_START(aesbs_cbc_decrypt) 1: st1 {v24.16b}, [x24] // store IV cbz x23, 2f - cond_yield_neon b 99b 2: frame_pop @@ -801,7 +799,7 @@ SYM_FUNC_END(__xts_crypt8) mov x23, x4 mov x24, x5 -0: movi v30.2s, #0x1 + movi v30.2s, #0x1 movi v25.2s, #0x87 uzp1 v30.4s, v30.4s, v25.4s ld1 {v25.16b}, [x24] @@ -846,7 +844,6 @@ SYM_FUNC_END(__xts_crypt8) cbz x23, 1f st1 {v25.16b}, [x24] - cond_yield_neon 0b b 99b 1: st1 {v25.16b}, [x24] @@ -889,7 +886,7 @@ SYM_FUNC_START(aesbs_ctr_encrypt) cset x26, ne add x23, x23, x26 // do one extra block if final -98: ldp x7, x8, [x24] + ldp x7, x8, [x24] ld1 {v0.16b}, [x24] CPU_LE( rev x7, x7 ) CPU_LE( rev x8, x8 ) @@ -967,7 +964,6 @@ CPU_LE( rev x8, x8 ) st1 {v0.16b}, [x24] cbz x23, .Lctr_done - cond_yield_neon 98b b 99b .Lctr_done: -- cgit v1.2.3 From f0070f4a7934e4deba83fdde70c79d9798b2366b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Feb 2021 12:36:24 +0100 Subject: crypto: arm64/aes-ce-mac - simplify NEON yield Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 21 +++++++++++------ arch/arm64/crypto/aes-modes.S | 52 ++++++++++++++++--------------------------- 2 files changed, 33 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index e7f116d833b9..17e735931a0c 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -105,9 +105,9 @@ asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, int blocks, u8 iv[], u32 const rk2[]); -asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds, - int blocks, u8 dg[], int enc_before, - int enc_after); +asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], 
int rounds, + int blocks, u8 dg[], int enc_before, + int enc_after); struct crypto_aes_xts_ctx { struct crypto_aes_ctx key1; @@ -856,10 +856,17 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, int rounds = 6 + ctx->key_length / 4; if (crypto_simd_usable()) { - kernel_neon_begin(); - aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before, - enc_after); - kernel_neon_end(); + int rem; + + do { + kernel_neon_begin(); + rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, + dg, enc_before, enc_after); + kernel_neon_end(); + in += (blocks - rem) * AES_BLOCK_SIZE; + blocks = rem; + enc_before = 0; + } while (blocks); } else { if (enc_before) aes_encrypt(ctx, dg, dg); diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index 3d1f97799899..bbdb54702aa7 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -678,61 +678,47 @@ AES_FUNC_END(aes_xts_decrypt) * int blocks, u8 dg[], int enc_before, int enc_after) */ AES_FUNC_START(aes_mac_update) - frame_push 6 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x6 - - ld1 {v0.16b}, [x23] /* get dg */ + ld1 {v0.16b}, [x4] /* get dg */ enc_prepare w2, x1, x7 cbz w5, .Lmacloop4x encrypt_block v0, w2, x1, x7, w8 .Lmacloop4x: - subs w22, w22, #4 + subs w3, w3, #4 bmi .Lmac1x - ld1 {v1.16b-v4.16b}, [x19], #64 /* get next pt block */ + ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v2.16b - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v3.16b - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v4.16b - cmp w22, wzr - csinv x5, x24, xzr, eq + cmp w3, wzr + csinv x5, x6, xzr, eq cbz w5, .Lmacout - encrypt_block v0, w21, x20, x7, w8 - st1 {v0.16b}, [x23] /* return dg */ - cond_yield_neon .Lmacrestart + encrypt_block v0, w2, x1, x7, w8 + st1 {v0.16b}, [x4] /* return dg */ + cond_yield .Lmacout, x7 b .Lmacloop4x .Lmac1x: - add w22, w22, #4 + add w3, w3, #4 .Lmacloop: - cbz w22, .Lmacout - ld1 {v1.16b}, [x19], #16 /* get next pt block */ + cbz w3, .Lmacout + ld1 {v1.16b}, [x0], #16 /* get next pt block */ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ - subs w22, w22, #1 - csinv x5, x24, xzr, eq + subs w3, w3, #1 + csinv x5, x6, xzr, eq cbz w5, .Lmacout .Lmacenc: - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 b .Lmacloop .Lmacout: - st1 {v0.16b}, [x23] /* return dg */ - frame_pop + st1 {v0.16b}, [x4] /* return dg */ + mov w0, w3 ret - -.Lmacrestart: - ld1 {v0.16b}, [x23] /* get dg */ - enc_prepare w21, x20, x0 - b .Lmacloop4x AES_FUNC_END(aes_mac_update) -- cgit v1.2.3 From fc754c024a343b836cfbb794afd3c7a87f625dbb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Feb 2021 12:36:25 +0100 Subject: crypto: arm64/crc-t10dif - move NEON yield to C code Instead of yielding from the bowels of the asm routine if a reschedule is needed, divide up the input into 4 KB chunks in the C glue. This simplifies the code substantially, and avoids scheduling out the task with the asm routine on the call stack, which is undesirable from a CFI/instrumentation point of view. 
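Condensed, the new glue-side loop is the sketch below; the 4 KiB cap bounds how long the NEON unit is claimed (and thus how long preemption stays disabled), and comparing against SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE rather than plain SZ_4K avoids splitting off a final chunk shorter than the minimum length the asm routine accepts:

	do {
		unsigned int chunk = length;

		if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
			chunk = SZ_4K;

		kernel_neon_begin();
		*crc = crc_t10dif_pmull_p64(*crc, data, chunk);
		kernel_neon_end();

		data += chunk;
		length -= chunk;
	} while (length);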
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/crct10dif-ce-core.S | 43 +++++++++-------------------------- arch/arm64/crypto/crct10dif-ce-glue.c | 30 +++++++++++++++++++----- 2 files changed, 35 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index 111d9c9abddd..dce6dcebfca1 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -68,10 +68,10 @@ .text .arch armv8-a+crypto - init_crc .req w19 - buf .req x20 - len .req x21 - fold_consts_ptr .req x22 + init_crc .req w0 + buf .req x1 + len .req x2 + fold_consts_ptr .req x3 fold_consts .req v10 @@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) .endm .macro crc_t10dif_pmull, p - frame_push 4, 128 - - mov init_crc, w0 - mov buf, x1 - mov len, x2 - __pmull_init_\p // For sizes less than 256 bytes, we can't fold 128 bytes at a time. @@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) fold_32_bytes \p, v6, v7 subs len, len, #128 - b.lt .Lfold_128_bytes_loop_done_\@ - - if_will_cond_yield_neon - stp q0, q1, [sp, #.Lframe_local_offset] - stp q2, q3, [sp, #.Lframe_local_offset + 32] - stp q4, q5, [sp, #.Lframe_local_offset + 64] - stp q6, q7, [sp, #.Lframe_local_offset + 96] - do_cond_yield_neon - ldp q0, q1, [sp, #.Lframe_local_offset] - ldp q2, q3, [sp, #.Lframe_local_offset + 32] - ldp q4, q5, [sp, #.Lframe_local_offset + 64] - ldp q6, q7, [sp, #.Lframe_local_offset + 96] - ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_init_\p - __pmull_pre_\p fold_consts - endif_yield_neon - - b .Lfold_128_bytes_loop_\@ - -.Lfold_128_bytes_loop_done_\@: + b.ge .Lfold_128_bytes_loop_\@ // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. @@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. umov w0, v0.h[0] - frame_pop + .ifc \p, p8 + ldp x29, x30, [sp], #16 + .endif ret .Lless_than_256_bytes_\@: @@ -489,7 +466,9 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // Assumes len >= 16. // SYM_FUNC_START(crc_t10dif_pmull_p8) - crc_t10dif_pmull p8 + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + crc_t10dif_pmull p8 SYM_FUNC_END(crc_t10dif_pmull_p8) .align 5 diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index ccc3f6067742..09eb1456aed4 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p8(*crc, data, length); - kernel_neon_end(); + do { + unsigned int chunk = length; + + if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) + chunk = SZ_4K; + + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p8(*crc, data, chunk); + kernel_neon_end(); + data += chunk; + length -= chunk; + } while (length); } else { *crc = crc_t10dif_generic(*crc, data, length); } @@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data, u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p64(*crc, data, length); - kernel_neon_end(); + do { + unsigned int chunk = length; + + if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) + chunk = SZ_4K; + + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p64(*crc, data, chunk); + kernel_neon_end(); + data += chunk; + length -= chunk; + } while (length); } else { *crc = crc_t10dif_generic(*crc, data, length); } -- cgit v1.2.3