author     Fathi Boudra <fathi.boudra@linaro.org>  2013-05-26 12:15:05 +0300
committer  Fathi Boudra <fathi.boudra@linaro.org>  2013-06-29 15:09:45 +0300
commit     a7dbcd9ed1d4d9cf3c5e327d4daed85e393303a1 (patch)
tree       84e2fda53d6fd06ff21b3be1b211a44f917155ec
parent     bb612d6a59521b30e8dbe7b91cd696e2980cbf6b (diff)
Imported Debian patch 1.0.1c-4ubuntu9~linaro1 (HEAD, master)
-rw-r--r--  debian/changelog | 112
-rw-r--r--  debian/control | 4
-rw-r--r--  debian/libssl1.0.0.postinst | 2
-rw-r--r--  debian/patches/arm64-support | 19
-rw-r--r--  debian/patches/debian-targets.patch | 2
-rw-r--r--  debian/patches/old/0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch | 1518
-rw-r--r--  debian/patches/old/0002-bsaes-armv7.pl-Big-endian-fixes.patch | 216
-rw-r--r--  debian/patches/old/0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch | 24
-rw-r--r--  debian/patches/pic.patch | 18
-rw-r--r--  debian/patches/series | 2
-rw-r--r--  debian/patches/ubuntu_deb676533_arm_asm.patch | 2
11 files changed, 122 insertions(+), 1797 deletions(-)
diff --git a/debian/changelog b/debian/changelog
index 9c76aae..abb81fa 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,6 +1,6 @@
-openssl (1.0.1c-3ubuntu2.5~linaro2) quantal; urgency=low
+openssl (1.0.1c-4ubuntu9~linaro1) raring; urgency=low
- * Update patches - merged upstream ():
+ * Update patches - merged upstream:
- 0000-bsaes-armv7.patch
- 0000-crypto-modes-modes_lcl.h-let-STRICT_ALIGNMENT-be-on-.patch
- 0001-bsaes-armv7.pl-take-it-into-build-loop.patch
@@ -8,48 +8,40 @@ openssl (1.0.1c-3ubuntu2.5~linaro2) quantal; urgency=low
- 0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch
* Update debian/control: build on armhf architecture only.
- -- Fathi Boudra <fathi.boudra@linaro.org> Wed, 24 Apr 2013 12:45:04 +0300
+ -- Fathi Boudra <fathi.boudra@linaro.org> Sun, 26 May 2013 12:15:05 +0300
-openssl (1.0.1c-3ubuntu2.5~linaro1) quantal; urgency=low
+openssl (1.0.1c-4ubuntu8) raring; urgency=low
- * Add patches from Ard Biesheuvel:
- - 0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch
- - 0002-bsaes-armv7.pl-Big-endian-fixes.patch
- - 0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch
+ * SECURITY UPDATE: "Lucky Thirteen" timing side-channel TLS attack
+ - debian/patches/CVE-2013-0169.patch: re-enabled patch and added extra
+ commit from upstream to fix regression.
+ - CVE-2013-0169
- -- Fathi Boudra <fathi.boudra@linaro.org> Tue, 16 Apr 2013 12:43:11 +0300
+ -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Tue, 19 Mar 2013 14:33:14 -0400
-openssl (1.0.1c-3ubuntu2.4) quantal; urgency=low
+openssl (1.0.1c-4ubuntu7) raring; urgency=low
- [ Dmitrijs Ledkovs ]
- * Enable arm assembly code. (LP: #1083498) (Closes: #676533)
* Enable optimized 64bit elliptic curve code contributed by Google. (LP: #1018522)
- [ Marc Deslauriers ]
- * debian/patches/fix_key_decoding_deadlock.patch: Fix possible deadlock
- when decoding public keys. (LP: #1066032)
-
- -- Dmitrijs Ledkovs <dmitrij.ledkov@ubuntu.com> Thu, 04 Apr 2013 12:15:11 +0100
+ -- Dmitrijs Ledkovs <dmitrij.ledkov@ubuntu.com> Thu, 07 Mar 2013 15:36:16 +0000
-openssl (1.0.1c-3ubuntu2.3) quantal-security; urgency=low
+openssl (1.0.1c-4ubuntu6) raring; urgency=low
- * SECURITY UPDATE: "Lucky Thirteen" timing side-channel TLS attack
- - debian/patches/CVE-2013-0169.patch: re-enabled patch and added extra
- commit from upstream to fix regression.
- - CVE-2013-0169
+ * debian/patches/fix_key_decoding_deadlock.patch: Fix possible deadlock
+ when decoding public keys. (LP: #1066032)
- -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Tue, 19 Mar 2013 14:35:24 -0400
+ -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Wed, 06 Mar 2013 08:11:19 -0500
-openssl (1.0.1c-3ubuntu2.2) quantal-security; urgency=low
+openssl (1.0.1c-4ubuntu5) raring; urgency=low
* REGRESSION FIX: decryption errors on AES-NI hardware (LP: #1134873,
LP: #1133333)
- debian/patches/CVE-2013-0169.patch: disabled for now until fix is
available from upstream.
- -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Thu, 28 Feb 2013 10:56:42 -0500
+ -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Thu, 28 Feb 2013 11:01:29 -0500
-openssl (1.0.1c-3ubuntu2.1) quantal-security; urgency=low
+openssl (1.0.1c-4ubuntu4) raring; urgency=low
* SECURITY UPDATE: denial of service via invalid OCSP key
- debian/patches/CVE-2013-0166.patch: properly handle NULL key in
@@ -62,7 +54,73 @@ openssl (1.0.1c-3ubuntu2.1) quantal-security; urgency=low
- Fix included in CVE-2013-0169 patch
- CVE-2012-2686
- -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Mon, 18 Feb 2013 13:13:42 -0500
+ -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Tue, 19 Feb 2013 13:25:24 -0500
+
+openssl (1.0.1c-4ubuntu3) raring; urgency=low
+
+ * Add basic arm64 support (no assembler) (LP: #1102107)
+
+ -- Wookey <wookey@wookware.org> Sun, 20 Jan 2013 17:30:15 +0000
+
+openssl (1.0.1c-4ubuntu2) raring; urgency=low
+
+ * Enable arm assembly code. (LP: #1083498) (Closes: #676533)
+
+ -- Dmitrijs Ledkovs <dmitrij.ledkov@ubuntu.com> Wed, 28 Nov 2012 00:08:45 +0000
+
+openssl (1.0.1c-4ubuntu1) raring; urgency=low
+
+ * Resynchronise with Debian (LP: #1077228). Remaining changes:
+ - debian/libssl1.0.0.postinst:
+ + Display a system restart required notification on libssl1.0.0
+ upgrade on servers.
+ + Use a different priority for libssl1.0.0/restart-services depending
+ on whether a desktop, or server dist-upgrade is being performed.
+ - debian/{libssl1.0.0-udeb.dirs, control, rules}: Create
+ libssl1.0.0-udeb, for the benefit of wget-udeb (no wget-udeb package
+ in Debian).
+ - debian/{libcrypto1.0.0-udeb.dirs, libssl1.0.0.dirs, libssl1.0.0.files,
+ rules}: Move runtime libraries to /lib, for the benefit of
+ wpasupplicant.
+ - debian/patches/perlpath-quilt.patch: Don't change perl #! paths under
+ .pc.
+ - debian/rules:
+ + Don't run 'make test' when cross-building.
+ + Use host compiler when cross-building. Patch from Neil Williams.
+ + Don't build for processors no longer supported: i586 (on i386)
+ + Fix Makefile to properly clean up libs/ dirs in clean target.
+ + Replace duplicate files in the doc directory with symlinks.
+ - Unapply patch c_rehash-multi and comment it out in the series as it
+ breaks parsing of certificates with CRLF line endings and other cases
+ (see Debian #642314 for discussion), it also changes the semantics of
+ c_rehash directories by requiring applications to parse hash link
+ targets as files containing potentially *multiple* certificates rather
+ than exactly one.
+ - Bump version passed to dh_makeshlibs to 1.0.1 for new symbols.
+ - debian/patches/tls12_workarounds.patch: Workaround large client hello
+ issues when TLS 1.1 and lower is in use
+ - debian/control: Mark Debian Vcs-* as XS-Debian-Vcs-*
+ * Dropped changes:
+ - Drop openssl-doc in favour of the libssl-doc package introduced by
+ Debian. Add Conflicts/Replaces until the next LTS release.
+ + Drop the Conflicts/Replaces because 12.04 LTS was 'the next LTS
+ release'
+
+ -- Tyler Hicks <tyhicks@canonical.com> Fri, 09 Nov 2012 14:49:13 -0800
+
+openssl (1.0.1c-4) unstable; urgency=low
+
+ * Fix the configure rules for alpha (Closes: #672710)
+ * Switch the postinst to sh again, there never was a reason to
+ switch it to bash (Closes: #676398)
+ * Fix pic.patch to not use #ifdef in x86cpuid.s, only .S files are
+ preprocessed. We generate the file again for pic anyway.
+ (Closes: #677468)
+ * Drop Breaks against openssh as it was only for upgrades
+ between versions that were only in testing/unstable.
+ (Closes: #668600)
+
+ -- Kurt Roeckx <kurt@roeckx.be> Tue, 17 Jul 2012 11:49:19 +0200
openssl (1.0.1c-3ubuntu2) quantal; urgency=low
diff --git a/debian/control b/debian/control
index 8868169..b2c0212 100644
--- a/debian/control
+++ b/debian/control
@@ -34,7 +34,6 @@ Architecture: armhf
Multi-Arch: same
Pre-Depends: ${misc:Pre-Depends}
Depends: ${shlibs:Depends}, ${misc:Depends}
-Breaks: openssh-client (<< 1:5.9p1-4), openssh-server (<< 1:5.9p1-4)
Description: SSL shared libraries
libssl and libcrypto shared libraries needed by programs like
apache-ssl, telnet-ssl and openssh.
@@ -79,8 +78,7 @@ Package: libssl-doc
Section: doc
Priority: optional
Architecture: all
-Replaces: libssl-dev (<< 1.0.0), openssl-doc
-Conflicts: openssl-doc
+Replaces: libssl-dev (<< 1.0.0)
Breaks: libssl-dev (<< 1.0.0)
Depends: ${shlibs:Depends}, ${perl:Depends}, ${misc:Depends}
Description: SSL development documentation documentation
diff --git a/debian/libssl1.0.0.postinst b/debian/libssl1.0.0.postinst
index 4e8a17c..57ae577 100644
--- a/debian/libssl1.0.0.postinst
+++ b/debian/libssl1.0.0.postinst
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
. /usr/share/debconf/confmodule
diff --git a/debian/patches/arm64-support b/debian/patches/arm64-support
new file mode 100644
index 0000000..a2f6cd6
--- /dev/null
+++ b/debian/patches/arm64-support
@@ -0,0 +1,19 @@
+Description: Add arm64 support
+ Add 'debian-arm64' to configure so it at least tries to build
+Author: Wookey <wookey@wookware.org>, Riku Voipio <riku.voipio@linaro.org>
+Last-Update: <2013-01-20>
+
+---
+ Configure | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/Configure
++++ b/Configure
+@@ -346,6 +346,7 @@ my %table=(
+ "debian-alpha","gcc:-DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-alpha-ev4","gcc:-DTERMIO ${debian_cflags} -mcpu=ev4::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-alpha-ev5","gcc:-DTERMIO ${debian_cflags} -mcpu=ev5::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
++"debian-arm64","gcc:-DL_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-armel","gcc:-DL_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-armhf","gcc:-DL_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-amd64", "gcc:-m64 -DL_ENDIAN -DTERMIO ${debian_cflags} -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::",
diff --git a/debian/patches/debian-targets.patch b/debian/patches/debian-targets.patch
index 0727967..1a4da29 100644
--- a/debian/patches/debian-targets.patch
+++ b/debian/patches/debian-targets.patch
@@ -20,7 +20,7 @@
"tru64-alpha-cc", "cc:-std1 -tune host -fast -readonly_strings::-pthread:::SIXTY_FOUR_BIT_LONG RC4_CHUNK:${alpha_asm}:dlfcn:alpha-osf1-shared::-msym:.so",
+# Debian GNU/* (various architectures)
-+"debian-alpha","gcc:-DTERMIO $debian_cflag::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
++"debian-alpha","gcc:-DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debian-alpha-ev4","gcc:-DTERMIO ${debian_cflags} -mcpu=ev4::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debian-alpha-ev5","gcc:-DTERMIO ${debian_cflags} -mcpu=ev5::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debian-armeb","gcc:-DB_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
diff --git a/debian/patches/old/0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch b/debian/patches/old/0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch
deleted file mode 100644
index 0ebaba8..0000000
--- a/debian/patches/old/0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch
+++ /dev/null
@@ -1,1518 +0,0 @@
-From 5e51c6b42b8b7d773ab45dcabec1189a1451bebd Mon Sep 17 00:00:00 2001
-From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
-Date: Thu, 11 Apr 2013 17:33:36 +0200
-Subject: [PATCH 1/3] Added CTR and CBC-decrypt hooks for NEON bit sliced AES
-
-The actual bit sliced AES transform was already there, only the
-hooks were missing. These are based heavily on the existing hooks
-for x86_64.
----
- Configure | 2
- crypto/aes/Makefile | 3
- crypto/aes/asm/bsaes-armv7.pl | 1447 ++++++++++++++++++++++++++++++++++++++++++
- crypto/evp/e_aes.c | 8
- 4 files changed, 1458 insertions(+), 2 deletions(-)
-
---- a/Configure
-+++ b/Configure
-@@ -140,7 +140,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alp
- my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
- my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
- my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:";
--my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
-+my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o bsaes-armv7.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
- my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
- my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
- my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
---- a/crypto/aes/Makefile
-+++ b/crypto/aes/Makefile
-@@ -79,8 +79,9 @@ aes-mips.S: asm/aes-mips.pl
- $(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@
-
- # GNU make "catch all"
--aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@
-+%.S: asm/%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@
- aes-armv4.o: aes-armv4.S
-+bsaes-armv7.o: bsaes-armv7.S
-
- files:
- $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
---- /dev/null
-+++ b/crypto/aes/asm/bsaes-armv7.pl
-@@ -0,0 +1,1447 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-+# project. The module is, however, dual licensed under OpenSSL and
-+# CRYPTOGAMS licenses depending on where you obtain it. For further
-+# details see http://www.openssl.org/~appro/cryptogams/.
-+# ====================================================================
-+
-+# Bit-sliced AES for ARM NEON
-+#
-+# February 2012.
-+#
-+# This implementation is direct adaptation of bsaes-x86_64 module for
-+# ARM NEON. Except that this module is endian-neutral [in sense that
-+# it can be compiled for either endianness] by courtesy of vld1.8's
-+# neutrality. Initial version doesn't implement interface to OpenSSL,
-+# only low-level primitives and unsupported entry points, just enough
-+# to collect performance results, which for Cortex-A8 core are:
-+#
-+# encrypt 19.5 cycles per byte processed with 128-bit key
-+# decrypt 24.0 cycles per byte processed with 128-bit key
-+# key conv. 440 cycles per 128-bit key/0.18 of 8x block
-+#
-+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 22.6,
-+# which is [much] worse than anticipated (for further details see
-+# http://www.openssl.org/~appro/Snapdragon-S4.html).
-+#
-+# When comparing to x86_64 results keep in mind that NEON unit is
-+# [mostly] single-issue and thus can't [fully] benefit from
-+# instruction-level parallelism. And when comparing to aes-armv4
-+# results keep in mind key schedule conversion overhead (see
-+# bsaes-x86_64.pl for further details)...
-+#
-+# <appro@openssl.org>
-+
-+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-+open STDOUT,">$output";
-+
-+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
-+my @XMM=map("q$_",(0..15));
-+
-+{
-+my ($key,$rounds,$const)=("r4","r5","r6");
-+
-+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-+
-+sub Sbox {
-+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
-+my @b=@_[0..7];
-+my @t=@_[8..11];
-+my @s=@_[12..15];
-+ &InBasisChange (@b);
-+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
-+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
-+}
-+
-+sub InBasisChange {
-+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
-+my @b=@_[0..7];
-+$code.=<<___;
-+ veor @b[2], @b[2], @b[1]
-+ veor @b[5], @b[5], @b[6]
-+ veor @b[3], @b[3], @b[0]
-+ veor @b[6], @b[6], @b[2]
-+ veor @b[5], @b[5], @b[0]
-+
-+ veor @b[6], @b[6], @b[3]
-+ veor @b[3], @b[3], @b[7]
-+ veor @b[7], @b[7], @b[5]
-+ veor @b[3], @b[3], @b[4]
-+ veor @b[4], @b[4], @b[5]
-+
-+ veor @b[2], @b[2], @b[7]
-+ veor @b[3], @b[3], @b[1]
-+ veor @b[1], @b[1], @b[5]
-+___
-+}
-+
-+sub OutBasisChange {
-+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
-+my @b=@_[0..7];
-+$code.=<<___;
-+ veor @b[0], @b[0], @b[6]
-+ veor @b[1], @b[1], @b[4]
-+ veor @b[4], @b[4], @b[6]
-+ veor @b[2], @b[2], @b[0]
-+ veor @b[6], @b[6], @b[1]
-+
-+ veor @b[1], @b[1], @b[5]
-+ veor @b[5], @b[5], @b[3]
-+ veor @b[3], @b[3], @b[7]
-+ veor @b[7], @b[7], @b[5]
-+ veor @b[2], @b[2], @b[5]
-+
-+ veor @b[4], @b[4], @b[7]
-+___
-+}
-+
-+sub InvSbox {
-+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
-+my @b=@_[0..7];
-+my @t=@_[8..11];
-+my @s=@_[12..15];
-+ &InvInBasisChange (@b);
-+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
-+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
-+}
-+
-+sub InvInBasisChange { # OutBasisChange in reverse (with twist)
-+my @b=@_[5,1,2,6,3,7,0,4];
-+$code.=<<___
-+ veor @b[1], @b[1], @b[7]
-+ veor @b[4], @b[4], @b[7]
-+
-+ veor @b[7], @b[7], @b[5]
-+ veor @b[1], @b[1], @b[3]
-+ veor @b[2], @b[2], @b[5]
-+ veor @b[3], @b[3], @b[7]
-+
-+ veor @b[6], @b[6], @b[1]
-+ veor @b[2], @b[2], @b[0]
-+ veor @b[5], @b[5], @b[3]
-+ veor @b[4], @b[4], @b[6]
-+ veor @b[0], @b[0], @b[6]
-+ veor @b[1], @b[1], @b[4]
-+___
-+}
-+
-+sub InvOutBasisChange { # InBasisChange in reverse
-+my @b=@_[2,5,7,3,6,1,0,4];
-+$code.=<<___;
-+ veor @b[1], @b[1], @b[5]
-+ veor @b[2], @b[2], @b[7]
-+
-+ veor @b[3], @b[3], @b[1]
-+ veor @b[4], @b[4], @b[5]
-+ veor @b[7], @b[7], @b[5]
-+ veor @b[3], @b[3], @b[4]
-+ veor @b[5], @b[5], @b[0]
-+ veor @b[3], @b[3], @b[7]
-+ veor @b[6], @b[6], @b[2]
-+ veor @b[2], @b[2], @b[1]
-+ veor @b[6], @b[6], @b[3]
-+
-+ veor @b[3], @b[3], @b[0]
-+ veor @b[5], @b[5], @b[6]
-+___
-+}
-+
-+sub Mul_GF4 {
-+#;*************************************************************
-+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
-+#;*************************************************************
-+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
-+$code.=<<___;
-+ veor $t0, $y0, $y1
-+ vand $t0, $t0, $x0
-+ veor $x0, $x0, $x1
-+ vand $t1, $x1, $y0
-+ vand $x0, $x0, $y1
-+ veor $x1, $t1, $t0
-+ veor $x0, $x0, $t1
-+___
-+}
-+
-+sub Mul_GF4_N { # not used, see next subroutine
-+# multiply and scale by N
-+my ($x0,$x1,$y0,$y1,$t0)=@_;
-+$code.=<<___;
-+ veor $t0, $y0, $y1
-+ vand $t0, $t0, $x0
-+ veor $x0, $x0, $x1
-+ vand $x1, $x1, $y0
-+ vand $x0, $x0, $y1
-+ veor $x1, $x1, $x0
-+ veor $x0, $x0, $t0
-+___
-+}
-+
-+sub Mul_GF4_N_GF4 {
-+# interleaved Mul_GF4_N and Mul_GF4
-+my ($x0,$x1,$y0,$y1,$t0,
-+ $x2,$x3,$y2,$y3,$t1)=@_;
-+$code.=<<___;
-+ veor $t0, $y0, $y1
-+ veor $t1, $y2, $y3
-+ vand $t0, $t0, $x0
-+ vand $t1, $t1, $x2
-+ veor $x0, $x0, $x1
-+ veor $x2, $x2, $x3
-+ vand $x1, $x1, $y0
-+ vand $x3, $x3, $y2
-+ vand $x0, $x0, $y1
-+ vand $x2, $x2, $y3
-+ veor $x1, $x1, $x0
-+ veor $x2, $x2, $x3
-+ veor $x0, $x0, $t0
-+ veor $x3, $x3, $t1
-+___
-+}
-+sub Mul_GF16_2 {
-+my @x=@_[0..7];
-+my @y=@_[8..11];
-+my @t=@_[12..15];
-+$code.=<<___;
-+ veor @t[0], @x[0], @x[2]
-+ veor @t[1], @x[1], @x[3]
-+___
-+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
-+$code.=<<___;
-+ veor @y[0], @y[0], @y[2]
-+ veor @y[1], @y[1], @y[3]
-+___
-+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
-+ @x[2], @x[3], @y[2], @y[3], @t[2]);
-+$code.=<<___;
-+ veor @x[0], @x[0], @t[0]
-+ veor @x[2], @x[2], @t[0]
-+ veor @x[1], @x[1], @t[1]
-+ veor @x[3], @x[3], @t[1]
-+
-+ veor @t[0], @x[4], @x[6]
-+ veor @t[1], @x[5], @x[7]
-+___
-+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
-+ @x[6], @x[7], @y[2], @y[3], @t[2]);
-+$code.=<<___;
-+ veor @y[0], @y[0], @y[2]
-+ veor @y[1], @y[1], @y[3]
-+___
-+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
-+$code.=<<___;
-+ veor @x[4], @x[4], @t[0]
-+ veor @x[6], @x[6], @t[0]
-+ veor @x[5], @x[5], @t[1]
-+ veor @x[7], @x[7], @t[1]
-+___
-+}
-+sub Inv_GF256 {
-+#;********************************************************************
-+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
-+#;********************************************************************
-+my @x=@_[0..7];
-+my @t=@_[8..11];
-+my @s=@_[12..15];
-+# direct optimizations from hardware
-+$code.=<<___;
-+ veor @t[3], @x[4], @x[6]
-+ veor @t[2], @x[5], @x[7]
-+ veor @t[1], @x[1], @x[3]
-+ veor @s[1], @x[7], @x[6]
-+ vmov @t[0], @t[2]
-+ veor @s[0], @x[0], @x[2]
-+
-+ vorr @t[2], @t[2], @t[1]
-+ veor @s[3], @t[3], @t[0]
-+ vand @s[2], @t[3], @s[0]
-+ vorr @t[3], @t[3], @s[0]
-+ veor @s[0], @s[0], @t[1]
-+ vand @t[0], @t[0], @t[1]
-+ veor @t[1], @x[3], @x[2]
-+ vand @s[3], @s[3], @s[0]
-+ vand @s[1], @s[1], @t[1]
-+ veor @t[1], @x[4], @x[5]
-+ veor @s[0], @x[1], @x[0]
-+ veor @t[3], @t[3], @s[1]
-+ veor @t[2], @t[2], @s[1]
-+ vand @s[1], @t[1], @s[0]
-+ vorr @t[1], @t[1], @s[0]
-+ veor @t[3], @t[3], @s[3]
-+ veor @t[0], @t[0], @s[1]
-+ veor @t[2], @t[2], @s[2]
-+ veor @t[1], @t[1], @s[3]
-+ veor @t[0], @t[0], @s[2]
-+ vand @s[0], @x[7], @x[3]
-+ veor @t[1], @t[1], @s[2]
-+ vand @s[1], @x[6], @x[2]
-+ vand @s[2], @x[5], @x[1]
-+ vorr @s[3], @x[4], @x[0]
-+ veor @t[3], @t[3], @s[0]
-+ veor @t[1], @t[1], @s[2]
-+ veor @t[0], @t[0], @s[3]
-+ veor @t[2], @t[2], @s[1]
-+
-+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
-+
-+ @ new smaller inversion
-+
-+ vand @s[2], @t[3], @t[1]
-+ vmov @s[0], @t[0]
-+
-+ veor @s[1], @t[2], @s[2]
-+ veor @s[3], @t[0], @s[2]
-+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
-+
-+ vbsl @s[1], @t[1], @t[0]
-+ vbsl @s[3], @t[3], @t[2]
-+ veor @t[3], @t[3], @t[2]
-+
-+ vbsl @s[0], @s[1], @s[2]
-+ vbsl @t[0], @s[2], @s[1]
-+
-+ vand @s[2], @s[0], @s[3]
-+ veor @t[1], @t[1], @t[0]
-+
-+ veor @s[2], @s[2], @t[3]
-+___
-+# output in s3, s2, s1, t1
-+
-+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
-+
-+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
-+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
-+
-+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
-+}
-+
-+# AES linear components
-+
-+sub ShiftRows {
-+my @x=@_[0..7];
-+my @t=@_[8..11];
-+my $mask=pop;
-+$code.=<<___;
-+ vldmia $key!, {@t[0]-@t[3]}
-+ veor @t[0], @t[0], @x[0]
-+ veor @t[1], @t[1], @x[1]
-+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
-+ vldmia $key!, {@t[0]}
-+ veor @t[2], @t[2], @x[2]
-+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
-+ vldmia $key!, {@t[1]}
-+ veor @t[3], @t[3], @x[3]
-+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
-+ vldmia $key!, {@t[2]}
-+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
-+ vldmia $key!, {@t[3]}
-+ veor @t[0], @t[0], @x[4]
-+ veor @t[1], @t[1], @x[5]
-+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
-+ veor @t[2], @t[2], @x[6]
-+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
-+ veor @t[3], @t[3], @x[7]
-+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
-+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
-+___
-+}
-+
-+sub MixColumns {
-+# modified to emit output in order suitable for feeding back to aesenc[last]
-+my @x=@_[0..7];
-+my @t=@_[8..15];
-+$code.=<<___;
-+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
-+ vext.8 @t[1], @x[1], @x[1], #12
-+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
-+ vext.8 @t[2], @x[2], @x[2], #12
-+ veor @x[1], @x[1], @t[1]
-+ vext.8 @t[3], @x[3], @x[3], #12
-+ veor @x[2], @x[2], @t[2]
-+ vext.8 @t[4], @x[4], @x[4], #12
-+ veor @x[3], @x[3], @t[3]
-+ vext.8 @t[5], @x[5], @x[5], #12
-+ veor @x[4], @x[4], @t[4]
-+ vext.8 @t[6], @x[6], @x[6], #12
-+ veor @x[5], @x[5], @t[5]
-+ vext.8 @t[7], @x[7], @x[7], #12
-+ veor @x[6], @x[6], @t[6]
-+
-+ veor @t[1], @t[1], @x[0]
-+ veor @x[7], @x[7], @t[7]
-+ vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
-+ veor @t[2], @t[2], @x[1]
-+ veor @t[0], @t[0], @x[7]
-+ veor @t[1], @t[1], @x[7]
-+ vext.8 @x[1], @x[1], @x[1], #8
-+ veor @t[5], @t[5], @x[4]
-+ veor @x[0], @x[0], @t[0]
-+ veor @t[6], @t[6], @x[5]
-+ veor @x[1], @x[1], @t[1]
-+ vext.8 @t[0], @x[4], @x[4], #8
-+ veor @t[4], @t[4], @x[3]
-+ vext.8 @t[1], @x[5], @x[5], #8
-+ veor @t[7], @t[7], @x[6]
-+ vext.8 @x[4], @x[3], @x[3], #8
-+ veor @t[3], @t[3], @x[2]
-+ vext.8 @x[5], @x[7], @x[7], #8
-+ veor @t[4], @t[4], @x[7]
-+ vext.8 @x[3], @x[6], @x[6], #8
-+ veor @t[3], @t[3], @x[7]
-+ vext.8 @x[6], @x[2], @x[2], #8
-+ veor @x[7], @t[1], @t[5]
-+ veor @x[2], @t[0], @t[4]
-+
-+ veor @x[4], @x[4], @t[3]
-+ veor @x[5], @x[5], @t[7]
-+ veor @x[3], @x[3], @t[6]
-+ @ vmov @x[2], @t[0]
-+ veor @x[6], @x[6], @t[2]
-+ @ vmov @x[7], @t[1]
-+___
-+}
-+
-+sub InvMixColumns {
-+my @x=@_[0..7];
-+my @t=@_[8..15];
-+
-+$code.=<<___;
-+ @ multiplication by 0x0e
-+ vext.8 @t[7], @x[7], @x[7], #12
-+ vmov @t[2], @x[2]
-+ veor @x[2], @x[2], @x[5] @ 2 5
-+ veor @x[7], @x[7], @x[5] @ 7 5
-+ vext.8 @t[0], @x[0], @x[0], #12
-+ vmov @t[5], @x[5]
-+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
-+ veor @x[0], @x[0], @x[1] @ 0 1
-+ vext.8 @t[1], @x[1], @x[1], #12
-+ veor @x[1], @x[1], @x[2] @ 1 25
-+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
-+ vext.8 @t[3], @x[3], @x[3], #12
-+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
-+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
-+ veor @x[3], @x[3], @x[7] @ 3 75
-+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
-+ vext.8 @t[6], @x[6], @x[6], #12
-+ vmov @t[4], @x[4]
-+ veor @x[6], @x[6], @x[4] @ 6 4
-+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
-+ veor @x[3], @x[3], @x[7] @ 375 756=36
-+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
-+ veor @x[3], @x[3], @t[2] @ 36 2
-+ vext.8 @t[5], @t[5], @t[5], #12
-+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
-+___
-+ my @y = @x[7,5,0,2,1,3,4,6];
-+$code.=<<___;
-+ @ multiplication by 0x0b
-+ veor @y[1], @y[1], @y[0]
-+ veor @y[0], @y[0], @t[0]
-+ vext.8 @t[2], @t[2], @t[2], #12
-+ veor @y[1], @y[1], @t[1]
-+ veor @y[0], @y[0], @t[5]
-+ vext.8 @t[4], @t[4], @t[4], #12
-+ veor @y[1], @y[1], @t[6]
-+ veor @y[0], @y[0], @t[7]
-+ veor @t[7], @t[7], @t[6] @ clobber t[7]
-+
-+ veor @y[3], @y[3], @t[0]
-+ veor @y[1], @y[1], @y[0]
-+ vext.8 @t[0], @t[0], @t[0], #12
-+ veor @y[2], @y[2], @t[1]
-+ veor @y[4], @y[4], @t[1]
-+ vext.8 @t[1], @t[1], @t[1], #12
-+ veor @y[2], @y[2], @t[2]
-+ veor @y[3], @y[3], @t[2]
-+ veor @y[5], @y[5], @t[2]
-+ veor @y[2], @y[2], @t[7]
-+ vext.8 @t[2], @t[2], @t[2], #12
-+ veor @y[3], @y[3], @t[3]
-+ veor @y[6], @y[6], @t[3]
-+ veor @y[4], @y[4], @t[3]
-+ veor @y[7], @y[7], @t[4]
-+ vext.8 @t[3], @t[3], @t[3], #12
-+ veor @y[5], @y[5], @t[4]
-+ veor @y[7], @y[7], @t[7]
-+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
-+ veor @y[3], @y[3], @t[5]
-+ veor @y[4], @y[4], @t[4]
-+
-+ veor @y[5], @y[5], @t[7]
-+ vext.8 @t[4], @t[4], @t[4], #12
-+ veor @y[6], @y[6], @t[7]
-+ veor @y[4], @y[4], @t[7]
-+
-+ veor @t[7], @t[7], @t[5]
-+ vext.8 @t[5], @t[5], @t[5], #12
-+
-+ @ multiplication by 0x0d
-+ veor @y[4], @y[4], @y[7]
-+ veor @t[7], @t[7], @t[6] @ restore t[7]
-+ veor @y[7], @y[7], @t[4]
-+ vext.8 @t[6], @t[6], @t[6], #12
-+ veor @y[2], @y[2], @t[0]
-+ veor @y[7], @y[7], @t[5]
-+ vext.8 @t[7], @t[7], @t[7], #12
-+ veor @y[2], @y[2], @t[2]
-+
-+ veor @y[3], @y[3], @y[1]
-+ veor @y[1], @y[1], @t[1]
-+ veor @y[0], @y[0], @t[0]
-+ veor @y[3], @y[3], @t[0]
-+ veor @y[1], @y[1], @t[5]
-+ veor @y[0], @y[0], @t[5]
-+ vext.8 @t[0], @t[0], @t[0], #12
-+ veor @y[1], @y[1], @t[7]
-+ veor @y[0], @y[0], @t[6]
-+ veor @y[3], @y[3], @y[1]
-+ veor @y[4], @y[4], @t[1]
-+ vext.8 @t[1], @t[1], @t[1], #12
-+
-+ veor @y[7], @y[7], @t[7]
-+ veor @y[4], @y[4], @t[2]
-+ veor @y[5], @y[5], @t[2]
-+ veor @y[2], @y[2], @t[6]
-+ veor @t[6], @t[6], @t[3] @ clobber t[6]
-+ vext.8 @t[2], @t[2], @t[2], #12
-+ veor @y[4], @y[4], @y[7]
-+ veor @y[3], @y[3], @t[6]
-+
-+ veor @y[6], @y[6], @t[6]
-+ veor @y[5], @y[5], @t[5]
-+ vext.8 @t[5], @t[5], @t[5], #12
-+ veor @y[6], @y[6], @t[4]
-+ vext.8 @t[4], @t[4], @t[4], #12
-+ veor @y[5], @y[5], @t[6]
-+ veor @y[6], @y[6], @t[7]
-+ vext.8 @t[7], @t[7], @t[7], #12
-+ veor @t[6], @t[6], @t[3] @ restore t[6]
-+ vext.8 @t[3], @t[3], @t[3], #12
-+
-+ @ multiplication by 0x09
-+ veor @y[4], @y[4], @y[1]
-+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
-+ veor @t[0], @t[0], @t[5] @ clobber t[0]
-+ vext.8 @t[6], @t[6], @t[6], #12
-+ veor @t[1], @t[1], @t[5]
-+ veor @y[3], @y[3], @t[0]
-+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
-+ veor @t[1], @t[1], @t[6]
-+ veor @t[6], @t[6], @t[7] @ clobber t[6]
-+ veor @y[4], @y[4], @t[1]
-+ veor @y[7], @y[7], @t[4]
-+ veor @y[6], @y[6], @t[3]
-+ veor @y[5], @y[5], @t[2]
-+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
-+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
-+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
-+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
-+ veor @t[3], @t[3], @t[7]
-+ veor @XMM[5], @t[5], @t[6]
-+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
-+ veor @XMM[2], @t[2], @t[6]
-+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
-+
-+ vmov @XMM[0], @t[0]
-+ vmov @XMM[1], @t[1]
-+ @ vmov @XMM[2], @t[2]
-+ vmov @XMM[3], @t[3]
-+ vmov @XMM[4], @t[4]
-+ @ vmov @XMM[5], @t[5]
-+ @ vmov @XMM[6], @t[6]
-+ @ vmov @XMM[7], @t[7]
-+___
-+}
-+
-+sub swapmove {
-+my ($a,$b,$n,$mask,$t)=@_;
-+$code.=<<___;
-+ vshr.u64 $t, $b, #$n
-+ veor $t, $t, $a
-+ vand $t, $t, $mask
-+ veor $a, $a, $t
-+ vshl.u64 $t, $t, #$n
-+ veor $b, $b, $t
-+___
-+}
-+sub swapmove2x {
-+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
-+$code.=<<___;
-+ vshr.u64 $t0, $b0, #$n
-+ vshr.u64 $t1, $b1, #$n
-+ veor $t0, $t0, $a0
-+ veor $t1, $t1, $a1
-+ vand $t0, $t0, $mask
-+ vand $t1, $t1, $mask
-+ veor $a0, $a0, $t0
-+ vshl.u64 $t0, $t0, #$n
-+ veor $a1, $a1, $t1
-+ vshl.u64 $t1, $t1, #$n
-+ veor $b0, $b0, $t0
-+ veor $b1, $b1, $t1
-+___
-+}
-+
-+sub bitslice {
-+my @x=reverse(@_[0..7]);
-+my ($t0,$t1,$t2,$t3)=@_[8..11];
-+$code.=<<___;
-+ vmov.i8 $t0,#0x55 @ compose .LBS0
-+ vmov.i8 $t1,#0x33 @ compose .LBS1
-+___
-+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
-+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-+$code.=<<___;
-+ vmov.i8 $t0,#0x0f @ compose .LBS2
-+___
-+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
-+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-+
-+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
-+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
-+}
-+
-+$code.=<<___;
-+ .text
-+ .code 32
-+ .fpu neon
-+
-+ .align 4
-+.LM0ISR: @ InvShiftRows constants
-+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
-+.LISR:
-+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
-+.LISRM0:
-+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
-+
-+ .type _bsaes_decrypt8,%function
-+_bsaes_decrypt8:
-+ adr $const,.LM0ISR
-+ vldmia $key!, {@XMM[9]} @ round 0 key
-+
-+ vldmia $const!, {@XMM[8]} @ .LM0ISR
-+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
-+ veor @XMM[11], @XMM[1], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ veor @XMM[12], @XMM[2], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+ veor @XMM[13], @XMM[3], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-+ veor @XMM[14], @XMM[4], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-+ veor @XMM[15], @XMM[5], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-+ veor @XMM[10], @XMM[6], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-+ veor @XMM[11], @XMM[7], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+___
-+ &bitslice (@XMM[0..7, 8..11]);
-+$code.=<<___;
-+ sub $rounds,$rounds,#1
-+ b .Ldec_sbox
-+
-+ .align 5
-+.Ldec_loop:
-+___
-+ &ShiftRows (@XMM[0..7, 8..12]);
-+$code.=".Ldec_sbox:\n";
-+ &InvSbox (@XMM[0..7, 8..15]);
-+$code.=<<___;
-+ subs $rounds,$rounds,#1
-+ bcc .Ldec_done
-+___
-+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
-+$code.=<<___;
-+ vldmia $const, {@XMM[12]} @ .LISR
-+ addeq $const,$const,#0x10
-+ bne .Ldec_loop
-+ vldmia $const, {@XMM[12]} @ .LISRM0
-+ b .Ldec_loop
-+.align 4
-+.Ldec_done:
-+___
-+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
-+$code.=<<___;
-+ vldmia $key, {@XMM[8]} @ last round key
-+ veor @XMM[6], @XMM[6], @XMM[8]
-+ veor @XMM[4], @XMM[4], @XMM[8]
-+ veor @XMM[2], @XMM[2], @XMM[8]
-+ veor @XMM[7], @XMM[7], @XMM[8]
-+ veor @XMM[3], @XMM[3], @XMM[8]
-+ veor @XMM[5], @XMM[5], @XMM[8]
-+ veor @XMM[0], @XMM[0], @XMM[8]
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ bx lr
-+.size _bsaes_decrypt8,.-_bsaes_decrypt8
-+
-+ .align 4
-+_bsaes_const:
-+.LM0SR: @ ShiftRows constants
-+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
-+.LSR:
-+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
-+.LSRM0:
-+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
-+
-+ .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
-+ .size _bsaes_const,.-_bsaes_const
-+
-+ .align 5
-+ .type _bsaes_encrypt8,%function
-+_bsaes_encrypt8:
-+ adr $const,.LM0SR
-+ vldmia $key!, {@XMM[9]} @ round 0 key
-+
-+ vldmia $const!, {@XMM[8]} @ .LM0SR
-+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
-+ veor @XMM[11], @XMM[1], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ veor @XMM[12], @XMM[2], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+ veor @XMM[13], @XMM[3], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-+ veor @XMM[14], @XMM[4], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-+ veor @XMM[15], @XMM[5], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-+ veor @XMM[10], @XMM[6], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-+ veor @XMM[11], @XMM[7], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+_bsaes_encrypt8_bitslice:
-+___
-+ &bitslice (@XMM[0..7, 8..11]);
-+$code.=<<___;
-+ sub $rounds,$rounds,#1
-+ b .Lenc_sbox
-+.align 4
-+.Lenc_loop:
-+___
-+ &ShiftRows (@XMM[0..7, 8..12]);
-+$code.=".Lenc_sbox:\n";
-+ &Sbox (@XMM[0..7, 8..15]);
-+$code.=<<___;
-+ subs $rounds,$rounds,#1
-+ bcc .Lenc_done
-+___
-+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
-+$code.=<<___;
-+ vldmia $const, {@XMM[12]} @ .LSR
-+ addeq $const,$const,#0x10
-+ bne .Lenc_loop
-+ vldmia $const, {@XMM[12]} @ .LSRM0
-+ b .Lenc_loop
-+.align 4
-+.Lenc_done:
-+___
-+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
-+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
-+$code.=<<___;
-+ vldmia $key, {@XMM[8]} @ last round key
-+ veor @XMM[4], @XMM[4], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[8]
-+ veor @XMM[3], @XMM[3], @XMM[8]
-+ veor @XMM[7], @XMM[7], @XMM[8]
-+ veor @XMM[2], @XMM[2], @XMM[8]
-+ veor @XMM[5], @XMM[5], @XMM[8]
-+ veor @XMM[0], @XMM[0], @XMM[8]
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ bx lr
-+.size _bsaes_encrypt8,.-_bsaes_encrypt8
-+___
-+}
-+{
-+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
-+
-+sub bitslice_key {
-+my @x=reverse(@_[0..7]);
-+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
-+
-+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
-+$code.=<<___;
-+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
-+ vmov @x[2], @x[0]
-+ vmov @x[3], @x[1]
-+___
-+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-+
-+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
-+$code.=<<___;
-+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-+ vmov @x[4], @x[0]
-+ vmov @x[6], @x[2]
-+ vmov @x[5], @x[1]
-+ vmov @x[7], @x[3]
-+___
-+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
-+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
-+}
-+
-+$code.=<<___;
-+ .align 4
-+.LM0:
-+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
-+
-+ .type _bsaes_key_convert,%function
-+_bsaes_key_convert:
-+ adr $const,.LM0
-+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
-+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
-+
-+ vmov.i8 @XMM[8], #0x01 @ bit masks
-+ vmov.i8 @XMM[9], #0x02
-+ vmov.i8 @XMM[10], #0x04
-+ vmov.i8 @XMM[11], #0x08
-+ vmov.i8 @XMM[12], #0x10
-+ vmov.i8 @XMM[13], #0x20
-+ vldmia $const, {@XMM[14]} @ .LM0
-+
-+#ifdef __ARMEL__
-+ vrev32.8 @XMM[7], @XMM[7]
-+ vrev32.8 @XMM[15], @XMM[15]
-+#endif
-+ sub $rounds,$rounds,#1
-+ vstmia $out!, {@XMM[7]} @ save round 0 key
-+ b .Lkey_loop
-+
-+ .align 5
-+.Lkey_loop:
-+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
-+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
-+ vmov.i8 @XMM[6], #0x40
-+ vmov.i8 @XMM[15], #0x80
-+
-+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
-+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
-+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
-+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
-+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
-+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
-+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
-+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
-+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
-+ vmvn @XMM[0], @XMM[0] @ "pnot"
-+ vmvn @XMM[1], @XMM[1]
-+ vmvn @XMM[5], @XMM[5]
-+ vmvn @XMM[6], @XMM[6]
-+#ifdef __ARMEL__
-+ vrev32.8 @XMM[15], @XMM[15]
-+#endif
-+ subs $rounds,$rounds,#1
-+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
-+ bne .Lkey_loop
-+
-+ vmov.i8 @XMM[7],#0x63 @ compose .L63
-+ @ don't save last round key
-+ bx lr
-+.size _bsaes_key_convert,.-_bsaes_key_convert
-+___
-+}
-+
-+if (0) { # following four functions are unsupported interface
-+ # used for benchmarking...
-+$code.=<<___;
-+.globl bsaes_enc_key_convert
-+.type bsaes_enc_key_convert,%function
-+.align 4
-+bsaes_enc_key_convert:
-+ stmdb sp!,{r4-r6,lr}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+
-+ ldr r5,[$inp,#240] @ pass rounds
-+ mov r4,$inp @ pass key
-+ mov r12,$out @ pass key schedule
-+ bl _bsaes_key_convert
-+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
-+ vstmia r12, {@XMM[7]} @ save last round key
-+
-+ vldmia sp!,{d8-d15}
-+ ldmia sp!,{r4-r6,pc}
-+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
-+
-+.globl bsaes_encrypt_128
-+.type bsaes_encrypt_128,%function
-+.align 4
-+bsaes_encrypt_128:
-+ stmdb sp!,{r4-r6,lr}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+.Lenc128_loop:
-+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
-+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
-+ mov r4,$key @ pass the key
-+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
-+ mov r5,#10 @ pass rounds
-+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
-+
-+ bl _bsaes_encrypt8
-+
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[3]}, [$out]!
-+ vst1.8 {@XMM[7]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ subs $len,$len,#0x80
-+ vst1.8 {@XMM[5]}, [$out]!
-+ bhi .Lenc128_loop
-+
-+ vldmia sp!,{d8-d15}
-+ ldmia sp!,{r4-r6,pc}
-+.size bsaes_encrypt_128,.-bsaes_encrypt_128
-+
-+.globl bsaes_dec_key_convert
-+.type bsaes_dec_key_convert,%function
-+.align 4
-+bsaes_dec_key_convert:
-+ stmdb sp!,{r4-r6,lr}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+
-+ ldr r5,[$inp,#240] @ pass rounds
-+ mov r4,$inp @ pass key
-+ mov r12,$out @ pass key schedule
-+ bl _bsaes_key_convert
-+ vldmia $out, {@XMM[6]}
-+ vstmia r12, {@XMM[15]} @ save last round key
-+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
-+ vstmia $out, {@XMM[7]}
-+
-+ vldmia sp!,{d8-d15}
-+ ldmia sp!,{r4-r6,pc}
-+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
-+
-+.globl bsaes_decrypt_128
-+.type bsaes_decrypt_128,%function
-+.align 4
-+bsaes_decrypt_128:
-+ stmdb sp!,{r4-r6,lr}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+.Ldec128_loop:
-+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
-+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
-+ mov r4,$key @ pass the key
-+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
-+ mov r5,#10 @ pass rounds
-+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
-+
-+ bl _bsaes_decrypt8
-+
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vst1.8 {@XMM[7]}, [$out]!
-+ vst1.8 {@XMM[3]}, [$out]!
-+ subs $len,$len,#0x80
-+ vst1.8 {@XMM[5]}, [$out]!
-+ bhi .Ldec128_loop
-+
-+ vldmia sp!,{d8-d15}
-+ ldmia sp!,{r4-r6,pc}
-+.size bsaes_decrypt_128,.-bsaes_decrypt_128
-+___
-+}
-+
-+{
-+my ($inp,$out,$len,$key,$keysched,$rounds,$iv)=("r0","r1","r2","r3","sp","r7","r8");
-+
-+$code.=<<___;
-+ .align 5
-+ .extern AES_cbc_encrypt
-+ .extern AES_decrypt
-+ .global bsaes_cbc_encrypt
-+ .type bsaes_cbc_encrypt,%function
-+bsaes_cbc_encrypt:
-+ cmp $len, #128
-+ blo AES_cbc_encrypt
-+
-+ @ it is up to the caller to make sure we are called with enc == 0
-+
-+ stmdb sp!, {r4-r10, lr}
-+ vstmdb sp!, {d8-d15} @ ABI specification says so
-+ ldr $iv, [sp, #0x60] @ IV is 1st arg on the stack
-+ lsr $len, #4 @ len in 16 byte blocks
-+ sub sp, #0x10 @ scratch space to carry over the IV
-+ mov r10, sp @ save sp
-+
-+ @ allocate the key schedule on the stack
-+ ldr $rounds, [r3, #240] @ get # of rounds
-+ sub r14, sp, $rounds, lsl #7 @ 128 bytes per inner round key
-+ add sp, r14, #`128-32` @ size of bit-sliced key schedule
-+
-+ @ populate the key schedule
-+ mov r4, $key @ pass key
-+ mov r5, $rounds @ pass # of rounds
-+ mov r12, $keysched @ pass key schedule
-+ bl _bsaes_key_convert
-+ vldmia $keysched, {@XMM[6]}
-+ vstmia r12, {@XMM[15]} @ save last round key
-+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
-+ vstmia $keysched, {@XMM[7]}
-+
-+ vld1.8 {@XMM[15]}, [$iv] @ load IV
-+ mov $iv, r10
-+
-+.Lcbc_dec_loop:
-+ subs $len, #0x8
-+ bmi .Lcbc_dec_loop_finish
-+
-+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
-+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
-+ mov r4, $keysched @ pass the key
-+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
-+ mov r5, $rounds
-+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
-+ sub $inp, #0x60
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+
-+ bl _bsaes_decrypt8
-+
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ veor @XMM[2], @XMM[2], @XMM[11]
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
-+ veor @XMM[7], @XMM[7], @XMM[12]
-+ veor @XMM[3], @XMM[3], @XMM[13]
-+ veor @XMM[5], @XMM[5], @XMM[14]
-+ vst1.8 {@XMM[7]}, [$out]!
-+ vst1.8 {@XMM[3]}, [$out]!
-+ vst1.8 {@XMM[5]}, [$out]!
-+
-+ b .Lcbc_dec_loop
-+
-+.Lcbc_dec_loop_finish:
-+ adds $len, #8
-+ beq .Lcbc_dec_done
-+
-+ mov r5, $rounds
-+ vld1.8 {@XMM[0]}, [$inp]! @ load input
-+ mov r4, $keysched @ pass the key
-+ cmp $len, #2
-+ blo .Lcbc_dec_one
-+ vld1.8 {@XMM[1]}, [$inp]!
-+ beq .Lcbc_dec_two
-+ vld1.8 {@XMM[2]}, [$inp]!
-+ cmp $len, #4
-+ blo .Lcbc_dec_three
-+ vld1.8 {@XMM[3]}, [$inp]!
-+ beq .Lcbc_dec_four
-+ vld1.8 {@XMM[4]}, [$inp]!
-+ cmp $len, #6
-+ blo .Lcbc_dec_five
-+ vld1.8 {@XMM[5]}, [$inp]!
-+ beq .Lcbc_dec_six
-+ vld1.8 {@XMM[6]}, [$inp]!
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ sub $inp, #0x70
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ veor @XMM[2], @XMM[2], @XMM[11]
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[7], @XMM[7], @XMM[12]
-+ veor @XMM[3], @XMM[3], @XMM[13]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vst1.8 {@XMM[7]}, [$out]!
-+ vst1.8 {@XMM[3]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_six:
-+ sub $inp, #0x60
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vld1.8 {@XMM[12]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ veor @XMM[2], @XMM[2], @XMM[11]
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[7], @XMM[7], @XMM[12]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vst1.8 {@XMM[7]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_five:
-+ sub $inp, #0x50
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ veor @XMM[2], @XMM[2], @XMM[11]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_four:
-+ sub $inp, #0x40
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_three:
-+ sub $inp, #0x30
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_two:
-+ sub $inp, #0x20
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]}, [$inp]! @ reload input
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_one:
-+ sub $inp, #0x10
-+ mov r7, $out @ save original out pointer
-+ mov $out, $iv @ use the iv scratch space as out buffer
-+ mov r2, $key
-+ bl AES_decrypt
-+ vld1.8 {@XMM[0]}, [$iv] @ reload output
-+ veor @XMM[15], @XMM[15], @XMM[0] @ ^= IV
-+ vst1.8 {@XMM[15]}, [r7]
-+ vmov @XMM[15], @XMM[0] @ IV
-+
-+.Lcbc_dec_done:
-+ vmov.i32 q0, #0
-+.Lcbc_dec_bzero: @ wipe key schedule [if any]
-+ vst1.8 {q0}, [$keysched]!
-+ teq $keysched, r10
-+ bne .Lcbc_dec_bzero
-+
-+ add sp, #0x10
-+ ldr $iv, [sp, #0x60]
-+ vst1.8 {@XMM[15]}, [$iv] @ return IV
-+ vldmia sp!, {d8-d15}
-+ ldmia sp!, {r4-r10, pc}
-+
-+ .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-+___
-+}
-+{
-+my ($inp,$out,$len,$key,$keysched,$const,$rounds,$ctr)=("r0","r1","r2","r3","sp","r6","r7","r8");
-+
-+$code.=<<___;
-+ .align 5
-+ @ byte-swap constants
-+.LSWP:
-+ .quad 0x0405060703020100
-+.LSWPUPM0SR:
-+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
-+.LADD:
-+ .quad 0x0807060504030201
-+
-+ .extern AES_encrypt
-+ .global bsaes_ctr32_encrypt_blocks
-+ .type bsaes_ctr32_encrypt_blocks,%function
-+bsaes_ctr32_encrypt_blocks:
-+ cmp $len, #8 @ use plain AES for
-+ blo .Lctr_enc_short @ small sizes
-+
-+ stmdb sp!, {r4-r8, r10-r11, lr}
-+ vstmdb sp!, {d8-d15} @ ABI specification says so
-+ ldr $ctr, [sp, #0x60] @ ctr is 1st arg on the stack
-+ sub sp, #0x10 @ scratch space to carry over the ctr
-+ mov r10, sp @ save sp
-+
-+ @ allocate the key schedule on the stack
-+ ldr $rounds, [r3, #240] @ get # of rounds
-+ sub r14, sp, $rounds, lsl #7 @ 128 bytes per inner round key
-+ add sp, r14, #`128-32` @ size of bit-sliced key schedule
-+
-+ @ populate the key schedule
-+ mov r4, $key @ pass key
-+ mov r5, $rounds @ pass # of rounds
-+ mov r12, $keysched @ pass key schedule
-+ bl _bsaes_key_convert
-+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
-+ vstmia r12, {@XMM[7]} @ save last round key
-+
-+ vldm $ctr, {@XMM[0]} @ load counter
-+ mov $ctr, r10
-+
-+ vldm $keysched, {@XMM[4]} @ load round0 key
-+
-+ vldr `&Dlo(@XMM[8])`, .LSWP @ byte swap upper part
-+ vtbl.8 `&Dhi(@XMM[0])`, {`&Dhi(@XMM[0])`}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[4])`, {`&Dhi(@XMM[4])`}, `&Dlo(@XMM[8])`
-+
-+ vstm $keysched, {@XMM[4]} @ save adjusted round0 key
-+
-+ b .Lctr_enc_loop
-+
-+ .align 5
-+.Lctr_enc_loop:
-+
-+ @ set up the addition constants
-+ vldr `&Dlo(@XMM[11])`, .LADD
-+ vmov.i8 `&Dhi(@XMM[11])`, #0
-+ vmov.i8 @XMM[12], #0
-+ vzip.8 `&Dlo(@XMM[11])`, `&Dhi(@XMM[11])`
-+ vzip.16 @XMM[11], @XMM[12]
-+
-+ @ get 8 counter values in regs and do the add
-+ vdup.32 @XMM[4], `&Dhi(@XMM[0])`[1]
-+ vdup.32 @XMM[9], `&Dhi(@XMM[0])`[1]
-+ vadd.u32 @XMM[4], @XMM[11]
-+ vadd.u32 @XMM[9], @XMM[12]
-+ vdup.32 @XMM[2], `&Dhi(@XMM[0])`[0]
-+ vdup.32 @XMM[6], `&Dhi(@XMM[0])`[0]
-+ vzip.32 @XMM[2], @XMM[4]
-+ vzip.32 @XMM[6], @XMM[9]
-+
-+ vmov `&Dhi(@XMM[1])`, `&Dlo(@XMM[0])`
-+ vmov `&Dlo(@XMM[1])`, `&Dlo(@XMM[0])`
-+ vmov @XMM[3], @XMM[1]
-+ vmov @XMM[5], @XMM[1]
-+ vmov @XMM[7], @XMM[1]
-+
-+ vswp `&Dhi(@XMM[1])`, `&Dlo(@XMM[2])`
-+ vswp `&Dhi(@XMM[3])`, `&Dlo(@XMM[4])`
-+ vswp `&Dhi(@XMM[5])`, `&Dlo(@XMM[6])`
-+ vswp `&Dhi(@XMM[7])`, `&Dlo(@XMM[9])`
-+
-+ vstm $ctr, {@XMM[9]} @ save counter
-+
-+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
-+ @ to flip byte order in 32-bit counter
-+
-+ adr r11, .LSWPUPM0SR
-+ vld1.8 {@XMM[8]}, [r11]! @ .LSWPUPM0SR
-+ adrl $const,.LSR
-+ vld1.8 {@XMM[9]}, [$keysched] @ load round0 key
-+ mov r5, $rounds @ pass rounds
-+ add r4, $keysched, #0x10 @ pass next round key
-+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
-+ veor @XMM[11], @XMM[1], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ veor @XMM[12], @XMM[2], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+ veor @XMM[13], @XMM[3], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-+ veor @XMM[14], @XMM[4], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-+ veor @XMM[15], @XMM[5], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-+ veor @XMM[10], @XMM[6], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-+ veor @XMM[11], @XMM[7], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+
-+ bl _bsaes_encrypt8_bitslice
-+
-+ subs $len, #8
-+ blo .Lctr_enc_loop_done
-+
-+ vldmia $inp!, {@XMM[8]-@XMM[15]} @ load input
-+
-+ veor @XMM[8], @XMM[0]
-+ veor @XMM[1], @XMM[9]
-+ vst1.8 {@XMM[8]}, [$out]!
-+ veor @XMM[4], @XMM[10]
-+ vst1.8 {@XMM[1]}, [$out]!
-+ veor @XMM[6], @XMM[11]
-+ vst1.8 {@XMM[4]}, [$out]!
-+ veor @XMM[3], @XMM[12]
-+ vst1.8 {@XMM[6]}, [$out]!
-+ veor @XMM[7], @XMM[13]
-+ vst1.8 {@XMM[3]}, [$out]!
-+ veor @XMM[2], @XMM[14]
-+ vst1.8 {@XMM[7]}, [$out]!
-+ veor @XMM[5], @XMM[15]
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vst1.8 {@XMM[5]}, [$out]!
-+
-+ vldm $ctr, {@XMM[0]} @ load counter
-+
-+ bne .Lctr_enc_loop
-+ b .Lctr_enc_done
-+
-+.Lctr_enc_loop_done:
-+ add $len, #8
-+ vld1.8 {@XMM[8]}, [$inp]! @ load input
-+ veor @XMM[0], @XMM[8]
-+ vst1.8 {@XMM[0]}, [$out]! @ write output
-+ cmp $len, #2
-+ blo .Lctr_enc_done
-+ vld1.8 {@XMM[9]}, [$inp]!
-+ veor @XMM[1], @XMM[9]
-+ vst1.8 {@XMM[1]}, [$out]!
-+ beq .Lctr_enc_done
-+ vld1.8 {@XMM[10]}, [$inp]!
-+ veor @XMM[4], @XMM[10]
-+ vst1.8 {@XMM[4]}, [$out]!
-+ cmp $len, #4
-+ blo .Lctr_enc_done
-+ vld1.8 {@XMM[11]}, [$inp]!
-+ veor @XMM[6], @XMM[11]
-+ vst1.8 {@XMM[6]}, [$out]!
-+ beq .Lctr_enc_done
-+ vld1.8 {@XMM[12]}, [$inp]!
-+ veor @XMM[3], @XMM[12]
-+ vst1.8 {@XMM[3]}, [$out]!
-+ cmp $len, #6
-+ blo .Lctr_enc_done
-+ vld1.8 {@XMM[13]}, [$inp]!
-+ veor @XMM[7], @XMM[13]
-+ vst1.8 {@XMM[7]}, [$out]!
-+ beq .Lctr_enc_done
-+ vld1.8 {@XMM[14]}, [$inp]
-+ veor @XMM[2], @XMM[14]
-+ vst1.8 {@XMM[2]}, [$out]!
-+
-+.Lctr_enc_done:
-+ vmov.i32 q0, #0
-+.Lctr_enc_bzero: @ wipe key schedule [if any]
-+ vst1.8 {q0}, [$keysched]!
-+ teq $keysched, r10
-+ bne .Lctr_enc_bzero
-+
-+ add sp, r10, #0x10
-+ vldmia sp!, {d8-d15}
-+ ldmia sp!, {r4-r8, r10-r11, pc}
-+
-+.Lctr_enc_short:
-+ ldr ip, [sp] @ ctr pointer is passed on stack
-+ stmdb sp!, {r0-r6, lr} @ stack regs as usual
-+
-+ ldm sp, {r4-r6} @ copy r0-2 to r4-6
-+ vldmia ip, {d0-d1} @ load the counter from [arg5]
-+ vstmdb sp!, {d0-d1} @ copy of ctr to top of stack
-+ sub sp, #0x10
-+
-+.Lctr_enc_short_loop:
-+ add r0, sp, #0x10
-+ mov r1, sp @ put output on the stack
-+ ldr r2, [sp, #0x2c] @ stacked r3
-+
-+ bl AES_encrypt
-+
-+ vldmia r4!, {@XMM[1]} @ load input
-+ ldr r0, [sp, #0x1c] @ load LSW of counter (BE)
-+#ifdef __ARMEL__
-+ rev r0, r0 @ need to increment the counter
-+ add r0, #1 @ in BE mode
-+ rev r0, r0
-+#else
-+ add r0, #1
-+#endif
-+ vldm sp, {@XMM[0]}
-+ str r0, [sp, #0x1c]
-+ veor @XMM[0], @XMM[1]
-+ subs r6, #1
-+ vstmia r5!, {@XMM[0]}
-+ bne .Lctr_enc_short_loop
-+
-+ add sp, #0x30
-+ ldmia sp!, {r4-r6, pc}
-+
-+ .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-+___
-+}
-+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-+
-+print $code;
-+
-+close STDOUT;
---- a/crypto/evp/e_aes.c
-+++ b/crypto/evp/e_aes.c
-@@ -482,6 +482,12 @@ static const EVP_CIPHER aes_##keylen##_#
- NULL,NULL,aes_##mode##_ctrl,NULL }; \
- const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
- { return &aes_##keylen##_##mode; }
-+
-+#endif
-+
-+#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm))
-+#include "arm_arch.h"
-+#define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
- #endif
-
- #define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
-@@ -1064,11 +1070,13 @@ static int aes_xts_init_key(EVP_CIPHER_C
- xctx->stream = NULL;
- #endif
- /* key_len is two AES keys */
-+#if !(defined(__arm__) || defined(__arm))
- #ifdef BSAES_CAPABLE
- if (BSAES_CAPABLE)
- xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
- else
- #endif
-+#endif
- #ifdef VPAES_CAPABLE
- if (VPAES_CAPABLE)
- {
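[Editor's note, not part of the commit: the .Lctr_enc_short fallback removed above encrypts one counter block at a time with AES_encrypt, XORs it into the input, and bumps the low 32 bits of the counter, which live in the block in big-endian order. The C below is only an illustrative sketch of that semantics under those assumptions; aes_encrypt_block() is a hypothetical single-block primitive standing in for AES_encrypt, not a real OpenSSL symbol.]

    #include <stddef.h>
    #include <stdint.h>

    /* hypothetical single-block AES primitive, stands in for AES_encrypt */
    void aes_encrypt_block(const uint8_t in[16], uint8_t out[16], const void *key);

    static void ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                     size_t blocks, const void *key,
                                     uint8_t ivec[16])
    {
        uint8_t keystream[16];

        while (blocks--) {
            aes_encrypt_block(ivec, keystream, key);   /* E_k(counter block) */
            for (int i = 0; i < 16; i++)
                out[i] = in[i] ^ keystream[i];

            /* increment the low 32 bits of the counter; they are stored
             * big endian in the block regardless of host endianness */
            uint32_t ctr = ((uint32_t)ivec[12] << 24) | ((uint32_t)ivec[13] << 16) |
                           ((uint32_t)ivec[14] << 8)  |  (uint32_t)ivec[15];
            ctr++;
            ivec[12] = (uint8_t)(ctr >> 24);
            ivec[13] = (uint8_t)(ctr >> 16);
            ivec[14] = (uint8_t)(ctr >> 8);
            ivec[15] = (uint8_t)ctr;

            in  += 16;
            out += 16;
        }
    }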
diff --git a/debian/patches/old/0002-bsaes-armv7.pl-Big-endian-fixes.patch b/debian/patches/old/0002-bsaes-armv7.pl-Big-endian-fixes.patch
deleted file mode 100644
index 23fb94a..0000000
--- a/debian/patches/old/0002-bsaes-armv7.pl-Big-endian-fixes.patch
+++ /dev/null
@@ -1,216 +0,0 @@
-From 719e0b800e3737f3a19251a097ff911744ed7a9e Mon Sep 17 00:00:00 2001
-From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
-Date: Mon, 15 Apr 2013 13:54:13 +0200
-Subject: [PATCH 2/3] bsaes-armv7.pl: Big endian fixes
-
-Updated the code to be (more) endian neutral, however, as it is
-still untested on big endian, it is only enabled for little endian
-at the moment.
----
- crypto/aes/asm/bsaes-armv7.pl | 121 ++++++++++++++----------------------------
- crypto/evp/e_aes.c | 3 -
- 2 files changed, 45 insertions(+), 79 deletions(-)
-
---- a/crypto/aes/asm/bsaes-armv7.pl
-+++ b/crypto/aes/asm/bsaes-armv7.pl
-@@ -1196,8 +1196,9 @@ bsaes_cbc_encrypt:
-
- .Lcbc_dec_done:
- vmov.i32 q0, #0
-+ vmov.i32 q1, #0
- .Lcbc_dec_bzero: @ wipe key schedule [if any]
-- vst1.8 {q0}, [$keysched]!
-+ vstm $keysched!, {q0-q1}
- teq $keysched, r10
- bne .Lcbc_dec_bzero
-
-@@ -1215,13 +1216,9 @@ my ($inp,$out,$len,$key,$keysched,$const
-
- $code.=<<___;
- .align 5
-- @ byte-swap constants
--.LSWP:
-- .quad 0x0405060703020100
--.LSWPUPM0SR:
-- .quad 0x0a0d02060c03070b, 0x0004080f05090e01
-+
- .LADD:
-- .quad 0x0807060504030201
-+ .long 1,2,3,4,5,6,7,0
-
- .extern AES_encrypt
- .global bsaes_ctr32_encrypt_blocks
-@@ -1233,7 +1230,7 @@ bsaes_ctr32_encrypt_blocks:
- stmdb sp!, {r4-r8, r10-r11, lr}
- vstmdb sp!, {d8-d15} @ ABI specification says so
- ldr $ctr, [sp, #0x60] @ ctr is 1st arg on the stack
-- sub sp, #0x10 @ scratch space to carry over the ctr
-+ sub sp, #0x20 @ scratch space to carry over the ctr
- mov r10, sp @ save sp
-
- @ allocate the key schedule on the stack
-@@ -1249,92 +1246,61 @@ bsaes_ctr32_encrypt_blocks:
- veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
- vstmia r12, {@XMM[7]} @ save last round key
-
-- vldm $ctr, {@XMM[0]} @ load counter
-- mov $ctr, r10
--
-- vldm $keysched, {@XMM[4]} @ load round0 key
--
-- vldr `&Dlo(@XMM[8])`, .LSWP @ byte swap upper part
-- vtbl.8 `&Dhi(@XMM[0])`, {`&Dhi(@XMM[0])`}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[4])`, {`&Dhi(@XMM[4])`}, `&Dlo(@XMM[8])`
--
-- vstm $keysched, {@XMM[4]} @ save adjusted round0 key
--
-+ @ copy the invariant bits of the ctr
-+ ldm $ctr, {r4-r5, r11}
-+ mov r12, r11
-+ stm r10!, {r4-r5}
-+ stm r10!, {r4-r5}
-+ stm r10!, {r11-r12}
-+ stm r10!, {r11-r12}
-+ sub r10, #0x20
-+
-+ ldr r11, [$ctr, #0xc] @ get LSW of BE ctr
-+#ifdef __ARMEL__
-+ rev r11, r11
-+#endif
- b .Lctr_enc_loop
-
- .align 5
- .Lctr_enc_loop:
-
-- @ set up the addition constants
-- vldr `&Dlo(@XMM[11])`, .LADD
-- vmov.i8 `&Dhi(@XMM[11])`, #0
-- vmov.i8 @XMM[12], #0
-- vzip.8 `&Dlo(@XMM[11])`, `&Dhi(@XMM[11])`
-- vzip.16 @XMM[11], @XMM[12]
--
- @ get 8 counter values in regs and do the add
-- vdup.32 @XMM[4], `&Dhi(@XMM[0])`[1]
-- vdup.32 @XMM[9], `&Dhi(@XMM[0])`[1]
-+ adr r4, .LADD
-+ vdup.32 @XMM[4], r11
-+ vldm r4, {@XMM[11]-@XMM[12]}
-+ vmov @XMM[0], @XMM[4]
- vadd.u32 @XMM[4], @XMM[11]
-- vadd.u32 @XMM[9], @XMM[12]
-- vdup.32 @XMM[2], `&Dhi(@XMM[0])`[0]
-- vdup.32 @XMM[6], `&Dhi(@XMM[0])`[0]
-+ vadd.u32 @XMM[0], @XMM[12]
-+#ifdef __ARMEL__
-+ vrev32.8 @XMM[4], @XMM[4]
-+ vrev32.8 @XMM[0], @XMM[0]
-+#endif
-+ vld1.8 {@XMM[1]-@XMM[2]}, [r10]
-+ vld1.8 {@XMM[5]-@XMM[6]}, [r10]
- vzip.32 @XMM[2], @XMM[4]
-- vzip.32 @XMM[6], @XMM[9]
--
-- vmov `&Dhi(@XMM[1])`, `&Dlo(@XMM[0])`
-- vmov `&Dlo(@XMM[1])`, `&Dlo(@XMM[0])`
- vmov @XMM[3], @XMM[1]
-- vmov @XMM[5], @XMM[1]
-- vmov @XMM[7], @XMM[1]
-+ vzip.32 @XMM[6], @XMM[0]
-+ vmov @XMM[7], @XMM[5]
-
- vswp `&Dhi(@XMM[1])`, `&Dlo(@XMM[2])`
- vswp `&Dhi(@XMM[3])`, `&Dlo(@XMM[4])`
- vswp `&Dhi(@XMM[5])`, `&Dlo(@XMM[6])`
-- vswp `&Dhi(@XMM[7])`, `&Dlo(@XMM[9])`
--
-- vstm $ctr, {@XMM[9]} @ save counter
--
-- @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
-- @ to flip byte order in 32-bit counter
-+ vswp `&Dhi(@XMM[7])`, `&Dlo(@XMM[0])`
-
-- adr r11, .LSWPUPM0SR
-- vld1.8 {@XMM[8]}, [r11]! @ .LSWPUPM0SR
-- adrl $const,.LSR
-- vld1.8 {@XMM[9]}, [$keysched] @ load round0 key
-+ mov r4, $keysched @ pass round key
- mov r5, $rounds @ pass rounds
-- add r4, $keysched, #0x10 @ pass next round key
-- veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
-- veor @XMM[11], @XMM[1], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-- veor @XMM[12], @XMM[2], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-- veor @XMM[13], @XMM[3], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-- veor @XMM[14], @XMM[4], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-- veor @XMM[15], @XMM[5], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-- veor @XMM[10], @XMM[6], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-- veor @XMM[11], @XMM[7], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-- vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-
-- bl _bsaes_encrypt8_bitslice
-+ bl _bsaes_encrypt8
-
- subs $len, #8
-+ add r11, #8
-+
- blo .Lctr_enc_loop_done
-
-- vldmia $inp!, {@XMM[8]-@XMM[15]} @ load input
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
-+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
-
- veor @XMM[8], @XMM[0]
- veor @XMM[1], @XMM[9]
-@@ -1353,8 +1319,6 @@ bsaes_ctr32_encrypt_blocks:
- vst1.8 {@XMM[2]}, [$out]!
- vst1.8 {@XMM[5]}, [$out]!
-
-- vldm $ctr, {@XMM[0]} @ load counter
--
- bne .Lctr_enc_loop
- b .Lctr_enc_done
-
-@@ -1393,12 +1357,13 @@ bsaes_ctr32_encrypt_blocks:
-
- .Lctr_enc_done:
- vmov.i32 q0, #0
-+ vmov.i32 q1, #0
- .Lctr_enc_bzero: @ wipe key schedule [if any]
-- vst1.8 {q0}, [$keysched]!
-+ vstm $keysched!, {q0-q1}
- teq $keysched, r10
- bne .Lctr_enc_bzero
-
-- add sp, r10, #0x10
-+ add sp, r10, #0x20
- vldmia sp!, {d8-d15}
- ldmia sp!, {r4-r8, r10-r11, pc}
-
---- a/crypto/evp/e_aes.c
-+++ b/crypto/evp/e_aes.c
-@@ -485,7 +485,8 @@ const EVP_CIPHER *EVP_aes_##keylen##_##m
-
- #endif
-
--#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm))
-+#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm)) \
-+ && defined(__ARMEL__)
- #include "arm_arch.h"
- #define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
- #endif
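[Editor's note, not part of the commit: the __ARMEL__ paths in the removed patch above (rev r11 / vrev32.8) exist because the counter block keeps its low word big endian, so a little-endian core must byte-swap before doing arithmetic and swap back afterwards. The sketch below only illustrates that pattern; swap32() and ctr_word_for_block() are names invented for this example.]

    #include <stdint.h>

    static uint32_t swap32(uint32_t v)
    {
        return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
               ((v << 8) & 0x00ff0000u) | (v << 24);
    }

    /* Counter word for block number "offset" past the big-endian word ctr_be. */
    static uint32_t ctr_word_for_block(uint32_t ctr_be, uint32_t offset)
    {
    #ifdef __ARMEL__                    /* little-endian ARM, as in the patch */
        return swap32(swap32(ctr_be) + offset);
    #else                               /* big-endian host: already host order */
        return ctr_be + offset;
    #endif
    }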
diff --git a/debian/patches/old/0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch b/debian/patches/old/0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch
deleted file mode 100644
index 4d2235d..0000000
--- a/debian/patches/old/0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch
+++ /dev/null
@@ -1,24 +0,0 @@
-From a2f9535dd2b0d2e230f978aa3eaf103f5224b6d5 Mon Sep 17 00:00:00 2001
-From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
-Date: Mon, 15 Apr 2013 14:32:59 +0200
-Subject: [PATCH 3/3] bsaes-armv7.pl: avoid bit-sliced AES/CBC for block sizes
- < 1k
-
-Avoid using bit sliced AES for CBC decryption when the block size
-is smaller than 1k. The reason is that the overhead of creating the
-key schedule is larger than the obtained speedup on Cortex-A9.
----
- crypto/aes/asm/bsaes-armv7.pl | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/crypto/aes/asm/bsaes-armv7.pl
-+++ b/crypto/aes/asm/bsaes-armv7.pl
-@@ -985,7 +985,7 @@ $code.=<<___;
- .global bsaes_cbc_encrypt
- .type bsaes_cbc_encrypt,%function
- bsaes_cbc_encrypt:
-- cmp $len, #128
-+ cmp $len, #1024
- blo AES_cbc_encrypt
-
- @ it is up to the caller to make sure we are called with enc == 0
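[Editor's note, not part of the commit: the removed patch above raises the fallback threshold from 128 to 1024 bytes because, per its commit message, building the bit-sliced key schedule costs more than the NEON speedup recovers on short buffers (measured on Cortex-A9). The C below is only a sketch of the resulting dispatch; aes_cbc_encrypt_scalar() and bsaes_cbc_encrypt_neon() are hypothetical names for the generic and bit-sliced code paths.]

    #include <stddef.h>
    #include <stdint.h>

    /* hypothetical stand-ins for the generic and bit-sliced implementations */
    void aes_cbc_encrypt_scalar(const uint8_t *in, uint8_t *out, size_t len,
                                const void *key, uint8_t ivec[16], int enc);
    void bsaes_cbc_encrypt_neon(const uint8_t *in, uint8_t *out, size_t len,
                                const void *key, uint8_t ivec[16], int enc);

    #define BSAES_CBC_THRESHOLD 1024   /* below this, key-schedule setup costs
                                          more than bit-slicing saves */

    void cbc_decrypt_dispatch(const uint8_t *in, uint8_t *out, size_t len,
                              const void *key, uint8_t ivec[16])
    {
        if (len < BSAES_CBC_THRESHOLD) {
            aes_cbc_encrypt_scalar(in, out, len, key, ivec, /*enc=*/0);
            return;
        }
        bsaes_cbc_encrypt_neon(in, out, len, key, ivec, /*enc=*/0);
    }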
diff --git a/debian/patches/pic.patch b/debian/patches/pic.patch
index ed95be4..bf63614 100644
--- a/debian/patches/pic.patch
+++ b/debian/patches/pic.patch
@@ -1,9 +1,9 @@
---
crypto/des/asm/desboth.pl | 17 ++++++++++++++---
crypto/perlasm/cbc.pl | 24 ++++++++++++++++++++----
- crypto/perlasm/x86gas.pl | 11 +++++++++++
+ crypto/perlasm/x86gas.pl | 16 ++++++++++++++++
crypto/x86cpuid.pl | 10 +++++-----
- 4 files changed, 50 insertions(+), 12 deletions(-)
+ 4 files changed, 55 insertions(+), 12 deletions(-)
--- a/crypto/des/asm/desboth.pl
+++ b/crypto/des/asm/desboth.pl
@@ -108,11 +108,13 @@
}
push(@out,$initseg) if ($initseg);
}
-@@ -218,7 +219,17 @@ ___
+@@ -218,8 +219,23 @@ ___
elsif ($::elf)
{ $initseg.=<<___;
.section .init
-+#ifdef OPENSSL_PIC
++___
++ if ($::pic)
++ { $initseg.=<<___;
+ pushl %ebx
+ call .pic_point0
+.pic_point0:
@@ -120,12 +122,16 @@
+ addl \$_GLOBAL_OFFSET_TABLE_+[.-.pic_point0],%ebx
+ call $f\@PLT
+ popl %ebx
-+#else
++___
++ }
++ else
++ { $initseg.=<<___;
call $f
-+#endif
___
++ }
}
elsif ($::coff)
+ { $initseg.=<<___; # applies to both Cygwin and Mingw
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -8,6 +8,8 @@ require "x86asm.pl";
diff --git a/debian/patches/series b/debian/patches/series
index a2845d0..040e3df 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -35,7 +35,9 @@ default_bits.patch
perlpath-quilt.patch
tls12_workarounds.patch
ubuntu_deb676533_arm_asm.patch
+arm64-support
CVE-2013-0166.patch
+# Disabled for now, as causes regression on AES-NI
CVE-2013-0169.patch
fix_key_decoding_deadlock.patch
diff --git a/debian/patches/ubuntu_deb676533_arm_asm.patch b/debian/patches/ubuntu_deb676533_arm_asm.patch
index 9325394..a484bec 100644
--- a/debian/patches/ubuntu_deb676533_arm_asm.patch
+++ b/debian/patches/ubuntu_deb676533_arm_asm.patch
@@ -10,7 +10,7 @@ Bug-Ubuntu: https://bugs.launchpad.net/ubuntu/+source/openssl/+bug/1083498
--- a/Configure
+++ b/Configure
@@ -346,9 +346,8 @@ my %table=(
- "debian-alpha","gcc:-DTERMIO $debian_cflag::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-alpha","gcc:-DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debian-alpha-ev4","gcc:-DTERMIO ${debian_cflags} -mcpu=ev4::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debian-alpha-ev5","gcc:-DTERMIO ${debian_cflags} -mcpu=ev5::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"debian-armeb","gcc:-DB_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",