author     Fathi Boudra <fathi.boudra@linaro.org>  2013-05-26 12:15:05 +0300
committer  Fathi Boudra <fathi.boudra@linaro.org>  2013-06-29 15:09:45 +0300
commit     a7dbcd9ed1d4d9cf3c5e327d4daed85e393303a1 (patch)
tree       84e2fda53d6fd06ff21b3be1b211a44f917155ec
parent     bb612d6a59521b30e8dbe7b91cd696e2980cbf6b (diff)
Imported Debian patch 1.0.1c-4ubuntu9~linaro1 (HEAD, master)
-rw-r--r--  debian/changelog | 112
-rw-r--r--  debian/control | 4
-rw-r--r--  debian/libssl1.0.0.postinst | 2
-rw-r--r--  debian/patches/arm64-support | 19
-rw-r--r--  debian/patches/debian-targets.patch | 2
-rw-r--r--  debian/patches/old/0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch | 1518
-rw-r--r--  debian/patches/old/0002-bsaes-armv7.pl-Big-endian-fixes.patch | 216
-rw-r--r--  debian/patches/old/0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch | 24
-rw-r--r--  debian/patches/pic.patch | 18
-rw-r--r--  debian/patches/series | 2
-rw-r--r--  debian/patches/ubuntu_deb676533_arm_asm.patch | 2
11 files changed, 122 insertions(+), 1797 deletions(-)
diff --git a/debian/changelog b/debian/changelog
index 9c76aae..abb81fa 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,6 +1,6 @@
-openssl (1.0.1c-3ubuntu2.5~linaro2) quantal; urgency=low
+openssl (1.0.1c-4ubuntu9~linaro1) raring; urgency=low
- * Update patches - merged upstream ():
+ * Update patches - merged upstream:
- 0000-bsaes-armv7.patch
- 0000-crypto-modes-modes_lcl.h-let-STRICT_ALIGNMENT-be-on-.patch
- 0001-bsaes-armv7.pl-take-it-into-build-loop.patch
@@ -8,48 +8,40 @@ openssl (1.0.1c-3ubuntu2.5~linaro2) quantal; urgency=low
- 0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch
* Update debian/control: build on armhf architecture only.
- -- Fathi Boudra <fathi.boudra@linaro.org> Wed, 24 Apr 2013 12:45:04 +0300
+ -- Fathi Boudra <fathi.boudra@linaro.org> Sun, 26 May 2013 12:15:05 +0300
-openssl (1.0.1c-3ubuntu2.5~linaro1) quantal; urgency=low
+openssl (1.0.1c-4ubuntu8) raring; urgency=low
- * Add patches from Ard Biesheuvel:
- - 0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch
- - 0002-bsaes-armv7.pl-Big-endian-fixes.patch
- - 0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch
+ * SECURITY UPDATE: "Lucky Thirteen" timing side-channel TLS attack
+ - debian/patches/CVE-2013-0169.patch: re-enabled patch and added extra
+ commit from upstream to fix regression.
+ - CVE-2013-0169
- -- Fathi Boudra <fathi.boudra@linaro.org> Tue, 16 Apr 2013 12:43:11 +0300
+ -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Tue, 19 Mar 2013 14:33:14 -0400
-openssl (1.0.1c-3ubuntu2.4) quantal; urgency=low
+openssl (1.0.1c-4ubuntu7) raring; urgency=low
- [ Dmitrijs Ledkovs ]
- * Enable arm assembly code. (LP: #1083498) (Closes: #676533)
* Enable optimized 64bit elliptic curve code contributed by Google. (LP: #1018522)
- [ Marc Deslauriers ]
- * debian/patches/fix_key_decoding_deadlock.patch: Fix possible deadlock
- when decoding public keys. (LP: #1066032)
-
- -- Dmitrijs Ledkovs <dmitrij.ledkov@ubuntu.com> Thu, 04 Apr 2013 12:15:11 +0100
+ -- Dmitrijs Ledkovs <dmitrij.ledkov@ubuntu.com> Thu, 07 Mar 2013 15:36:16 +0000
-openssl (1.0.1c-3ubuntu2.3) quantal-security; urgency=low
+openssl (1.0.1c-4ubuntu6) raring; urgency=low
- * SECURITY UPDATE: "Lucky Thirteen" timing side-channel TLS attack
- - debian/patches/CVE-2013-0169.patch: re-enabled patch and added extra
- commit from upstream to fix regression.
- - CVE-2013-0169
+ * debian/patches/fix_key_decoding_deadlock.patch: Fix possible deadlock
+ when decoding public keys. (LP: #1066032)
- -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Tue, 19 Mar 2013 14:35:24 -0400
+ -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Wed, 06 Mar 2013 08:11:19 -0500
-openssl (1.0.1c-3ubuntu2.2) quantal-security; urgency=low
+openssl (1.0.1c-4ubuntu5) raring; urgency=low
* REGRESSION FIX: decryption errors on AES-NI hardware (LP: #1134873,
LP: #1133333)
- debian/patches/CVE-2013-0169.patch: disabled for now until fix is
available from upstream.
- -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Thu, 28 Feb 2013 10:56:42 -0500
+ -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Thu, 28 Feb 2013 11:01:29 -0500
-openssl (1.0.1c-3ubuntu2.1) quantal-security; urgency=low
+openssl (1.0.1c-4ubuntu4) raring; urgency=low
* SECURITY UPDATE: denial of service via invalid OCSP key
- debian/patches/CVE-2013-0166.patch: properly handle NULL key in
@@ -62,7 +54,73 @@ openssl (1.0.1c-3ubuntu2.1) quantal-security; urgency=low
- Fix included in CVE-2013-0169 patch
- CVE-2012-2686
- -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Mon, 18 Feb 2013 13:13:42 -0500
+ -- Marc Deslauriers <marc.deslauriers@ubuntu.com> Tue, 19 Feb 2013 13:25:24 -0500
+
+openssl (1.0.1c-4ubuntu3) raring; urgency=low
+
+ * Add basic arm64 support (no assembler) (LP: #1102107)
+
+ -- Wookey <wookey@wookware.org> Sun, 20 Jan 2013 17:30:15 +0000
+
+openssl (1.0.1c-4ubuntu2) raring; urgency=low
+
+ * Enable arm assembly code. (LP: #1083498) (Closes: #676533)
+
+ -- Dmitrijs Ledkovs <dmitrij.ledkov@ubuntu.com> Wed, 28 Nov 2012 00:08:45 +0000
+
+openssl (1.0.1c-4ubuntu1) raring; urgency=low
+
+ * Resynchronise with Debian (LP: #1077228). Remaining changes:
+ - debian/libssl1.0.0.postinst:
+ + Display a system restart required notification on libssl1.0.0
+ upgrade on servers.
+ + Use a different priority for libssl1.0.0/restart-services depending
+ on whether a desktop, or server dist-upgrade is being performed.
+ - debian/{libssl1.0.0-udeb.dirs, control, rules}: Create
+ libssl1.0.0-udeb, for the benefit of wget-udeb (no wget-udeb package
+ in Debian).
+ - debian/{libcrypto1.0.0-udeb.dirs, libssl1.0.0.dirs, libssl1.0.0.files,
+ rules}: Move runtime libraries to /lib, for the benefit of
+ wpasupplicant.
+ - debian/patches/perlpath-quilt.patch: Don't change perl #! paths under
+ .pc.
+ - debian/rules:
+ + Don't run 'make test' when cross-building.
+ + Use host compiler when cross-building. Patch from Neil Williams.
+ + Don't build for processors no longer supported: i586 (on i386)
+ + Fix Makefile to properly clean up libs/ dirs in clean target.
+ + Replace duplicate files in the doc directory with symlinks.
+ - Unapply patch c_rehash-multi and comment it out in the series as it
+ breaks parsing of certificates with CRLF line endings and other cases
+ (see Debian #642314 for discussion), it also changes the semantics of
+ c_rehash directories by requiring applications to parse hash link
+ targets as files containing potentially *multiple* certificates rather
+ than exactly one.
+ - Bump version passed to dh_makeshlibs to 1.0.1 for new symbols.
+ - debian/patches/tls12_workarounds.patch: Workaround large client hello
+ issues when TLS 1.1 and lower is in use
+ - debian/control: Mark Debian Vcs-* as XS-Debian-Vcs-*
+ * Dropped changes:
+ - Drop openssl-doc in favour of the libssl-doc package introduced by
+ Debian. Add Conflicts/Replaces until the next LTS release.
+ + Drop the Conflicts/Replaces because 12.04 LTS was 'the next LTS
+ release'
+
+ -- Tyler Hicks <tyhicks@canonical.com> Fri, 09 Nov 2012 14:49:13 -0800
+
+openssl (1.0.1c-4) unstable; urgency=low
+
+ * Fix the configure rules for alpha (Closes: #672710)
+ * Switch the postinst to sh again, there never was a reason to
+ switch it to bash (Closes: #676398)
+ * Fix pic.patch to not use #ifdef in x86cpuid.s, only .S files are
+ preprocessed. We generate the file again for pic anyway.
+ (Closes: #677468)
+ * Drop Breaks against openssh as it was only for upgrades
+ between versions that were only in testing/unstable.
+ (Closes: #668600)
+
+ -- Kurt Roeckx <kurt@roeckx.be> Tue, 17 Jul 2012 11:49:19 +0200
openssl (1.0.1c-3ubuntu2) quantal; urgency=low
diff --git a/debian/control b/debian/control
index 8868169..b2c0212 100644
--- a/debian/control
+++ b/debian/control
@@ -34,7 +34,6 @@ Architecture: armhf
Multi-Arch: same
Pre-Depends: ${misc:Pre-Depends}
Depends: ${shlibs:Depends}, ${misc:Depends}
-Breaks: openssh-client (<< 1:5.9p1-4), openssh-server (<< 1:5.9p1-4)
Description: SSL shared libraries
libssl and libcrypto shared libraries needed by programs like
apache-ssl, telnet-ssl and openssh.
@@ -79,8 +78,7 @@ Package: libssl-doc
Section: doc
Priority: optional
Architecture: all
-Replaces: libssl-dev (<< 1.0.0), openssl-doc
-Conflicts: openssl-doc
+Replaces: libssl-dev (<< 1.0.0)
Breaks: libssl-dev (<< 1.0.0)
Depends: ${shlibs:Depends}, ${perl:Depends}, ${misc:Depends}
Description: SSL development documentation documentation
diff --git a/debian/libssl1.0.0.postinst b/debian/libssl1.0.0.postinst
index 4e8a17c..57ae577 100644
--- a/debian/libssl1.0.0.postinst
+++ b/debian/libssl1.0.0.postinst
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
. /usr/share/debconf/confmodule
diff --git a/debian/patches/arm64-support b/debian/patches/arm64-support
new file mode 100644
index 0000000..a2f6cd6
--- /dev/null
+++ b/debian/patches/arm64-support
@@ -0,0 +1,19 @@
+Description: Add arm64 support
+ Add 'debian-arm64' to configure so it at least tries to build
+Author: Wookey <wookey@wookware.org>, Riku Voipio <riku.voipio@linaro.org>
+Last-Update: <2013-01-20>
+
+---
+ Configure | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/Configure
++++ b/Configure
+@@ -346,6 +346,7 @@ my %table=(
+ "debian-alpha","gcc:-DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-alpha-ev4","gcc:-DTERMIO ${debian_cflags} -mcpu=ev4::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-alpha-ev5","gcc:-DTERMIO ${debian_cflags} -mcpu=ev5::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
++"debian-arm64","gcc:-DL_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-armel","gcc:-DL_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-armhf","gcc:-DL_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-amd64", "gcc:-m64 -DL_ENDIAN -DTERMIO ${debian_cflags} -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::",
diff --git a/debian/patches/debian-targets.patch b/debian/patches/debian-targets.patch
index 0727967..1a4da29 100644
--- a/debian/patches/debian-targets.patch
+++ b/debian/patches/debian-targets.patch
@@ -20,7 +20,7 @@
"tru64-alpha-cc", "cc:-std1 -tune host -fast -readonly_strings::-pthread:::SIXTY_FOUR_BIT_LONG RC4_CHUNK:${alpha_asm}:dlfcn:alpha-osf1-shared::-msym:.so",
+# Debian GNU/* (various architectures)
-+"debian-alpha","gcc:-DTERMIO $debian_cflag::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
++"debian-alpha","gcc:-DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debian-alpha-ev4","gcc:-DTERMIO ${debian_cflags} -mcpu=ev4::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debian-alpha-ev5","gcc:-DTERMIO ${debian_cflags} -mcpu=ev5::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debian-armeb","gcc:-DB_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
diff --git a/debian/patches/old/0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch b/debian/patches/old/0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch
deleted file mode 100644
index 0ebaba8..0000000
--- a/debian/patches/old/0001-Added-CTR-and-CBC-decrypt-hooks-for-NEON-bit-sliced-.patch
+++ /dev/null
@@ -1,1518 +0,0 @@
-From 5e51c6b42b8b7d773ab45dcabec1189a1451bebd Mon Sep 17 00:00:00 2001
-From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
-Date: Thu, 11 Apr 2013 17:33:36 +0200
-Subject: [PATCH 1/3] Added CTR and CBC-decrypt hooks for NEON bit sliced AES
-
-The actual bit sliced AES transform was already there, only the
-hooks were missing. These are based heavily on the existing hooks
-for x86_64.
----
- Configure | 2
- crypto/aes/Makefile | 3
- crypto/aes/asm/bsaes-armv7.pl | 1447 ++++++++++++++++++++++++++++++++++++++++++
- crypto/evp/e_aes.c | 8
- 4 files changed, 1458 insertions(+), 2 deletions(-)
-
---- a/Configure
-+++ b/Configure
-@@ -140,7 +140,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alp
- my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
- my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
- my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:";
--my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
-+my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o bsaes-armv7.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
- my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
- my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
- my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
---- a/crypto/aes/Makefile
-+++ b/crypto/aes/Makefile
-@@ -79,8 +79,9 @@ aes-mips.S: asm/aes-mips.pl
- $(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@
-
- # GNU make "catch all"
--aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@
-+%.S: asm/%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@
- aes-armv4.o: aes-armv4.S
-+bsaes-armv7.o: bsaes-armv7.S
-
- files:
- $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
---- /dev/null
-+++ b/crypto/aes/asm/bsaes-armv7.pl
-@@ -0,0 +1,1447 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-+# project. The module is, however, dual licensed under OpenSSL and
-+# CRYPTOGAMS licenses depending on where you obtain it. For further
-+# details see http://www.openssl.org/~appro/cryptogams/.
-+# ====================================================================
-+
-+# Bit-sliced AES for ARM NEON
-+#
-+# February 2012.
-+#
-+# This implementation is direct adaptation of bsaes-x86_64 module for
-+# ARM NEON. Except that this module is endian-neutral [in sense that
-+# it can be compiled for either endianness] by courtesy of vld1.8's
-+# neutrality. Initial version doesn't implement interface to OpenSSL,
-+# only low-level primitives and unsupported entry points, just enough
-+# to collect performance results, which for Cortex-A8 core are:
-+#
-+# encrypt 19.5 cycles per byte processed with 128-bit key
-+# decrypt 24.0 cycles per byte processed with 128-bit key
-+# key conv. 440 cycles per 128-bit key/0.18 of 8x block
-+#
-+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 22.6,
-+# which is [much] worse than anticipated (for further details see
-+# http://www.openssl.org/~appro/Snapdragon-S4.html).
-+#
-+# When comparing to x86_64 results keep in mind that NEON unit is
-+# [mostly] single-issue and thus can't [fully] benefit from
-+# instruction-level parallelism. And when comparing to aes-armv4
-+# results keep in mind key schedule conversion overhead (see
-+# bsaes-x86_64.pl for further details)...
-+#
-+# <appro@openssl.org>
-+
-+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-+open STDOUT,">$output";
-+
-+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
-+my @XMM=map("q$_",(0..15));
-+
-+{
-+my ($key,$rounds,$const)=("r4","r5","r6");
-+
-+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-+
-+sub Sbox {
-+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
-+my @b=@_[0..7];
-+my @t=@_[8..11];
-+my @s=@_[12..15];
-+ &InBasisChange (@b);
-+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
-+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
-+}
-+
-+sub InBasisChange {
-+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
-+my @b=@_[0..7];
-+$code.=<<___;
-+ veor @b[2], @b[2], @b[1]
-+ veor @b[5], @b[5], @b[6]
-+ veor @b[3], @b[3], @b[0]
-+ veor @b[6], @b[6], @b[2]
-+ veor @b[5], @b[5], @b[0]
-+
-+ veor @b[6], @b[6], @b[3]
-+ veor @b[3], @b[3], @b[7]
-+ veor @b[7], @b[7], @b[5]
-+ veor @b[3], @b[3], @b[4]
-+ veor @b[4], @b[4], @b[5]
-+
-+ veor @b[2], @b[2], @b[7]
-+ veor @b[3], @b[3], @b[1]
-+ veor @b[1], @b[1], @b[5]
-+___
-+}
-+
-+sub OutBasisChange {
-+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
-+my @b=@_[0..7];
-+$code.=<<___;
-+ veor @b[0], @b[0], @b[6]
-+ veor @b[1], @b[1], @b[4]
-+ veor @b[4], @b[4], @b[6]
-+ veor @b[2], @b[2], @b[0]
-+ veor @b[6], @b[6], @b[1]
-+
-+ veor @b[1], @b[1], @b[5]
-+ veor @b[5], @b[5], @b[3]
-+ veor @b[3], @b[3], @b[7]
-+ veor @b[7], @b[7], @b[5]
-+ veor @b[2], @b[2], @b[5]
-+
-+ veor @b[4], @b[4], @b[7]
-+___
-+}
-+
-+sub InvSbox {
-+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
-+my @b=@_[0..7];
-+my @t=@_[8..11];
-+my @s=@_[12..15];
-+ &InvInBasisChange (@b);
-+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
-+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
-+}
-+
-+sub InvInBasisChange { # OutBasisChange in reverse (with twist)
-+my @b=@_[5,1,2,6,3,7,0,4];
-+$code.=<<___
-+ veor @b[1], @b[1], @b[7]
-+ veor @b[4], @b[4], @b[7]
-+
-+ veor @b[7], @b[7], @b[5]
-+ veor @b[1], @b[1], @b[3]
-+ veor @b[2], @b[2], @b[5]
-+ veor @b[3], @b[3], @b[7]
-+
-+ veor @b[6], @b[6], @b[1]
-+ veor @b[2], @b[2], @b[0]
-+ veor @b[5], @b[5], @b[3]
-+ veor @b[4], @b[4], @b[6]
-+ veor @b[0], @b[0], @b[6]
-+ veor @b[1], @b[1], @b[4]
-+___
-+}
-+
-+sub InvOutBasisChange { # InBasisChange in reverse
-+my @b=@_[2,5,7,3,6,1,0,4];
-+$code.=<<___;
-+ veor @b[1], @b[1], @b[5]
-+ veor @b[2], @b[2], @b[7]
-+
-+ veor @b[3], @b[3], @b[1]
-+ veor @b[4], @b[4], @b[5]
-+ veor @b[7], @b[7], @b[5]
-+ veor @b[3], @b[3], @b[4]
-+ veor @b[5], @b[5], @b[0]
-+ veor @b[3], @b[3], @b[7]
-+ veor @b[6], @b[6], @b[2]
-+ veor @b[2], @b[2], @b[1]
-+ veor @b[6], @b[6], @b[3]
-+
-+ veor @b[3], @b[3], @b[0]
-+ veor @b[5], @b[5], @b[6]
-+___
-+}
-+
-+sub Mul_GF4 {
-+#;*************************************************************
-+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
-+#;*************************************************************
-+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
-+$code.=<<___;
-+ veor $t0, $y0, $y1
-+ vand $t0, $t0, $x0
-+ veor $x0, $x0, $x1
-+ vand $t1, $x1, $y0
-+ vand $x0, $x0, $y1
-+ veor $x1, $t1, $t0
-+ veor $x0, $x0, $t1
-+___
-+}
-+
-+sub Mul_GF4_N { # not used, see next subroutine
-+# multiply and scale by N
-+my ($x0,$x1,$y0,$y1,$t0)=@_;
-+$code.=<<___;
-+ veor $t0, $y0, $y1
-+ vand $t0, $t0, $x0
-+ veor $x0, $x0, $x1
-+ vand $x1, $x1, $y0
-+ vand $x0, $x0, $y1
-+ veor $x1, $x1, $x0
-+ veor $x0, $x0, $t0
-+___
-+}
-+
-+sub Mul_GF4_N_GF4 {
-+# interleaved Mul_GF4_N and Mul_GF4
-+my ($x0,$x1,$y0,$y1,$t0,
-+ $x2,$x3,$y2,$y3,$t1)=@_;
-+$code.=<<___;
-+ veor $t0, $y0, $y1
-+ veor $t1, $y2, $y3
-+ vand $t0, $t0, $x0
-+ vand $t1, $t1, $x2
-+ veor $x0, $x0, $x1
-+ veor $x2, $x2, $x3
-+ vand $x1, $x1, $y0
-+ vand $x3, $x3, $y2
-+ vand $x0, $x0, $y1
-+ vand $x2, $x2, $y3
-+ veor $x1, $x1, $x0
-+ veor $x2, $x2, $x3
-+ veor $x0, $x0, $t0
-+ veor $x3, $x3, $t1
-+___
-+}
-+sub Mul_GF16_2 {
-+my @x=@_[0..7];
-+my @y=@_[8..11];
-+my @t=@_[12..15];
-+$code.=<<___;
-+ veor @t[0], @x[0], @x[2]
-+ veor @t[1], @x[1], @x[3]
-+___
-+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
-+$code.=<<___;
-+ veor @y[0], @y[0], @y[2]
-+ veor @y[1], @y[1], @y[3]
-+___
-+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
-+ @x[2], @x[3], @y[2], @y[3], @t[2]);
-+$code.=<<___;
-+ veor @x[0], @x[0], @t[0]
-+ veor @x[2], @x[2], @t[0]
-+ veor @x[1], @x[1], @t[1]
-+ veor @x[3], @x[3], @t[1]
-+
-+ veor @t[0], @x[4], @x[6]
-+ veor @t[1], @x[5], @x[7]
-+___
-+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
-+ @x[6], @x[7], @y[2], @y[3], @t[2]);
-+$code.=<<___;
-+ veor @y[0], @y[0], @y[2]
-+ veor @y[1], @y[1], @y[3]
-+___
-+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
-+$code.=<<___;
-+ veor @x[4], @x[4], @t[0]
-+ veor @x[6], @x[6], @t[0]
-+ veor @x[5], @x[5], @t[1]
-+ veor @x[7], @x[7], @t[1]
-+___
-+}
-+sub Inv_GF256 {
-+#;********************************************************************
-+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
-+#;********************************************************************
-+my @x=@_[0..7];
-+my @t=@_[8..11];
-+my @s=@_[12..15];
-+# direct optimizations from hardware
-+$code.=<<___;
-+ veor @t[3], @x[4], @x[6]
-+ veor @t[2], @x[5], @x[7]
-+ veor @t[1], @x[1], @x[3]
-+ veor @s[1], @x[7], @x[6]
-+ vmov @t[0], @t[2]
-+ veor @s[0], @x[0], @x[2]
-+
-+ vorr @t[2], @t[2], @t[1]
-+ veor @s[3], @t[3], @t[0]
-+ vand @s[2], @t[3], @s[0]
-+ vorr @t[3], @t[3], @s[0]
-+ veor @s[0], @s[0], @t[1]
-+ vand @t[0], @t[0], @t[1]
-+ veor @t[1], @x[3], @x[2]
-+ vand @s[3], @s[3], @s[0]
-+ vand @s[1], @s[1], @t[1]
-+ veor @t[1], @x[4], @x[5]
-+ veor @s[0], @x[1], @x[0]
-+ veor @t[3], @t[3], @s[1]
-+ veor @t[2], @t[2], @s[1]
-+ vand @s[1], @t[1], @s[0]
-+ vorr @t[1], @t[1], @s[0]
-+ veor @t[3], @t[3], @s[3]
-+ veor @t[0], @t[0], @s[1]
-+ veor @t[2], @t[2], @s[2]
-+ veor @t[1], @t[1], @s[3]
-+ veor @t[0], @t[0], @s[2]
-+ vand @s[0], @x[7], @x[3]
-+ veor @t[1], @t[1], @s[2]
-+ vand @s[1], @x[6], @x[2]
-+ vand @s[2], @x[5], @x[1]
-+ vorr @s[3], @x[4], @x[0]
-+ veor @t[3], @t[3], @s[0]
-+ veor @t[1], @t[1], @s[2]
-+ veor @t[0], @t[0], @s[3]
-+ veor @t[2], @t[2], @s[1]
-+
-+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
-+
-+ @ new smaller inversion
-+
-+ vand @s[2], @t[3], @t[1]
-+ vmov @s[0], @t[0]
-+
-+ veor @s[1], @t[2], @s[2]
-+ veor @s[3], @t[0], @s[2]
-+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
-+
-+ vbsl @s[1], @t[1], @t[0]
-+ vbsl @s[3], @t[3], @t[2]
-+ veor @t[3], @t[3], @t[2]
-+
-+ vbsl @s[0], @s[1], @s[2]
-+ vbsl @t[0], @s[2], @s[1]
-+
-+ vand @s[2], @s[0], @s[3]
-+ veor @t[1], @t[1], @t[0]
-+
-+ veor @s[2], @s[2], @t[3]
-+___
-+# output in s3, s2, s1, t1
-+
-+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
-+
-+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
-+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
-+
-+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
-+}
-+
-+# AES linear components
-+
-+sub ShiftRows {
-+my @x=@_[0..7];
-+my @t=@_[8..11];
-+my $mask=pop;
-+$code.=<<___;
-+ vldmia $key!, {@t[0]-@t[3]}
-+ veor @t[0], @t[0], @x[0]
-+ veor @t[1], @t[1], @x[1]
-+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
-+ vldmia $key!, {@t[0]}
-+ veor @t[2], @t[2], @x[2]
-+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
-+ vldmia $key!, {@t[1]}
-+ veor @t[3], @t[3], @x[3]
-+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
-+ vldmia $key!, {@t[2]}
-+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
-+ vldmia $key!, {@t[3]}
-+ veor @t[0], @t[0], @x[4]
-+ veor @t[1], @t[1], @x[5]
-+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
-+ veor @t[2], @t[2], @x[6]
-+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
-+ veor @t[3], @t[3], @x[7]
-+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
-+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
-+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
-+___
-+}
-+
-+sub MixColumns {
-+# modified to emit output in order suitable for feeding back to aesenc[last]
-+my @x=@_[0..7];
-+my @t=@_[8..15];
-+$code.=<<___;
-+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
-+ vext.8 @t[1], @x[1], @x[1], #12
-+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
-+ vext.8 @t[2], @x[2], @x[2], #12
-+ veor @x[1], @x[1], @t[1]
-+ vext.8 @t[3], @x[3], @x[3], #12
-+ veor @x[2], @x[2], @t[2]
-+ vext.8 @t[4], @x[4], @x[4], #12
-+ veor @x[3], @x[3], @t[3]
-+ vext.8 @t[5], @x[5], @x[5], #12
-+ veor @x[4], @x[4], @t[4]
-+ vext.8 @t[6], @x[6], @x[6], #12
-+ veor @x[5], @x[5], @t[5]
-+ vext.8 @t[7], @x[7], @x[7], #12
-+ veor @x[6], @x[6], @t[6]
-+
-+ veor @t[1], @t[1], @x[0]
-+ veor @x[7], @x[7], @t[7]
-+ vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
-+ veor @t[2], @t[2], @x[1]
-+ veor @t[0], @t[0], @x[7]
-+ veor @t[1], @t[1], @x[7]
-+ vext.8 @x[1], @x[1], @x[1], #8
-+ veor @t[5], @t[5], @x[4]
-+ veor @x[0], @x[0], @t[0]
-+ veor @t[6], @t[6], @x[5]
-+ veor @x[1], @x[1], @t[1]
-+ vext.8 @t[0], @x[4], @x[4], #8
-+ veor @t[4], @t[4], @x[3]
-+ vext.8 @t[1], @x[5], @x[5], #8
-+ veor @t[7], @t[7], @x[6]
-+ vext.8 @x[4], @x[3], @x[3], #8
-+ veor @t[3], @t[3], @x[2]
-+ vext.8 @x[5], @x[7], @x[7], #8
-+ veor @t[4], @t[4], @x[7]
-+ vext.8 @x[3], @x[6], @x[6], #8
-+ veor @t[3], @t[3], @x[7]
-+ vext.8 @x[6], @x[2], @x[2], #8
-+ veor @x[7], @t[1], @t[5]
-+ veor @x[2], @t[0], @t[4]
-+
-+ veor @x[4], @x[4], @t[3]
-+ veor @x[5], @x[5], @t[7]
-+ veor @x[3], @x[3], @t[6]
-+ @ vmov @x[2], @t[0]
-+ veor @x[6], @x[6], @t[2]
-+ @ vmov @x[7], @t[1]
-+___
-+}
-+
-+sub InvMixColumns {
-+my @x=@_[0..7];
-+my @t=@_[8..15];
-+
-+$code.=<<___;
-+ @ multiplication by 0x0e
-+ vext.8 @t[7], @x[7], @x[7], #12
-+ vmov @t[2], @x[2]
-+ veor @x[2], @x[2], @x[5] @ 2 5
-+ veor @x[7], @x[7], @x[5] @ 7 5
-+ vext.8 @t[0], @x[0], @x[0], #12
-+ vmov @t[5], @x[5]
-+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
-+ veor @x[0], @x[0], @x[1] @ 0 1
-+ vext.8 @t[1], @x[1], @x[1], #12
-+ veor @x[1], @x[1], @x[2] @ 1 25
-+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
-+ vext.8 @t[3], @x[3], @x[3], #12
-+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
-+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
-+ veor @x[3], @x[3], @x[7] @ 3 75
-+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
-+ vext.8 @t[6], @x[6], @x[6], #12
-+ vmov @t[4], @x[4]
-+ veor @x[6], @x[6], @x[4] @ 6 4
-+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
-+ veor @x[3], @x[3], @x[7] @ 375 756=36
-+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
-+ veor @x[3], @x[3], @t[2] @ 36 2
-+ vext.8 @t[5], @t[5], @t[5], #12
-+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
-+___
-+ my @y = @x[7,5,0,2,1,3,4,6];
-+$code.=<<___;
-+ @ multiplication by 0x0b
-+ veor @y[1], @y[1], @y[0]
-+ veor @y[0], @y[0], @t[0]
-+ vext.8 @t[2], @t[2], @t[2], #12
-+ veor @y[1], @y[1], @t[1]
-+ veor @y[0], @y[0], @t[5]
-+ vext.8 @t[4], @t[4], @t[4], #12
-+ veor @y[1], @y[1], @t[6]
-+ veor @y[0], @y[0], @t[7]
-+ veor @t[7], @t[7], @t[6] @ clobber t[7]
-+
-+ veor @y[3], @y[3], @t[0]
-+ veor @y[1], @y[1], @y[0]
-+ vext.8 @t[0], @t[0], @t[0], #12
-+ veor @y[2], @y[2], @t[1]
-+ veor @y[4], @y[4], @t[1]
-+ vext.8 @t[1], @t[1], @t[1], #12
-+ veor @y[2], @y[2], @t[2]
-+ veor @y[3], @y[3], @t[2]
-+ veor @y[5], @y[5], @t[2]
-+ veor @y[2], @y[2], @t[7]
-+ vext.8 @t[2], @t[2], @t[2], #12
-+ veor @y[3], @y[3], @t[3]
-+ veor @y[6], @y[6], @t[3]
-+ veor @y[4], @y[4], @t[3]
-+ veor @y[7], @y[7], @t[4]
-+ vext.8 @t[3], @t[3], @t[3], #12
-+ veor @y[5], @y[5], @t[4]
-+ veor @y[7], @y[7], @t[7]
-+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
-+ veor @y[3], @y[3], @t[5]
-+ veor @y[4], @y[4], @t[4]
-+
-+ veor @y[5], @y[5], @t[7]
-+ vext.8 @t[4], @t[4], @t[4], #12
-+ veor @y[6], @y[6], @t[7]
-+ veor @y[4], @y[4], @t[7]
-+
-+ veor @t[7], @t[7], @t[5]
-+ vext.8 @t[5], @t[5], @t[5], #12
-+
-+ @ multiplication by 0x0d
-+ veor @y[4], @y[4], @y[7]
-+ veor @t[7], @t[7], @t[6] @ restore t[7]
-+ veor @y[7], @y[7], @t[4]
-+ vext.8 @t[6], @t[6], @t[6], #12
-+ veor @y[2], @y[2], @t[0]
-+ veor @y[7], @y[7], @t[5]
-+ vext.8 @t[7], @t[7], @t[7], #12
-+ veor @y[2], @y[2], @t[2]
-+
-+ veor @y[3], @y[3], @y[1]
-+ veor @y[1], @y[1], @t[1]
-+ veor @y[0], @y[0], @t[0]
-+ veor @y[3], @y[3], @t[0]
-+ veor @y[1], @y[1], @t[5]
-+ veor @y[0], @y[0], @t[5]
-+ vext.8 @t[0], @t[0], @t[0], #12
-+ veor @y[1], @y[1], @t[7]
-+ veor @y[0], @y[0], @t[6]
-+ veor @y[3], @y[3], @y[1]
-+ veor @y[4], @y[4], @t[1]
-+ vext.8 @t[1], @t[1], @t[1], #12
-+
-+ veor @y[7], @y[7], @t[7]
-+ veor @y[4], @y[4], @t[2]
-+ veor @y[5], @y[5], @t[2]
-+ veor @y[2], @y[2], @t[6]
-+ veor @t[6], @t[6], @t[3] @ clobber t[6]
-+ vext.8 @t[2], @t[2], @t[2], #12
-+ veor @y[4], @y[4], @y[7]
-+ veor @y[3], @y[3], @t[6]
-+
-+ veor @y[6], @y[6], @t[6]
-+ veor @y[5], @y[5], @t[5]
-+ vext.8 @t[5], @t[5], @t[5], #12
-+ veor @y[6], @y[6], @t[4]
-+ vext.8 @t[4], @t[4], @t[4], #12
-+ veor @y[5], @y[5], @t[6]
-+ veor @y[6], @y[6], @t[7]
-+ vext.8 @t[7], @t[7], @t[7], #12
-+ veor @t[6], @t[6], @t[3] @ restore t[6]
-+ vext.8 @t[3], @t[3], @t[3], #12
-+
-+ @ multiplication by 0x09
-+ veor @y[4], @y[4], @y[1]
-+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
-+ veor @t[0], @t[0], @t[5] @ clobber t[0]
-+ vext.8 @t[6], @t[6], @t[6], #12
-+ veor @t[1], @t[1], @t[5]
-+ veor @y[3], @y[3], @t[0]
-+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
-+ veor @t[1], @t[1], @t[6]
-+ veor @t[6], @t[6], @t[7] @ clobber t[6]
-+ veor @y[4], @y[4], @t[1]
-+ veor @y[7], @y[7], @t[4]
-+ veor @y[6], @y[6], @t[3]
-+ veor @y[5], @y[5], @t[2]
-+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
-+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
-+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
-+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
-+ veor @t[3], @t[3], @t[7]
-+ veor @XMM[5], @t[5], @t[6]
-+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
-+ veor @XMM[2], @t[2], @t[6]
-+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
-+
-+ vmov @XMM[0], @t[0]
-+ vmov @XMM[1], @t[1]
-+ @ vmov @XMM[2], @t[2]
-+ vmov @XMM[3], @t[3]
-+ vmov @XMM[4], @t[4]
-+ @ vmov @XMM[5], @t[5]
-+ @ vmov @XMM[6], @t[6]
-+ @ vmov @XMM[7], @t[7]
-+___
-+}
-+
-+sub swapmove {
-+my ($a,$b,$n,$mask,$t)=@_;
-+$code.=<<___;
-+ vshr.u64 $t, $b, #$n
-+ veor $t, $t, $a
-+ vand $t, $t, $mask
-+ veor $a, $a, $t
-+ vshl.u64 $t, $t, #$n
-+ veor $b, $b, $t
-+___
-+}
-+sub swapmove2x {
-+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
-+$code.=<<___;
-+ vshr.u64 $t0, $b0, #$n
-+ vshr.u64 $t1, $b1, #$n
-+ veor $t0, $t0, $a0
-+ veor $t1, $t1, $a1
-+ vand $t0, $t0, $mask
-+ vand $t1, $t1, $mask
-+ veor $a0, $a0, $t0
-+ vshl.u64 $t0, $t0, #$n
-+ veor $a1, $a1, $t1
-+ vshl.u64 $t1, $t1, #$n
-+ veor $b0, $b0, $t0
-+ veor $b1, $b1, $t1
-+___
-+}
-+
-+sub bitslice {
-+my @x=reverse(@_[0..7]);
-+my ($t0,$t1,$t2,$t3)=@_[8..11];
-+$code.=<<___;
-+ vmov.i8 $t0,#0x55 @ compose .LBS0
-+ vmov.i8 $t1,#0x33 @ compose .LBS1
-+___
-+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
-+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-+$code.=<<___;
-+ vmov.i8 $t0,#0x0f @ compose .LBS2
-+___
-+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
-+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-+
-+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
-+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
-+}
-+
-+$code.=<<___;
-+ .text
-+ .code 32
-+ .fpu neon
-+
-+ .align 4
-+.LM0ISR: @ InvShiftRows constants
-+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
-+.LISR:
-+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
-+.LISRM0:
-+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
-+
-+ .type _bsaes_decrypt8,%function
-+_bsaes_decrypt8:
-+ adr $const,.LM0ISR
-+ vldmia $key!, {@XMM[9]} @ round 0 key
-+
-+ vldmia $const!, {@XMM[8]} @ .LM0ISR
-+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
-+ veor @XMM[11], @XMM[1], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ veor @XMM[12], @XMM[2], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+ veor @XMM[13], @XMM[3], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-+ veor @XMM[14], @XMM[4], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-+ veor @XMM[15], @XMM[5], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-+ veor @XMM[10], @XMM[6], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-+ veor @XMM[11], @XMM[7], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+___
-+ &bitslice (@XMM[0..7, 8..11]);
-+$code.=<<___;
-+ sub $rounds,$rounds,#1
-+ b .Ldec_sbox
-+
-+ .align 5
-+.Ldec_loop:
-+___
-+ &ShiftRows (@XMM[0..7, 8..12]);
-+$code.=".Ldec_sbox:\n";
-+ &InvSbox (@XMM[0..7, 8..15]);
-+$code.=<<___;
-+ subs $rounds,$rounds,#1
-+ bcc .Ldec_done
-+___
-+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
-+$code.=<<___;
-+ vldmia $const, {@XMM[12]} @ .LISR
-+ addeq $const,$const,#0x10
-+ bne .Ldec_loop
-+ vldmia $const, {@XMM[12]} @ .LISRM0
-+ b .Ldec_loop
-+.align 4
-+.Ldec_done:
-+___
-+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
-+$code.=<<___;
-+ vldmia $key, {@XMM[8]} @ last round key
-+ veor @XMM[6], @XMM[6], @XMM[8]
-+ veor @XMM[4], @XMM[4], @XMM[8]
-+ veor @XMM[2], @XMM[2], @XMM[8]
-+ veor @XMM[7], @XMM[7], @XMM[8]
-+ veor @XMM[3], @XMM[3], @XMM[8]
-+ veor @XMM[5], @XMM[5], @XMM[8]
-+ veor @XMM[0], @XMM[0], @XMM[8]
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ bx lr
-+.size _bsaes_decrypt8,.-_bsaes_decrypt8
-+
-+ .align 4
-+_bsaes_const:
-+.LM0SR: @ ShiftRows constants
-+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
-+.LSR:
-+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
-+.LSRM0:
-+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
-+
-+ .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
-+ .size _bsaes_const,.-_bsaes_const
-+
-+ .align 5
-+ .type _bsaes_encrypt8,%function
-+_bsaes_encrypt8:
-+ adr $const,.LM0SR
-+ vldmia $key!, {@XMM[9]} @ round 0 key
-+
-+ vldmia $const!, {@XMM[8]} @ .LM0SR
-+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
-+ veor @XMM[11], @XMM[1], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ veor @XMM[12], @XMM[2], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+ veor @XMM[13], @XMM[3], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-+ veor @XMM[14], @XMM[4], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-+ veor @XMM[15], @XMM[5], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-+ veor @XMM[10], @XMM[6], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-+ veor @XMM[11], @XMM[7], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+_bsaes_encrypt8_bitslice:
-+___
-+ &bitslice (@XMM[0..7, 8..11]);
-+$code.=<<___;
-+ sub $rounds,$rounds,#1
-+ b .Lenc_sbox
-+.align 4
-+.Lenc_loop:
-+___
-+ &ShiftRows (@XMM[0..7, 8..12]);
-+$code.=".Lenc_sbox:\n";
-+ &Sbox (@XMM[0..7, 8..15]);
-+$code.=<<___;
-+ subs $rounds,$rounds,#1
-+ bcc .Lenc_done
-+___
-+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
-+$code.=<<___;
-+ vldmia $const, {@XMM[12]} @ .LSR
-+ addeq $const,$const,#0x10
-+ bne .Lenc_loop
-+ vldmia $const, {@XMM[12]} @ .LSRM0
-+ b .Lenc_loop
-+.align 4
-+.Lenc_done:
-+___
-+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
-+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
-+$code.=<<___;
-+ vldmia $key, {@XMM[8]} @ last round key
-+ veor @XMM[4], @XMM[4], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[8]
-+ veor @XMM[3], @XMM[3], @XMM[8]
-+ veor @XMM[7], @XMM[7], @XMM[8]
-+ veor @XMM[2], @XMM[2], @XMM[8]
-+ veor @XMM[5], @XMM[5], @XMM[8]
-+ veor @XMM[0], @XMM[0], @XMM[8]
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ bx lr
-+.size _bsaes_encrypt8,.-_bsaes_encrypt8
-+___
-+}
-+{
-+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
-+
-+sub bitslice_key {
-+my @x=reverse(@_[0..7]);
-+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
-+
-+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
-+$code.=<<___;
-+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
-+ vmov @x[2], @x[0]
-+ vmov @x[3], @x[1]
-+___
-+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-+
-+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
-+$code.=<<___;
-+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-+ vmov @x[4], @x[0]
-+ vmov @x[6], @x[2]
-+ vmov @x[5], @x[1]
-+ vmov @x[7], @x[3]
-+___
-+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
-+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
-+}
-+
-+$code.=<<___;
-+ .align 4
-+.LM0:
-+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
-+
-+ .type _bsaes_key_convert,%function
-+_bsaes_key_convert:
-+ adr $const,.LM0
-+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
-+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
-+
-+ vmov.i8 @XMM[8], #0x01 @ bit masks
-+ vmov.i8 @XMM[9], #0x02
-+ vmov.i8 @XMM[10], #0x04
-+ vmov.i8 @XMM[11], #0x08
-+ vmov.i8 @XMM[12], #0x10
-+ vmov.i8 @XMM[13], #0x20
-+ vldmia $const, {@XMM[14]} @ .LM0
-+
-+#ifdef __ARMEL__
-+ vrev32.8 @XMM[7], @XMM[7]
-+ vrev32.8 @XMM[15], @XMM[15]
-+#endif
-+ sub $rounds,$rounds,#1
-+ vstmia $out!, {@XMM[7]} @ save round 0 key
-+ b .Lkey_loop
-+
-+ .align 5
-+.Lkey_loop:
-+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
-+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
-+ vmov.i8 @XMM[6], #0x40
-+ vmov.i8 @XMM[15], #0x80
-+
-+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
-+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
-+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
-+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
-+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
-+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
-+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
-+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
-+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
-+ vmvn @XMM[0], @XMM[0] @ "pnot"
-+ vmvn @XMM[1], @XMM[1]
-+ vmvn @XMM[5], @XMM[5]
-+ vmvn @XMM[6], @XMM[6]
-+#ifdef __ARMEL__
-+ vrev32.8 @XMM[15], @XMM[15]
-+#endif
-+ subs $rounds,$rounds,#1
-+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
-+ bne .Lkey_loop
-+
-+ vmov.i8 @XMM[7],#0x63 @ compose .L63
-+ @ don't save last round key
-+ bx lr
-+.size _bsaes_key_convert,.-_bsaes_key_convert
-+___
-+}
-+
-+if (0) { # following four functions are unsupported interface
-+ # used for benchmarking...
-+$code.=<<___;
-+.globl bsaes_enc_key_convert
-+.type bsaes_enc_key_convert,%function
-+.align 4
-+bsaes_enc_key_convert:
-+ stmdb sp!,{r4-r6,lr}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+
-+ ldr r5,[$inp,#240] @ pass rounds
-+ mov r4,$inp @ pass key
-+ mov r12,$out @ pass key schedule
-+ bl _bsaes_key_convert
-+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
-+ vstmia r12, {@XMM[7]} @ save last round key
-+
-+ vldmia sp!,{d8-d15}
-+ ldmia sp!,{r4-r6,pc}
-+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
-+
-+.globl bsaes_encrypt_128
-+.type bsaes_encrypt_128,%function
-+.align 4
-+bsaes_encrypt_128:
-+ stmdb sp!,{r4-r6,lr}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+.Lenc128_loop:
-+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
-+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
-+ mov r4,$key @ pass the key
-+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
-+ mov r5,#10 @ pass rounds
-+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
-+
-+ bl _bsaes_encrypt8
-+
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[3]}, [$out]!
-+ vst1.8 {@XMM[7]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ subs $len,$len,#0x80
-+ vst1.8 {@XMM[5]}, [$out]!
-+ bhi .Lenc128_loop
-+
-+ vldmia sp!,{d8-d15}
-+ ldmia sp!,{r4-r6,pc}
-+.size bsaes_encrypt_128,.-bsaes_encrypt_128
-+
-+.globl bsaes_dec_key_convert
-+.type bsaes_dec_key_convert,%function
-+.align 4
-+bsaes_dec_key_convert:
-+ stmdb sp!,{r4-r6,lr}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+
-+ ldr r5,[$inp,#240] @ pass rounds
-+ mov r4,$inp @ pass key
-+ mov r12,$out @ pass key schedule
-+ bl _bsaes_key_convert
-+ vldmia $out, {@XMM[6]}
-+ vstmia r12, {@XMM[15]} @ save last round key
-+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
-+ vstmia $out, {@XMM[7]}
-+
-+ vldmia sp!,{d8-d15}
-+ ldmia sp!,{r4-r6,pc}
-+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
-+
-+.globl bsaes_decrypt_128
-+.type bsaes_decrypt_128,%function
-+.align 4
-+bsaes_decrypt_128:
-+ stmdb sp!,{r4-r6,lr}
-+ vstmdb sp!,{d8-d15} @ ABI specification says so
-+.Ldec128_loop:
-+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
-+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
-+ mov r4,$key @ pass the key
-+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
-+ mov r5,#10 @ pass rounds
-+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
-+
-+ bl _bsaes_decrypt8
-+
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vst1.8 {@XMM[7]}, [$out]!
-+ vst1.8 {@XMM[3]}, [$out]!
-+ subs $len,$len,#0x80
-+ vst1.8 {@XMM[5]}, [$out]!
-+ bhi .Ldec128_loop
-+
-+ vldmia sp!,{d8-d15}
-+ ldmia sp!,{r4-r6,pc}
-+.size bsaes_decrypt_128,.-bsaes_decrypt_128
-+___
-+}
-+
-+{
-+my ($inp,$out,$len,$key,$keysched,$rounds,$iv)=("r0","r1","r2","r3","sp","r7","r8");
-+
-+$code.=<<___;
-+ .align 5
-+ .extern AES_cbc_encrypt
-+ .extern AES_decrypt
-+ .global bsaes_cbc_encrypt
-+ .type bsaes_cbc_encrypt,%function
-+bsaes_cbc_encrypt:
-+ cmp $len, #128
-+ blo AES_cbc_encrypt
-+
-+ @ it is up to the caller to make sure we are called with enc == 0
-+
-+ stmdb sp!, {r4-r10, lr}
-+ vstmdb sp!, {d8-d15} @ ABI specification says so
-+ ldr $iv, [sp, #0x60] @ IV is 1st arg on the stack
-+ lsr $len, #4 @ len in 16 byte blocks
-+ sub sp, #0x10 @ scratch space to carry over the IV
-+ mov r10, sp @ save sp
-+
-+ @ allocate the key schedule on the stack
-+ ldr $rounds, [r3, #240] @ get # of rounds
-+ sub r14, sp, $rounds, lsl #7 @ 128 bytes per inner round key
-+ add sp, r14, #`128-32` @ size of bit-sliced key schedule
-+
-+ @ populate the key schedule
-+ mov r4, $key @ pass key
-+ mov r5, $rounds @ pass # of rounds
-+ mov r12, $keysched @ pass key schedule
-+ bl _bsaes_key_convert
-+ vldmia $keysched, {@XMM[6]}
-+ vstmia r12, {@XMM[15]} @ save last round key
-+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
-+ vstmia $keysched, {@XMM[7]}
-+
-+ vld1.8 {@XMM[15]}, [$iv] @ load IV
-+ mov $iv, r10
-+
-+.Lcbc_dec_loop:
-+ subs $len, #0x8
-+ bmi .Lcbc_dec_loop_finish
-+
-+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
-+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
-+ mov r4, $keysched @ pass the key
-+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
-+ mov r5, $rounds
-+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
-+ sub $inp, #0x60
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+
-+ bl _bsaes_decrypt8
-+
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ veor @XMM[2], @XMM[2], @XMM[11]
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
-+ veor @XMM[7], @XMM[7], @XMM[12]
-+ veor @XMM[3], @XMM[3], @XMM[13]
-+ veor @XMM[5], @XMM[5], @XMM[14]
-+ vst1.8 {@XMM[7]}, [$out]!
-+ vst1.8 {@XMM[3]}, [$out]!
-+ vst1.8 {@XMM[5]}, [$out]!
-+
-+ b .Lcbc_dec_loop
-+
-+.Lcbc_dec_loop_finish:
-+ adds $len, #8
-+ beq .Lcbc_dec_done
-+
-+ mov r5, $rounds
-+ vld1.8 {@XMM[0]}, [$inp]! @ load input
-+ mov r4, $keysched @ pass the key
-+ cmp $len, #2
-+ blo .Lcbc_dec_one
-+ vld1.8 {@XMM[1]}, [$inp]!
-+ beq .Lcbc_dec_two
-+ vld1.8 {@XMM[2]}, [$inp]!
-+ cmp $len, #4
-+ blo .Lcbc_dec_three
-+ vld1.8 {@XMM[3]}, [$inp]!
-+ beq .Lcbc_dec_four
-+ vld1.8 {@XMM[4]}, [$inp]!
-+ cmp $len, #6
-+ blo .Lcbc_dec_five
-+ vld1.8 {@XMM[5]}, [$inp]!
-+ beq .Lcbc_dec_six
-+ vld1.8 {@XMM[6]}, [$inp]!
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ sub $inp, #0x70
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ veor @XMM[2], @XMM[2], @XMM[11]
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[7], @XMM[7], @XMM[12]
-+ veor @XMM[3], @XMM[3], @XMM[13]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vst1.8 {@XMM[7]}, [$out]!
-+ vst1.8 {@XMM[3]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_six:
-+ sub $inp, #0x60
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vld1.8 {@XMM[12]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ veor @XMM[2], @XMM[2], @XMM[11]
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[7], @XMM[7], @XMM[12]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vst1.8 {@XMM[7]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_five:
-+ sub $inp, #0x50
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ veor @XMM[2], @XMM[2], @XMM[11]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ vst1.8 {@XMM[2]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_four:
-+ sub $inp, #0x40
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[10]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[4], @XMM[4], @XMM[10]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ vst1.8 {@XMM[4]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_three:
-+ sub $inp, #0x30
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
-+ vld1.8 {@XMM[15]}, [$inp]!
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ veor @XMM[6], @XMM[6], @XMM[9]
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ vst1.8 {@XMM[6]}, [$out]!
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_two:
-+ sub $inp, #0x20
-+ vst1.8 {@XMM[15]}, [$iv] @ put aside IV
-+ bl _bsaes_decrypt8
-+ vld1.8 {@XMM[14]}, [$iv] @ load IV
-+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
-+ vld1.8 {@XMM[8]}, [$inp]! @ reload input
-+ veor @XMM[1], @XMM[1], @XMM[8]
-+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
-+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
-+ b .Lcbc_dec_done
-+.align 4
-+.Lcbc_dec_one:
-+ sub $inp, #0x10
-+ mov r7, $out @ save original out pointer
-+ mov $out, $iv @ use the iv scratch space as out buffer
-+ mov r2, $key
-+ bl AES_decrypt
-+ vld1.8 {@XMM[0]}, [$iv] @ reload output
-+ veor @XMM[15], @XMM[15], @XMM[0] @ ^= IV
-+ vst1.8 {@XMM[15]}, [r7]
-+ vmov @XMM[15], @XMM[0] @ IV
-+
-+.Lcbc_dec_done:
-+ vmov.i32 q0, #0
-+.Lcbc_dec_bzero: @ wipe key schedule [if any]
-+ vst1.8 {q0}, [$keysched]!
-+ teq $keysched, r10
-+ bne .Lcbc_dec_bzero
-+
-+ add sp, #0x10
-+ ldr $iv, [sp, #0x60]
-+ vst1.8 {@XMM[15]}, [$iv] @ return IV
-+ vldmia sp!, {d8-d15}
-+ ldmia sp!, {r4-r10, pc}
-+
-+ .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-+___
-+}
-+{
-+my ($inp,$out,$len,$key,$keysched,$const,$rounds,$ctr)=("r0","r1","r2","r3","sp","r6","r7","r8");
-+
-+$code.=<<___;
-+ .align 5
-+ @ byte-swap constants
-+.LSWP:
-+ .quad 0x0405060703020100
-+.LSWPUPM0SR:
-+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
-+.LADD:
-+ .quad 0x0807060504030201
-+
-+ .extern AES_encrypt
-+ .global bsaes_ctr32_encrypt_blocks
-+ .type bsaes_ctr32_encrypt_blocks,%function
-+bsaes_ctr32_encrypt_blocks:
-+ cmp $len, #8 @ use plain AES for
-+ blo .Lctr_enc_short @ small sizes
-+
-+ stmdb sp!, {r4-r8, r10-r11, lr}
-+ vstmdb sp!, {d8-d15} @ ABI specification says so
-+ ldr $ctr, [sp, #0x60] @ ctr is 1st arg on the stack
-+ sub sp, #0x10 @ scratch space to carry over the ctr
-+ mov r10, sp @ save sp
-+
-+ @ allocate the key schedule on the stack
-+ ldr $rounds, [r3, #240] @ get # of rounds
-+ sub r14, sp, $rounds, lsl #7 @ 128 bytes per inner round key
-+ add sp, r14, #`128-32` @ size of bit-sliced key schedule
-+
-+ @ populate the key schedule
-+ mov r4, $key @ pass key
-+ mov r5, $rounds @ pass # of rounds
-+ mov r12, $keysched @ pass key schedule
-+ bl _bsaes_key_convert
-+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
-+ vstmia r12, {@XMM[7]} @ save last round key
-+
-+ vldm $ctr, {@XMM[0]} @ load counter
-+ mov $ctr, r10
-+
-+ vldm $keysched, {@XMM[4]} @ load round0 key
-+
-+ vldr `&Dlo(@XMM[8])`, .LSWP @ byte swap upper part
-+ vtbl.8 `&Dhi(@XMM[0])`, {`&Dhi(@XMM[0])`}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[4])`, {`&Dhi(@XMM[4])`}, `&Dlo(@XMM[8])`
-+
-+ vstm $keysched, {@XMM[4]} @ save adjusted round0 key
-+
-+ b .Lctr_enc_loop
-+
-+ .align 5
-+.Lctr_enc_loop:
-+
-+ @ set up the addition constants
-+ vldr `&Dlo(@XMM[11])`, .LADD
-+ vmov.i8 `&Dhi(@XMM[11])`, #0
-+ vmov.i8 @XMM[12], #0
-+ vzip.8 `&Dlo(@XMM[11])`, `&Dhi(@XMM[11])`
-+ vzip.16 @XMM[11], @XMM[12]
-+
-+ @ get 8 counter values in regs and do the add
-+ vdup.32 @XMM[4], `&Dhi(@XMM[0])`[1]
-+ vdup.32 @XMM[9], `&Dhi(@XMM[0])`[1]
-+ vadd.u32 @XMM[4], @XMM[11]
-+ vadd.u32 @XMM[9], @XMM[12]
-+ vdup.32 @XMM[2], `&Dhi(@XMM[0])`[0]
-+ vdup.32 @XMM[6], `&Dhi(@XMM[0])`[0]
-+ vzip.32 @XMM[2], @XMM[4]
-+ vzip.32 @XMM[6], @XMM[9]
-+
-+ vmov `&Dhi(@XMM[1])`, `&Dlo(@XMM[0])`
-+ vmov `&Dlo(@XMM[1])`, `&Dlo(@XMM[0])`
-+ vmov @XMM[3], @XMM[1]
-+ vmov @XMM[5], @XMM[1]
-+ vmov @XMM[7], @XMM[1]
-+
-+ vswp `&Dhi(@XMM[1])`, `&Dlo(@XMM[2])`
-+ vswp `&Dhi(@XMM[3])`, `&Dlo(@XMM[4])`
-+ vswp `&Dhi(@XMM[5])`, `&Dlo(@XMM[6])`
-+ vswp `&Dhi(@XMM[7])`, `&Dlo(@XMM[9])`
-+
-+ vstm $ctr, {@XMM[9]} @ save counter
-+
-+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
-+ @ to flip byte order in 32-bit counter
-+
-+ adr r11, .LSWPUPM0SR
-+ vld1.8 {@XMM[8]}, [r11]! @ .LSWPUPM0SR
-+ adrl $const,.LSR
-+ vld1.8 {@XMM[9]}, [$keysched] @ load round0 key
-+ mov r5, $rounds @ pass rounds
-+ add r4, $keysched, #0x10 @ pass next round key
-+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
-+ veor @XMM[11], @XMM[1], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ veor @XMM[12], @XMM[2], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+ veor @XMM[13], @XMM[3], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-+ veor @XMM[14], @XMM[4], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-+ veor @XMM[15], @XMM[5], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-+ veor @XMM[10], @XMM[6], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-+ veor @XMM[11], @XMM[7], @XMM[9]
-+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-+
-+ bl _bsaes_encrypt8_bitslice
-+
-+ subs $len, #8
-+ blo .Lctr_enc_loop_done
-+
-+ vldmia $inp!, {@XMM[8]-@XMM[15]} @ load input
-+
-+ veor @XMM[8], @XMM[0]
-+ veor @XMM[1], @XMM[9]
-+ vst1.8 {@XMM[8]}, [$out]!
-+ veor @XMM[4], @XMM[10]
-+ vst1.8 {@XMM[1]}, [$out]!
-+ veor @XMM[6], @XMM[11]
-+ vst1.8 {@XMM[4]}, [$out]!
-+ veor @XMM[3], @XMM[12]
-+ vst1.8 {@XMM[6]}, [$out]!
-+ veor @XMM[7], @XMM[13]
-+ vst1.8 {@XMM[3]}, [$out]!
-+ veor @XMM[2], @XMM[14]
-+ vst1.8 {@XMM[7]}, [$out]!
-+ veor @XMM[5], @XMM[15]
-+ vst1.8 {@XMM[2]}, [$out]!
-+ vst1.8 {@XMM[5]}, [$out]!
-+
-+ vldm $ctr, {@XMM[0]} @ load counter
-+
-+ bne .Lctr_enc_loop
-+ b .Lctr_enc_done
-+
-+.Lctr_enc_loop_done:
-+ add $len, #8
-+ vld1.8 {@XMM[8]}, [$inp]! @ load input
-+ veor @XMM[0], @XMM[8]
-+ vst1.8 {@XMM[0]}, [$out]! @ write output
-+ cmp $len, #2
-+ blo .Lctr_enc_done
-+ vld1.8 {@XMM[9]}, [$inp]!
-+ veor @XMM[1], @XMM[9]
-+ vst1.8 {@XMM[1]}, [$out]!
-+ beq .Lctr_enc_done
-+ vld1.8 {@XMM[10]}, [$inp]!
-+ veor @XMM[4], @XMM[10]
-+ vst1.8 {@XMM[4]}, [$out]!
-+ cmp $len, #4
-+ blo .Lctr_enc_done
-+ vld1.8 {@XMM[11]}, [$inp]!
-+ veor @XMM[6], @XMM[11]
-+ vst1.8 {@XMM[6]}, [$out]!
-+ beq .Lctr_enc_done
-+ vld1.8 {@XMM[12]}, [$inp]!
-+ veor @XMM[3], @XMM[12]
-+ vst1.8 {@XMM[3]}, [$out]!
-+ cmp $len, #6
-+ blo .Lctr_enc_done
-+ vld1.8 {@XMM[13]}, [$inp]!
-+ veor @XMM[7], @XMM[13]
-+ vst1.8 {@XMM[7]}, [$out]!
-+ beq .Lctr_enc_done
-+ vld1.8 {@XMM[14]}, [$inp]
-+ veor @XMM[2], @XMM[14]
-+ vst1.8 {@XMM[2]}, [$out]!
-+
-+.Lctr_enc_done:
-+ vmov.i32 q0, #0
-+.Lctr_enc_bzero: @ wipe key schedule [if any]
-+ vst1.8 {q0}, [$keysched]!
-+ teq $keysched, r10
-+ bne .Lctr_enc_bzero
-+
-+ add sp, r10, #0x10
-+ vldmia sp!, {d8-d15}
-+ ldmia sp!, {r4-r8, r10-r11, pc}
-+
-+.Lctr_enc_short:
-+ ldr ip, [sp] @ ctr pointer is passed on stack
-+ stmdb sp!, {r0-r6, lr} @ stack regs as usual
-+
-+ ldm sp, {r4-r6} @ copy r0-2 to r4-6
-+ vldmia ip, {d0-d1} @ load the counter from [arg5]
-+ vstmdb sp!, {d0-d1} @ copy of ctr to top of stack
-+ sub sp, #0x10
-+
-+.Lctr_enc_short_loop:
-+ add r0, sp, #0x10
-+ mov r1, sp @ put output on the stack
-+ ldr r2, [sp, #0x2c] @ stacked r3
-+
-+ bl AES_encrypt
-+
-+ vldmia r4!, {@XMM[1]} @ load input
-+ ldr r0, [sp, #0x1c] @ load LSW of counter (BE)
-+#ifdef __ARMEL__
-+ rev r0, r0 @ need to increment the counter
-+ add r0, #1 @ in BE mode
-+ rev r0, r0
-+#else
-+ add r0, #1
-+#endif
-+ vldm sp, {@XMM[0]}
-+ str r0, [sp, #0x1c]
-+ veor @XMM[0], @XMM[1]
-+ subs r6, #1
-+ vstmia r5!, {@XMM[0]}
-+ bne .Lctr_enc_short_loop
-+
-+ add sp, #0x30
-+ ldmia sp!, {r4-r6, pc}
-+
-+ .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-+___
-+}
-+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-+
-+print $code;
-+
-+close STDOUT;
---- a/crypto/evp/e_aes.c
-+++ b/crypto/evp/e_aes.c
-@@ -482,6 +482,12 @@ static const EVP_CIPHER aes_##keylen##_#
- NULL,NULL,aes_##mode##_ctrl,NULL }; \
- const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
- { return &aes_##keylen##_##mode; }
-+
-+#endif
-+
-+#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm))
-+#include "arm_arch.h"
-+#define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
- #endif
-
- #define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
-@@ -1064,11 +1070,13 @@ static int aes_xts_init_key(EVP_CIPHER_C
- xctx->stream = NULL;
- #endif
- /* key_len is two AES keys */
-+#if !(defined(__arm__) || defined(__arm))
- #ifdef BSAES_CAPABLE
- if (BSAES_CAPABLE)
- xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
- else
- #endif
-+#endif
- #ifdef VPAES_CAPABLE
- if (VPAES_CAPABLE)
- {
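[Editor's note, not part of the commit: the .Lctr_enc_short fallback removed above encrypts one counter block at a time with AES_encrypt, XORs it into the input, and bumps the low 32 bits of the counter, which live in the block in big-endian order. The C below is only an illustrative sketch of that semantics under those assumptions; aes_encrypt_block() is a hypothetical single-block primitive standing in for AES_encrypt, not a real OpenSSL symbol.]

    #include <stddef.h>
    #include <stdint.h>

    /* hypothetical single-block AES primitive, stands in for AES_encrypt */
    void aes_encrypt_block(const uint8_t in[16], uint8_t out[16], const void *key);

    static void ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                     size_t blocks, const void *key,
                                     uint8_t ivec[16])
    {
        uint8_t keystream[16];

        while (blocks--) {
            aes_encrypt_block(ivec, keystream, key);   /* E_k(counter block) */
            for (int i = 0; i < 16; i++)
                out[i] = in[i] ^ keystream[i];

            /* increment the low 32 bits of the counter; they are stored
             * big endian in the block regardless of host endianness */
            uint32_t ctr = ((uint32_t)ivec[12] << 24) | ((uint32_t)ivec[13] << 16) |
                           ((uint32_t)ivec[14] << 8)  |  (uint32_t)ivec[15];
            ctr++;
            ivec[12] = (uint8_t)(ctr >> 24);
            ivec[13] = (uint8_t)(ctr >> 16);
            ivec[14] = (uint8_t)(ctr >> 8);
            ivec[15] = (uint8_t)ctr;

            in  += 16;
            out += 16;
        }
    }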
diff --git a/debian/patches/old/0002-bsaes-armv7.pl-Big-endian-fixes.patch b/debian/patches/old/0002-bsaes-armv7.pl-Big-endian-fixes.patch
deleted file mode 100644
index 23fb94a..0000000
--- a/debian/patches/old/0002-bsaes-armv7.pl-Big-endian-fixes.patch
+++ /dev/null
@@ -1,216 +0,0 @@
-From 719e0b800e3737f3a19251a097ff911744ed7a9e Mon Sep 17 00:00:00 2001
-From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
-Date: Mon, 15 Apr 2013 13:54:13 +0200
-Subject: [PATCH 2/3] bsaes-armv7.pl: Big endian fixes
-
-Updated the code to be (more) endian neutral, however, as it is
-still untested on big endian, it is only enabled for little endian
-at the moment.
----
- crypto/aes/asm/bsaes-armv7.pl | 121 ++++++++++++++----------------------------
- crypto/evp/e_aes.c | 3 -
- 2 files changed, 45 insertions(+), 79 deletions(-)
-
---- a/crypto/aes/asm/bsaes-armv7.pl
-+++ b/crypto/aes/asm/bsaes-armv7.pl
-@@ -1196,8 +1196,9 @@ bsaes_cbc_encrypt:
-
- .Lcbc_dec_done:
- vmov.i32 q0, #0
-+ vmov.i32 q1, #0
- .Lcbc_dec_bzero: @ wipe key schedule [if any]
-- vst1.8 {q0}, [$keysched]!
-+ vstm $keysched!, {q0-q1}
- teq $keysched, r10
- bne .Lcbc_dec_bzero
-
-@@ -1215,13 +1216,9 @@ my ($inp,$out,$len,$key,$keysched,$const
-
- $code.=<<___;
- .align 5
-- @ byte-swap constants
--.LSWP:
-- .quad 0x0405060703020100
--.LSWPUPM0SR:
-- .quad 0x0a0d02060c03070b, 0x0004080f05090e01
-+
- .LADD:
-- .quad 0x0807060504030201
-+ .long 1,2,3,4,5,6,7,0
-
- .extern AES_encrypt
- .global bsaes_ctr32_encrypt_blocks
-@@ -1233,7 +1230,7 @@ bsaes_ctr32_encrypt_blocks:
- stmdb sp!, {r4-r8, r10-r11, lr}
- vstmdb sp!, {d8-d15} @ ABI specification says so
- ldr $ctr, [sp, #0x60] @ ctr is 1st arg on the stack
-- sub sp, #0x10 @ scratch space to carry over the ctr
-+ sub sp, #0x20 @ scratch space to carry over the ctr
- mov r10, sp @ save sp
-
- @ allocate the key schedule on the stack
-@@ -1249,92 +1246,61 @@ bsaes_ctr32_encrypt_blocks:
- veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
- vstmia r12, {@XMM[7]} @ save last round key
-
-- vldm $ctr, {@XMM[0]} @ load counter
-- mov $ctr, r10
--
-- vldm $keysched, {@XMM[4]} @ load round0 key
--
-- vldr `&Dlo(@XMM[8])`, .LSWP @ byte swap upper part
-- vtbl.8 `&Dhi(@XMM[0])`, {`&Dhi(@XMM[0])`}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[4])`, {`&Dhi(@XMM[4])`}, `&Dlo(@XMM[8])`
--
-- vstm $keysched, {@XMM[4]} @ save adjusted round0 key
--
-+ @ copy the invariant bits of the ctr
-+ ldm $ctr, {r4-r5, r11}
-+ mov r12, r11
-+ stm r10!, {r4-r5}
-+ stm r10!, {r4-r5}
-+ stm r10!, {r11-r12}
-+ stm r10!, {r11-r12}
-+ sub r10, #0x20
-+
-+ ldr r11, [$ctr, #0xc] @ get LSW of BE ctr
-+#ifdef __ARMEL__
-+ rev r11, r11
-+#endif
- b .Lctr_enc_loop
-
- .align 5
- .Lctr_enc_loop:
-
-- @ set up the addition constants
-- vldr `&Dlo(@XMM[11])`, .LADD
-- vmov.i8 `&Dhi(@XMM[11])`, #0
-- vmov.i8 @XMM[12], #0
-- vzip.8 `&Dlo(@XMM[11])`, `&Dhi(@XMM[11])`
-- vzip.16 @XMM[11], @XMM[12]
--
- @ get 8 counter values in regs and do the add
-- vdup.32 @XMM[4], `&Dhi(@XMM[0])`[1]
-- vdup.32 @XMM[9], `&Dhi(@XMM[0])`[1]
-+ adr r4, .LADD
-+ vdup.32 @XMM[4], r11
-+ vldm r4, {@XMM[11]-@XMM[12]}
-+ vmov @XMM[0], @XMM[4]
- vadd.u32 @XMM[4], @XMM[11]
-- vadd.u32 @XMM[9], @XMM[12]
-- vdup.32 @XMM[2], `&Dhi(@XMM[0])`[0]
-- vdup.32 @XMM[6], `&Dhi(@XMM[0])`[0]
-+ vadd.u32 @XMM[0], @XMM[12]
-+#ifdef __ARMEL__
-+ vrev32.8 @XMM[4], @XMM[4]
-+ vrev32.8 @XMM[0], @XMM[0]
-+#endif
-+ vld1.8 {@XMM[1]-@XMM[2]}, [r10]
-+ vld1.8 {@XMM[5]-@XMM[6]}, [r10]
- vzip.32 @XMM[2], @XMM[4]
-- vzip.32 @XMM[6], @XMM[9]
--
-- vmov `&Dhi(@XMM[1])`, `&Dlo(@XMM[0])`
-- vmov `&Dlo(@XMM[1])`, `&Dlo(@XMM[0])`
- vmov @XMM[3], @XMM[1]
-- vmov @XMM[5], @XMM[1]
-- vmov @XMM[7], @XMM[1]
-+ vzip.32 @XMM[6], @XMM[0]
-+ vmov @XMM[7], @XMM[5]
-
- vswp `&Dhi(@XMM[1])`, `&Dlo(@XMM[2])`
- vswp `&Dhi(@XMM[3])`, `&Dlo(@XMM[4])`
- vswp `&Dhi(@XMM[5])`, `&Dlo(@XMM[6])`
-- vswp `&Dhi(@XMM[7])`, `&Dlo(@XMM[9])`
--
-- vstm $ctr, {@XMM[9]} @ save counter
--
-- @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
-- @ to flip byte order in 32-bit counter
-+ vswp `&Dhi(@XMM[7])`, `&Dlo(@XMM[0])`
-
-- adr r11, .LSWPUPM0SR
-- vld1.8 {@XMM[8]}, [r11]! @ .LSWPUPM0SR
-- adrl $const,.LSR
-- vld1.8 {@XMM[9]}, [$keysched] @ load round0 key
-+ mov r4, $keysched @ pass round key
- mov r5, $rounds @ pass rounds
-- add r4, $keysched, #0x10 @ pass next round key
-- veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
-- veor @XMM[11], @XMM[1], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-- veor @XMM[12], @XMM[2], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-- veor @XMM[13], @XMM[3], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-- veor @XMM[14], @XMM[4], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-- veor @XMM[15], @XMM[5], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-- veor @XMM[10], @XMM[6], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-- veor @XMM[11], @XMM[7], @XMM[9]
-- vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-- vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-- vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-
-- bl _bsaes_encrypt8_bitslice
-+ bl _bsaes_encrypt8
-
- subs $len, #8
-+ add r11, #8
-+
- blo .Lctr_enc_loop_done
-
-- vldmia $inp!, {@XMM[8]-@XMM[15]} @ load input
-+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
-+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
-+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
-+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
-
- veor @XMM[8], @XMM[0]
- veor @XMM[1], @XMM[9]
-@@ -1353,8 +1319,6 @@ bsaes_ctr32_encrypt_blocks:
- vst1.8 {@XMM[2]}, [$out]!
- vst1.8 {@XMM[5]}, [$out]!
-
-- vldm $ctr, {@XMM[0]} @ load counter
--
- bne .Lctr_enc_loop
- b .Lctr_enc_done
-
-@@ -1393,12 +1357,13 @@ bsaes_ctr32_encrypt_blocks:
-
- .Lctr_enc_done:
- vmov.i32 q0, #0
-+ vmov.i32 q1, #0
- .Lctr_enc_bzero: @ wipe key schedule [if any]
-- vst1.8 {q0}, [$keysched]!
-+ vstm $keysched!, {q0-q1}
- teq $keysched, r10
- bne .Lctr_enc_bzero
-
-- add sp, r10, #0x10
-+ add sp, r10, #0x20
- vldmia sp!, {d8-d15}
- ldmia sp!, {r4-r8, r10-r11, pc}
-
---- a/crypto/evp/e_aes.c
-+++ b/crypto/evp/e_aes.c
-@@ -485,7 +485,8 @@ const EVP_CIPHER *EVP_aes_##keylen##_##m
-
- #endif
-
--#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm))
-+#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm)) \
-+ && defined(__ARMEL__)
- #include "arm_arch.h"
- #define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
- #endif
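[Editor's note, not part of the commit: the __ARMEL__ paths in the removed patch above (rev r11 / vrev32.8) exist because the counter block keeps its low word big endian, so a little-endian core must byte-swap before doing arithmetic and swap back afterwards. The sketch below only illustrates that pattern; swap32() and ctr_word_for_block() are names invented for this example.]

    #include <stdint.h>

    static uint32_t swap32(uint32_t v)
    {
        return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
               ((v << 8) & 0x00ff0000u) | (v << 24);
    }

    /* Counter word for block number "offset" past the big-endian word ctr_be. */
    static uint32_t ctr_word_for_block(uint32_t ctr_be, uint32_t offset)
    {
    #ifdef __ARMEL__                    /* little-endian ARM, as in the patch */
        return swap32(swap32(ctr_be) + offset);
    #else                               /* big-endian host: already host order */
        return ctr_be + offset;
    #endif
    }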
diff --git a/debian/patches/old/0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch b/debian/patches/old/0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch
deleted file mode 100644
index 4d2235d..0000000
--- a/debian/patches/old/0003-bsaes-armv7.pl-avoid-bit-sliced-AES-CBC-for-block-si.patch
+++ /dev/null
@@ -1,24 +0,0 @@
-From a2f9535dd2b0d2e230f978aa3eaf103f5224b6d5 Mon Sep 17 00:00:00 2001
-From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
-Date: Mon, 15 Apr 2013 14:32:59 +0200
-Subject: [PATCH 3/3] bsaes-armv7.pl: avoid bit-sliced AES/CBC for block sizes
- < 1k
-
-Avoid using bit sliced AES for CBC decryption when the block size
-is smaller than 1k. The reason is that the overhead of creating the
-key schedule is larger than the obtained speedup on Cortex-A9.
----
- crypto/aes/asm/bsaes-armv7.pl | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/crypto/aes/asm/bsaes-armv7.pl
-+++ b/crypto/aes/asm/bsaes-armv7.pl
-@@ -985,7 +985,7 @@ $code.=<<___;
- .global bsaes_cbc_encrypt
- .type bsaes_cbc_encrypt,%function
- bsaes_cbc_encrypt:
-- cmp $len, #128
-+ cmp $len, #1024
- blo AES_cbc_encrypt
-
- @ it is up to the caller to make sure we are called with enc == 0
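[Editor's note, not part of the commit: the removed patch above raises the fallback threshold from 128 to 1024 bytes because, per its commit message, building the bit-sliced key schedule costs more than the NEON speedup recovers on short buffers (measured on Cortex-A9). The C below is only a sketch of the resulting dispatch; aes_cbc_encrypt_scalar() and bsaes_cbc_encrypt_neon() are hypothetical names for the generic and bit-sliced code paths.]

    #include <stddef.h>
    #include <stdint.h>

    /* hypothetical stand-ins for the generic and bit-sliced implementations */
    void aes_cbc_encrypt_scalar(const uint8_t *in, uint8_t *out, size_t len,
                                const void *key, uint8_t ivec[16], int enc);
    void bsaes_cbc_encrypt_neon(const uint8_t *in, uint8_t *out, size_t len,
                                const void *key, uint8_t ivec[16], int enc);

    #define BSAES_CBC_THRESHOLD 1024   /* below this, key-schedule setup costs
                                          more than bit-slicing saves */

    void cbc_decrypt_dispatch(const uint8_t *in, uint8_t *out, size_t len,
                              const void *key, uint8_t ivec[16])
    {
        if (len < BSAES_CBC_THRESHOLD) {
            aes_cbc_encrypt_scalar(in, out, len, key, ivec, /*enc=*/0);
            return;
        }
        bsaes_cbc_encrypt_neon(in, out, len, key, ivec, /*enc=*/0);
    }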
diff --git a/debian/patches/pic.patch b/debian/patches/pic.patch
index ed95be4..bf63614 100644
--- a/debian/patches/pic.patch
+++ b/debian/patches/pic.patch
@@ -1,9 +1,9 @@
---
crypto/des/asm/desboth.pl | 17 ++++++++++++++---
crypto/perlasm/cbc.pl | 24 ++++++++++++++++++++----
- crypto/perlasm/x86gas.pl | 11 +++++++++++
+ crypto/perlasm/x86gas.pl | 16 ++++++++++++++++
crypto/x86cpuid.pl | 10 +++++-----
- 4 files changed, 50 insertions(+), 12 deletions(-)
+ 4 files changed, 55 insertions(+), 12 deletions(-)
--- a/crypto/des/asm/desboth.pl
+++ b/crypto/des/asm/desboth.pl
@@ -108,11 +108,13 @@
}
push(@out,$initseg) if ($initseg);
}
-@@ -218,7 +219,17 @@ ___
+@@ -218,8 +219,23 @@ ___
elsif ($::elf)
{ $initseg.=<<___;
.section .init
-+#ifdef OPENSSL_PIC
++___
++ if ($::pic)
++ { $initseg.=<<___;
+ pushl %ebx
+ call .pic_point0
+.pic_point0:
@@ -120,12 +122,16 @@
+ addl \$_GLOBAL_OFFSET_TABLE_+[.-.pic_point0],%ebx
+ call $f\@PLT
+ popl %ebx
-+#else
++___
++ }
++ else
++ { $initseg.=<<___;
call $f
-+#endif
___
++ }
}
elsif ($::coff)
+ { $initseg.=<<___; # applies to both Cygwin and Mingw
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -8,6 +8,8 @@ require "x86asm.pl";
diff --git a/debian/patches/series b/debian/patches/series
index a2845d0..040e3df 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -35,7 +35,9 @@ default_bits.patch
perlpath-quilt.patch
tls12_workarounds.patch
ubuntu_deb676533_arm_asm.patch
+arm64-support
CVE-2013-0166.patch
+# Disabled for now, as causes regression on AES-NI
CVE-2013-0169.patch
fix_key_decoding_deadlock.patch
diff --git a/debian/patches/ubuntu_deb676533_arm_asm.patch b/debian/patches/ubuntu_deb676533_arm_asm.patch
index 9325394..a484bec 100644
--- a/debian/patches/ubuntu_deb676533_arm_asm.patch
+++ b/debian/patches/ubuntu_deb676533_arm_asm.patch
@@ -10,7 +10,7 @@ Bug-Ubuntu: https://bugs.launchpad.net/ubuntu/+source/openssl/+bug/1083498
--- a/Configure
+++ b/Configure
@@ -346,9 +346,8 @@ my %table=(
- "debian-alpha","gcc:-DTERMIO $debian_cflag::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+ "debian-alpha","gcc:-DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debian-alpha-ev4","gcc:-DTERMIO ${debian_cflags} -mcpu=ev4::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debian-alpha-ev5","gcc:-DTERMIO ${debian_cflags} -mcpu=ev5::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"debian-armeb","gcc:-DB_ENDIAN -DTERMIO ${debian_cflags}::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",