aboutsummaryrefslogtreecommitdiff
path: root/src/cpu
diff options
context:
space:
mode:
author: kvn <none@none> 2012-10-24 14:33:22 -0700
committer: kvn <none@none> 2012-10-24 14:33:22 -0700
commit: f1d6dc261e69a36e094118451f35c7f90b5fc7e1 (patch)
tree: 5f7dd4cb0f737345255d2abea1536834c2c63439 /src/cpu
parent: ed5447be6be3e258e9fc9e4892db615340177731 (diff)
7184394: add intrinsics to use AES instructions
Summary: Use new x86 AES instructions for AESCrypt. Reviewed-by: twisti, kvn, roland Contributed-by: tom.deneau@amd.com
Diffstat (limited to 'src/cpu')
-rw-r--r-- src/cpu/x86/vm/assembler_x86.cpp 97
-rw-r--r-- src/cpu/x86/vm/assembler_x86.hpp 25
-rw-r--r-- src/cpu/x86/vm/stubGenerator_x86_32.cpp 533
-rw-r--r-- src/cpu/x86/vm/stubGenerator_x86_64.cpp 552
-rw-r--r-- src/cpu/x86/vm/stubRoutines_x86_32.cpp 1
-rw-r--r-- src/cpu/x86/vm/stubRoutines_x86_32.hpp 4
-rw-r--r-- src/cpu/x86/vm/stubRoutines_x86_64.cpp 1
-rw-r--r-- src/cpu/x86/vm/stubRoutines_x86_64.hpp 5
-rw-r--r-- src/cpu/x86/vm/vm_version_x86.cpp 32
-rw-r--r-- src/cpu/x86/vm/vm_version_x86.hpp 10
10 files changed, 1257 insertions, 3 deletions
diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp
index 378c1f8c6..6b9677d30 100644
--- a/src/cpu/x86/vm/assembler_x86.cpp
+++ b/src/cpu/x86/vm/assembler_x86.cpp
@@ -1007,6 +1007,67 @@ void Assembler::addss(XMMRegister dst, Address src) {
emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
}
+void Assembler::aesdec(XMMRegister dst, Address src) { // AESDEC xmm, m128 (66 0F 38 DE /r): memory-source form
+ assert(VM_Version::supports_aes(), ""); // only legal when VM_Version reports AES support
+ InstructionMark im(this);
+ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); // 66 prefix, 0F 38 opcode map; dst is passed as both destination and first source
+ emit_byte(0xde); // opcode byte for AESDEC
+ emit_operand(dst, src); // operand bytes selecting dst and the memory location src
+}
+
+void Assembler::aesdec(XMMRegister dst, XMMRegister src) {
+ // AESDEC xmm1, xmm2 (66 0F 38 DE /r), register-source form.
+ // dst is passed as both the destination and the first source operand.
+ assert(VM_Version::supports_aes(), "");
+ const int reg_bits = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+ emit_byte(0xde);
+ emit_byte(0xC0 | reg_bits); // 0xC0 selects the register-direct ModRM form
+}
+
+void Assembler::aesdeclast(XMMRegister dst, Address src) { // AESDECLAST xmm, m128 (66 0F 38 DF /r): memory-source form
+ assert(VM_Version::supports_aes(), ""); // only legal when VM_Version reports AES support
+ InstructionMark im(this);
+ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); // 66 prefix, 0F 38 opcode map; dst is passed as both destination and first source
+ emit_byte(0xdf); // opcode byte for AESDECLAST
+ emit_operand(dst, src); // operand bytes selecting dst and the memory location src
+}
+
+void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) {
+ // AESDECLAST xmm1, xmm2 (66 0F 38 DF /r), register-source form.
+ // dst is passed as both the destination and the first source operand.
+ assert(VM_Version::supports_aes(), "");
+ const int reg_bits = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+ emit_byte(0xdf);
+ emit_byte(0xC0 | reg_bits); // 0xC0 selects the register-direct ModRM form
+}
+
+void Assembler::aesenc(XMMRegister dst, Address src) { // AESENC xmm, m128 (66 0F 38 DC /r): memory-source form
+ assert(VM_Version::supports_aes(), ""); // only legal when VM_Version reports AES support
+ InstructionMark im(this);
+ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); // 66 prefix, 0F 38 opcode map; dst is passed as both destination and first source
+ emit_byte(0xdc); // opcode byte for AESENC
+ emit_operand(dst, src); // operand bytes selecting dst and the memory location src
+}
+
+void Assembler::aesenc(XMMRegister dst, XMMRegister src) {
+ // AESENC xmm1, xmm2 (66 0F 38 DC /r), register-source form.
+ // dst is passed as both the destination and the first source operand.
+ assert(VM_Version::supports_aes(), "");
+ const int reg_bits = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+ emit_byte(0xdc);
+ emit_byte(0xC0 | reg_bits); // 0xC0 selects the register-direct ModRM form
+}
+
+void Assembler::aesenclast(XMMRegister dst, Address src) { // AESENCLAST xmm, m128 (66 0F 38 DD /r): memory-source form
+ assert(VM_Version::supports_aes(), ""); // only legal when VM_Version reports AES support
+ InstructionMark im(this);
+ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); // 66 prefix, 0F 38 opcode map; dst is passed as both destination and first source
+ emit_byte(0xdd); // opcode byte for AESENCLAST
+ emit_operand(dst, src); // operand bytes selecting dst and the memory location src
+}
+
+void Assembler::aesenclast(XMMRegister dst, XMMRegister src) {
+ // AESENCLAST xmm1, xmm2 (66 0F 38 DD /r), register-source form.
+ // dst is passed as both the destination and the first source operand.
+ assert(VM_Version::supports_aes(), "");
+ const int reg_bits = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+ emit_byte(0xdd);
+ emit_byte(0xC0 | reg_bits); // 0xC0 selects the register-direct ModRM form
+}
+
+
void Assembler::andl(Address dst, int32_t imm32) {
InstructionMark im(this);
prefix(dst);
@@ -2307,6 +2368,22 @@ void Assembler::prefix(Prefix p) {
a_byte(p);
}
+void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
+ // PSHUFB xmm1, xmm2 (66 0F 38 00 /r), register-source form: src holds the
+ // shuffle control bytes, dst is shuffled in place.
+ assert(VM_Version::supports_ssse3(), "");
+ const int reg_bits = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+ emit_byte(0x00);
+ emit_byte(0xC0 | reg_bits); // 0xC0 selects the register-direct ModRM form
+}
+
+void Assembler::pshufb(XMMRegister dst, Address src) { // PSHUFB xmm, m128 (66 0F 38 00 /r): shuffle control bytes come from memory
+ assert(VM_Version::supports_ssse3(), ""); // only legal when VM_Version reports SSSE3 support
+ assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); // alignment of src cannot be checked here, so this raw form insists on UseAVX > 0
+ InstructionMark im(this);
+ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); // 66 prefix, 0F 38 opcode map; dst is passed as both destination and first source
+ emit_byte(0x00); // opcode byte for PSHUFB
+ emit_operand(dst, src); // operand bytes selecting dst and the memory location src
+}
+
void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
assert(isByte(mode), "invalid value");
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -8067,6 +8144,15 @@ void MacroAssembler::movptr(Address dst, Register src) {
LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
+void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
+ // Unaligned 128-bit load from a literal address. When the literal cannot be
+ // reached directly, its address is first materialized in rscratch1 (which is
+ // clobbered in that case) and the load goes through that register.
+ if (!reachable(src)) {
+   lea(rscratch1, src);
+   Assembler::movdqu(dst, Address(rscratch1, 0));
+   return;
+ }
+ Assembler::movdqu(dst, as_Address(src));
+}
+
void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
if (reachable(src)) {
Assembler::movsd(dst, as_Address(src));
@@ -8357,6 +8443,17 @@ void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
}
}
+void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
+ // Byte shuffle of dst using control bytes stored at a literal address.
+ // Without AVX the literal's target must be 16-byte aligned, which is what the
+ // assert below checks. When the literal cannot be reached directly, rscratch1
+ // is clobbered to hold its address.
+ assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+ if (!reachable(src)) {
+   lea(rscratch1, src);
+   Assembler::pshufb(dst, Address(rscratch1, 0));
+   return;
+ }
+ Assembler::pshufb(dst, as_Address(src));
+}
+
// AVX 3-operands instructions
void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp
index c936e13f5..8a9bbaf42 100644
--- a/src/cpu/x86/vm/assembler_x86.hpp
+++ b/src/cpu/x86/vm/assembler_x86.hpp
@@ -875,6 +875,17 @@ private:
void addss(XMMRegister dst, Address src);
void addss(XMMRegister dst, XMMRegister src);
+ // AES instructions (each comes in a memory-source and a register-source form; dst is updated in place)
+ void aesdec(XMMRegister dst, Address src); // AESDEC, src read from memory
+ void aesdec(XMMRegister dst, XMMRegister src); // AESDEC, src in a register
+ void aesdeclast(XMMRegister dst, Address src); // AESDECLAST, src read from memory
+ void aesdeclast(XMMRegister dst, XMMRegister src); // AESDECLAST, src in a register
+ void aesenc(XMMRegister dst, Address src); // AESENC, src read from memory
+ void aesenc(XMMRegister dst, XMMRegister src); // AESENC, src in a register
+ void aesenclast(XMMRegister dst, Address src); // AESENCLAST, src read from memory
+ void aesenclast(XMMRegister dst, XMMRegister src); // AESENCLAST, src in a register
+
+
void andl(Address dst, int32_t imm32);
void andl(Register dst, int32_t imm32);
void andl(Register dst, Address src);
@@ -1424,6 +1435,10 @@ private:
void prefetcht2(Address src);
void prefetchw(Address src);
+ // Shuffle Bytes (PSHUFB): dst is shuffled in place, src supplies the control bytes
+ void pshufb(XMMRegister dst, XMMRegister src); // control bytes in a register
+ void pshufb(XMMRegister dst, Address src); // control bytes in memory; the definition asserts UseAVX > 0
+
// Shuffle Packed Doublewords
void pshufd(XMMRegister dst, XMMRegister src, int mode);
void pshufd(XMMRegister dst, Address src, int mode);
@@ -2611,6 +2626,12 @@ public:
void divss(XMMRegister dst, Address src) { Assembler::divss(dst, src); }
void divss(XMMRegister dst, AddressLiteral src);
+ // Move Unaligned Double Quadword (the first three overloads forward unchanged to Assembler)
+ void movdqu(Address dst, XMMRegister src) { Assembler::movdqu(dst, src); } // store to memory
+ void movdqu(XMMRegister dst, Address src) { Assembler::movdqu(dst, src); } // load from memory
+ void movdqu(XMMRegister dst, XMMRegister src) { Assembler::movdqu(dst, src); } // register-to-register copy
+ void movdqu(XMMRegister dst, AddressLiteral src); // load from a literal address; defined out of line
+
void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); }
void movsd(Address dst, XMMRegister src) { Assembler::movsd(dst, src); }
void movsd(XMMRegister dst, Address src) { Assembler::movsd(dst, src); }
@@ -2658,6 +2679,10 @@ public:
void xorps(XMMRegister dst, Address src) { Assembler::xorps(dst, src); }
void xorps(XMMRegister dst, AddressLiteral src);
+ // Shuffle Bytes (the first two overloads forward unchanged to Assembler)
+ void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); } // control bytes in a register
+ void pshufb(XMMRegister dst, Address src) { Assembler::pshufb(dst, src); } // control bytes in memory
+ void pshufb(XMMRegister dst, AddressLiteral src); // control bytes at a literal address; defined out of line
// AVX 3-operands instructions
void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); }
diff --git a/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/src/cpu/x86/vm/stubGenerator_x86_32.cpp
index f149fde83..d8b61e0b2 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp
@@ -2137,6 +2137,529 @@ class StubGenerator: public StubCodeGenerator {
}
}
+ // AES intrinsic stubs
+ enum {AESBlockSize = 16};
+
+ address generate_key_shuffle_mask() {
+ // Emits a 16-byte-aligned, 16-byte pshufb control mask into the code buffer
+ // and returns its address. Each 32-bit word below lands in memory in
+ // little-endian order, so the mask reverses the bytes within every 32-bit lane.
+ static const int mask_words[] = { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
+ const int mask_word_count = 4;
+
+ __ align(16);
+ StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
+ address start = __ pc();
+ for (int i = 0; i < mask_word_count; i++) {
+   __ emit_data(mask_words[i], relocInfo::none, 0 );
+ }
+ return start;
+ }
+
+ // Loads the 16 bytes at key+offset into xmmdst (unaligned load) and byte-shuffles
+ // them with the key shuffle mask. If the caller already holds the mask in an xmm
+ // register it can pass it as xmm_shuf_mask; otherwise the mask is read from
+ // StubRoutines::x86::key_shuffle_mask_addr().
+ // NOTE(review): "no register" is spelled NULL here; if xmm0 compares equal to NULL
+ // in this code base, xmm0 cannot be used as the mask register - confirm.
+ void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+ __ movdqu(xmmdst, Address(key, offset));
+ if (xmm_shuf_mask == NULL) {
+   // mask not cached in a register: shuffle straight from the mask's memory copy
+   __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+   return;
+ }
+ __ pshufb(xmmdst, xmm_shuf_mask);
+ }
+
+ // Emits one aesenc of xmmdst with the key word at key+offset; xmmtmp is overwritten with that (shuffled) key word.
+ // xmm_shuf_mask is handed through to load_key, so a caller that keeps the shuffle mask in an xmm register can pass it here.
+ void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+ load_key(xmmtmp, key, offset, xmm_shuf_mask); // xmmtmp = byte-shuffled key word
+ __ aesenc(xmmdst, xmmtmp);
+ }
+
+ // Emits one aesdec of xmmdst with the key word at key+offset; xmmtmp is overwritten with that (shuffled) key word.
+ // xmm_shuf_mask is handed through to load_key, so a caller that keeps the shuffle mask in an xmm register can pass it here.
+ void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+ load_key(xmmtmp, key, offset, xmm_shuf_mask); // xmmtmp = byte-shuffled key word
+ __ aesdec(xmmdst, xmmtmp);
+ }
+
+
+ // Arguments (32-bit stub: read from the caller's stack via rbp, not from c_rarg registers):
+ //
+ // Inputs:
+ // [rbp+8] - source byte array address
+ // [rbp+12] - destination byte array address
+ // [rbp+16] - K (key) in little endian int array
+ //
+ address generate_aescrypt_encryptBlock() {
+ assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+ Label L_doLast;
+ address start = __ pc();
+
+ const Register from = rsi; // source array address
+ const Register to = rdx; // destination array address
+ const Register key = rcx; // key array address
+ const Register keylen = rax; // number of 128-bit key words beyond the minimum of 11
+ const Address from_param(rbp, 8+0);
+ const Address to_param (rbp, 8+4);
+ const Address key_param (rbp, 8+8);
+
+ const XMMRegister xmm_result = xmm0;
+ const XMMRegister xmm_temp = xmm1;
+ const XMMRegister xmm_key_shuf_mask = xmm2;
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+ __ push(rsi); // rsi is used for 'from' below and popped again before returning
+ __ movptr(from , from_param);
+ __ movptr(to , to_param);
+ __ movptr(key , key_param);
+
+ __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); // key points at the int data, so step back to the array length field
+ // keylen = # of 32-bit words, convert to 128-bit words
+ __ shrl(keylen, 2);
+ __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more
+
+ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
+
+ // For encryption, the java expanded key ordering is just what we need
+
+ load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+ __ pxor(xmm_result, xmm_temp); // initial xor with key word 0
+ for (int offset = 0x10; offset <= 0x90; offset += 0x10) { // nine aesenc steps common to all key sizes
+ aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+ }
+ load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
+ __ cmpl(keylen, 0); // zero extra key words: key word 0xa0 is the last one
+ __ jcc(Assembler::equal, L_doLast);
+ __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys
+ aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+ load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+ __ subl(keylen, 2); // exactly two extra key words: key word 0xc0 is the last one
+ __ jcc(Assembler::equal, L_doLast);
+ __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys
+ aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+ load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+ __ BIND(L_doLast); // xmm_temp holds the final key word on every path reaching here
+ __ aesenclast(xmm_result, xmm_temp);
+ __ movdqu(Address(to, 0), xmm_result); // store the result
+ __ xorptr(rax, rax); // return 0
+ __ pop(rsi);
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
+
+
+ // Arguments (32-bit stub: read from the caller's stack via rbp, not from c_rarg registers):
+ //
+ // Inputs:
+ // [rbp+8] - source byte array address
+ // [rbp+12] - destination byte array address
+ // [rbp+16] - K (key) in little endian int array
+ //
+ address generate_aescrypt_decryptBlock() {
+ assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+ Label L_doLast;
+ address start = __ pc();
+
+ const Register from = rsi; // source array address
+ const Register to = rdx; // destination array address
+ const Register key = rcx; // key array address
+ const Register keylen = rax; // number of 128-bit key words beyond the minimum of 11
+ const Address from_param(rbp, 8+0);
+ const Address to_param (rbp, 8+4);
+ const Address key_param (rbp, 8+8);
+
+ const XMMRegister xmm_result = xmm0;
+ const XMMRegister xmm_temp = xmm1;
+ const XMMRegister xmm_key_shuf_mask = xmm2;
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+ __ push(rsi); // rsi is used for 'from' below and popped again before returning
+ __ movptr(from , from_param);
+ __ movptr(to , to_param);
+ __ movptr(key , key_param);
+
+ __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); // key points at the int data, so step back to the array length field
+ // keylen = # of 32-bit words, convert to 128-bit words
+ __ shrl(keylen, 2);
+ __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more
+
+ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
+
+ // for decryption java expanded key ordering is rotated one position from what we want
+ // so we start from 0x10 here and hit 0x00 last
+ // we don't know if the key is aligned, hence not using load-execute form
+ load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
+ __ pxor (xmm_result, xmm_temp); // initial xor with the key word at 0x10
+ for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { // nine aesdec steps common to all key sizes
+ aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+ }
+ __ cmpl(keylen, 0); // zero extra key words: go straight to the final step
+ __ jcc(Assembler::equal, L_doLast);
+ // only in 192 and 256 bit keys
+ aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+ aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+ __ subl(keylen, 2); // exactly two extra key words: go to the final step
+ __ jcc(Assembler::equal, L_doLast);
+ // only in 256 bit keys
+ aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+ aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+ __ BIND(L_doLast);
+ // for decryption the aesdeclast operation is always on key+0x00
+ load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+ __ aesdeclast(xmm_result, xmm_temp);
+
+ __ movdqu(Address(to, 0), xmm_result); // store the result
+
+ __ xorptr(rax, rax); // return 0
+ __ pop(rsi);
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
+
+ // Emits code that spills (saving == true) or reloads (saving == false) rbx, rsi
+ // and rdi using fixed slots directly below rbp. Saving also reserves 4 words of
+ // stack, of which only 3 are used. The restore path does not re-adjust rsp; the
+ // callers visible here follow it with leave().
+ void handleSOERegisters(bool saving) {
+ const int reserved_bytes = 4 * wordSize;
+ const Address rdi_slot(rbp, -1 * wordSize);
+ const Address rsi_slot(rbp, -2 * wordSize);
+ const Address rbx_slot(rbp, -3 * wordSize);
+
+ if (!saving) {
+   // restoring
+   __ movptr(rsi, rsi_slot);
+   __ movptr(rdi, rdi_slot);
+   __ movptr(rbx, rbx_slot);
+   return;
+ }
+
+ __ subptr(rsp, reserved_bytes);
+ __ movptr(rsi_slot, rsi);
+ __ movptr(rdi_slot, rdi);
+ __ movptr(rbx_slot, rbx);
+ }
+
+ // CBC AES Encryption (32-bit stub, one block per loop iteration).
+ //
+ // Arguments (read from the caller's stack via rbp, not from c_rarg registers):
+ //
+ // Inputs:
+ // [rbp+8] - source byte array address
+ // [rbp+12] - destination byte array address
+ // [rbp+16] - K (key) in little endian int array
+ // [rbp+20] - r vector byte array address
+ // [rbp+24] - input length in bytes (must be a multiple of AESBlockSize)
+ //
+ // The generated code returns 0 in rax and leaves the last cipher block in the
+ // r vector. A zero input length is a no-op: the r vector is written back
+ // unchanged and no input or output bytes are touched.
+ //
+ address generate_cipherBlockChaining_encryptAESCrypt() {
+ assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+ address start = __ pc();
+
+ Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
+ const Register from = rsi; // source array address
+ const Register to = rdx; // destination array address
+ const Register key = rcx; // key array address
+ const Register rvec = rdi; // r byte array initialized from initvector array address
+ // and left with the results of the last encryption block
+ const Register len_reg = rbx; // src len (must be multiple of blocksize 16)
+ const Register pos = rax; // byte offset of the current block within from/to
+
+ // xmm register assignments for the loops below
+ const XMMRegister xmm_result = xmm0;
+ const XMMRegister xmm_temp = xmm1;
+ // first 6 keys preloaded into xmm2-xmm7
+ const int XMM_REG_NUM_KEY_FIRST = 2;
+ const int XMM_REG_NUM_KEY_LAST = 7;
+ const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+ handleSOERegisters(true /*saving*/);
+
+ // load registers from incoming parameters
+ const Address from_param(rbp, 8+0);
+ const Address to_param (rbp, 8+4);
+ const Address key_param (rbp, 8+8);
+ const Address rvec_param (rbp, 8+12);
+ const Address len_param (rbp, 8+16);
+ __ movptr(from , from_param);
+ __ movptr(to , to_param);
+ __ movptr(key , key_param);
+ __ movptr(rvec , rvec_param);
+ __ movptr(len_reg , len_param);
+
+ const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
+ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ // load up xmm regs 2 thru 7 with keys 0-5
+ for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+ offset += 0x10;
+ }
+
+ __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
+
+ // Every loop below processes a block before it tests len_reg, so an empty input
+ // must be filtered out here; L_exit then just writes the unchanged r vector back.
+ __ cmpptr(len_reg, 0);
+ __ jcc(Assembler::equal, L_exit);
+
+ // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, or 60=256))
+ __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+ __ cmpl(rax, 44);
+ __ jcc(Assembler::notEqual, L_key_192_256);
+
+ // 128 bit code follows here
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_loopTop_128);
+ __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
+ __ pxor (xmm_result, xmm_temp); // xor with the current r vector
+
+ __ pxor (xmm_result, xmm_key0); // do the aes rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ __ aesenc(xmm_result, as_XMMRegister(rnum));
+ }
+ // remaining keys are not cached in registers; xmm_temp no longer holds the
+ // shuffle mask, so these helpers shuffle from the mask's memory copy
+ for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
+ aes_enc_key(xmm_result, xmm_temp, key, key_offset);
+ }
+ load_key(xmm_temp, key, 0xa0);
+ __ aesenclast(xmm_result, xmm_temp);
+
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual, L_loopTop_128);
+
+ __ BIND(L_exit);
+ __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
+
+ handleSOERegisters(false /*restoring*/);
+ __ movl(rax, 0); // return 0 (why?)
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ __ BIND(L_key_192_256);
+ // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+ __ cmpl(rax, 52);
+ __ jcc(Assembler::notEqual, L_key_256);
+
+ // 192-bit code follows here (could be changed to use more xmm registers)
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_loopTop_192);
+ __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
+ __ pxor (xmm_result, xmm_temp); // xor with the current r vector
+
+ __ pxor (xmm_result, xmm_key0); // do the aes rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ __ aesenc(xmm_result, as_XMMRegister(rnum));
+ }
+ for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
+ aes_enc_key(xmm_result, xmm_temp, key, key_offset);
+ }
+ load_key(xmm_temp, key, 0xc0);
+ __ aesenclast(xmm_result, xmm_temp);
+
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual, L_loopTop_192);
+ __ jmp(L_exit);
+
+ __ BIND(L_key_256);
+ // 256-bit code follows here (could be changed to use more xmm registers)
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_loopTop_256);
+ __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
+ __ pxor (xmm_result, xmm_temp); // xor with the current r vector
+
+ __ pxor (xmm_result, xmm_key0); // do the aes rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ __ aesenc(xmm_result, as_XMMRegister(rnum));
+ }
+ for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
+ aes_enc_key(xmm_result, xmm_temp, key, key_offset);
+ }
+ load_key(xmm_temp, key, 0xe0);
+ __ aesenclast(xmm_result, xmm_temp);
+
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual, L_loopTop_256);
+ __ jmp(L_exit);
+
+ return start;
+ }
+
+
+ // CBC AES Decryption.
+ // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time.
+ //
+ // Arguments (read from the caller's stack via rbp, not from c_rarg registers):
+ //
+ // Inputs:
+ // [rbp+8] - source byte array address
+ // [rbp+12] - destination byte array address
+ // [rbp+16] - K (key) in little endian int array
+ // [rbp+20] - r vector byte array address
+ // [rbp+24] - input length in bytes (must be a multiple of AESBlockSize)
+ //
+ // The generated code returns 0 in rax and leaves the last cipher block it
+ // consumed in the r vector. A zero input length is a no-op.
+ //
+ // NOTE(review): each block is xor-ed with the previous cipher block re-read from
+ // the source array after the output has been stored, so source and destination
+ // must not overlap - presumably guaranteed by the caller; confirm.
+ //
+
+ address generate_cipherBlockChaining_decryptAESCrypt() {
+ assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+ address start = __ pc();
+
+ Label L_exit, L_key_192_256, L_key_256;
+ Label L_singleBlock_loopTop_128;
+ Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
+ const Register from = rsi; // source array address
+ const Register to = rdx; // destination array address
+ const Register key = rcx; // key array address
+ const Register rvec = rdi; // r byte array initialized from initvector array address
+ // and left with the last cipher block consumed
+ const Register len_reg = rbx; // src len (must be multiple of blocksize 16)
+ const Register pos = rax; // byte offset of the current block within from/to
+
+ // xmm register assignments for the loops below
+ const XMMRegister xmm_result = xmm0;
+ const XMMRegister xmm_temp = xmm1;
+ // first 6 keys preloaded into xmm2-xmm7
+ const int XMM_REG_NUM_KEY_FIRST = 2;
+ const int XMM_REG_NUM_KEY_LAST = 7;
+ const int FIRST_NON_REG_KEY_offset = 0x70; // key offsets 0x10-0x60 live in xmm2-xmm7
+ const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+ handleSOERegisters(true /*saving*/);
+
+ // load registers from incoming parameters
+ const Address from_param(rbp, 8+0);
+ const Address to_param (rbp, 8+4);
+ const Address key_param (rbp, 8+8);
+ const Address rvec_param (rbp, 8+12);
+ const Address len_param (rbp, 8+16);
+ __ movptr(from , from_param);
+ __ movptr(to , to_param);
+ __ movptr(key , key_param);
+ __ movptr(rvec , rvec_param);
+ __ movptr(len_reg , len_param);
+
+ // the java expanded key ordering is rotated one position from what we want
+ // so we start from 0x10 here and hit 0x00 last
+ const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
+ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ // load up xmm regs 2 thru 7 with the first 6 keys (offsets 0x10 thru 0x60)
+ for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+ offset += 0x10;
+ }
+
+ // inside here, use the rvec register to point to previous block cipher
+ // with which we xor at the end of each newly decrypted block
+ const Register prev_block_cipher_ptr = rvec;
+
+ // The 192- and 256-bit loops below process a block before they test len_reg, so an
+ // empty input must be filtered out here for every key size. At this point
+ // prev_block_cipher_ptr still points at the r vector, so L_exit writes it back unchanged.
+ __ cmpptr(len_reg, 0);
+ __ jcc(Assembler::equal, L_exit);
+
+ // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, or 60=256))
+ __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+ __ cmpl(rax, 44);
+ __ jcc(Assembler::notEqual, L_key_192_256);
+
+
+ // 128-bit code follows here (one block per iteration, not parallelized in this 32-bit stub)
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_singleBlock_loopTop_128);
+ __ cmpptr(len_reg, 0); // any blocks left??
+ __ jcc(Assembler::equal, L_exit);
+ __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
+ __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ __ aesdec(xmm_result, as_XMMRegister(rnum));
+ }
+ // remaining keys are not cached in registers; xmm_temp no longer holds the
+ // shuffle mask, so these helpers shuffle from the mask's memory copy
+ for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) { // 128-bit runs up to key offset a0
+ aes_dec_key(xmm_result, xmm_temp, key, key_offset);
+ }
+ load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0
+ __ aesdeclast(xmm_result, xmm_temp);
+ __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+ __ pxor (xmm_result, xmm_temp); // xor with the current r vector
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jmp(L_singleBlock_loopTop_128);
+
+
+ __ BIND(L_exit);
+ __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); // last cipher block consumed (or the original r vector)
+ __ movptr(rvec , rvec_param); // restore this since used in loop
+ __ movdqu(Address(rvec, 0), xmm_temp); // final value of r stored in rvec of CipherBlockChaining object
+ handleSOERegisters(false /*restoring*/);
+ __ movl(rax, 0); // return 0 (why?)
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+
+ __ BIND(L_key_192_256);
+ // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+ __ cmpl(rax, 52);
+ __ jcc(Assembler::notEqual, L_key_256);
+
+ // 192-bit code follows here (could be optimized to use parallelism)
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_singleBlock_loopTop_192);
+ __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
+ __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ __ aesdec(xmm_result, as_XMMRegister(rnum));
+ }
+ for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) { // 192-bit runs up to key offset c0
+ aes_dec_key(xmm_result, xmm_temp, key, key_offset);
+ }
+ load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0
+ __ aesdeclast(xmm_result, xmm_temp);
+ __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+ __ pxor (xmm_result, xmm_temp); // xor with the current r vector
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
+ __ jmp(L_exit);
+
+ __ BIND(L_key_256);
+ // 256-bit code follows here (could be optimized to use parallelism)
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_singleBlock_loopTop_256);
+ __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
+ __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ __ aesdec(xmm_result, as_XMMRegister(rnum));
+ }
+ for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) { // 256-bit runs up to key offset e0
+ aes_dec_key(xmm_result, xmm_temp, key, key_offset);
+ }
+ load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0
+ __ aesdeclast(xmm_result, xmm_temp);
+ __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
+ __ pxor (xmm_result, xmm_temp); // xor with the current r vector
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
+ __ jmp(L_exit);
+
+ return start;
+ }
+
+
public:
// Information about frame layout at time of blocking runtime call.
// Note that we only have to preserve callee-saved registers since
@@ -2332,6 +2855,16 @@ class StubGenerator: public StubCodeGenerator {
generate_arraycopy_stubs();
generate_math_stubs();
+
+ // don't bother generating these AES intrinsic stubs unless global flag is set
+ if (UseAESIntrinsics) {
+ StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // might be needed by the others
+
+ StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+ StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+ StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+ StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
+ }
}
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 8ae595a56..3e223387c 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -2941,6 +2941,548 @@ class StubGenerator: public StubCodeGenerator {
}
}
+ // AES intrinsic stubs
+ enum {AESBlockSize = 16}; // bytes per AES block; the CBC stubs below step pos and len_reg by this amount
+
+ address generate_key_shuffle_mask() { // emits the 16 bytes of pshufb mask data that load_key uses by default; returns their address
+ __ align(16); // place the mask data on a 16-byte boundary
+ StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
+ address start = __ pc(); // address of the first mask byte
+ __ emit_data64( 0x0405060700010203, relocInfo::none ); // first 8 bytes of the mask
+ __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none ); // last 8 bytes of the mask
+ return start;
+ }
+
+ // Loads the 128-bit key word at key+offset into xmmdst, then shuffles its bytes with pshufb.
+ // Mask source: xmm_shuf_mask when supplied, else the key_shuffle_mask stub data. NOTE(review): NULL is the "no register" sentinel - confirm no valid XMMRegister compares equal to NULL.
+ void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+ __ movdqu(xmmdst, Address(key, offset));
+ if (xmm_shuf_mask == NULL) {
+ __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ } else {
+ __ pshufb(xmmdst, xmm_shuf_mask);
+ }
+ }
+
+ // Emits one aesenc round on xmmdst using the key word at key+offset; the key is loaded into xmmtmp (via load_key), overwriting it
+ // can optionally specify that the shuffle mask is already in an xmmregister (passed straight through to load_key)
+ void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+ load_key(xmmtmp, key, offset, xmm_shuf_mask);
+ __ aesenc(xmmdst, xmmtmp);
+ }
+
+ // Emits one aesdec round on xmmdst using the key word at key+offset; the key is loaded into xmmtmp (via load_key), overwriting it
+ // can optionally specify that the shuffle mask is already in an xmmregister (passed straight through to load_key)
+ void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+ load_key(xmmtmp, key, offset, xmm_shuf_mask);
+ __ aesdec(xmmdst, xmmtmp);
+ }
+
+
+ // Generates the stub that AES-encrypts one 16-byte block; returns the stub's entry address.
+ //
+ // Inputs:
+ // c_rarg0 - source byte array address
+ // c_rarg1 - destination byte array address
+ // c_rarg2 - K (key) in little endian int array
+ // Output: rax is zeroed before the stub returns
+ address generate_aescrypt_encryptBlock() {
+ assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+ Label L_doLast;
+ address start = __ pc();
+
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register key = c_rarg2; // key array address
+ const Register keylen = rax; // scratch: ends up holding the number of key words beyond the minimum (see below)
+
+ const XMMRegister xmm_result = xmm0; // the block being transformed
+ const XMMRegister xmm_temp = xmm1; // most recently loaded round key
+ const XMMRegister xmm_key_shuf_mask = xmm2; // shuffle mask handed to load_key for every key word
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+ __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+ // keylen = # of 32-bit words, convert to 128-bit words
+ __ shrl(keylen, 2);
+ __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more; keylen is now the count of extra words
+
+ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
+
+ // For encryption, the java expanded key ordering is just what we need
+ // we don't know if the key is aligned, hence not using load-execute form
+
+ load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+ __ pxor(xmm_result, xmm_temp); // initial xor with the key word at offset 0x00
+ for (int offset = 0x10; offset <= 0x90; offset += 0x10) { // nine aesenc rounds emitted for every key size
+ aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+ }
+ load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
+ __ cmpl(keylen, 0); // no extra key words: 0xa0 is the final round key
+ __ jcc(Assembler::equal, L_doLast);
+ __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys
+ aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+ load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+ __ subl(keylen, 2); // exactly two extra key words: 0xc0 is the final round key
+ __ jcc(Assembler::equal, L_doLast);
+ __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys
+ aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+ load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+ __ BIND(L_doLast); // on every path xmm_temp holds the final round key here
+ __ aesenclast(xmm_result, xmm_temp);
+ __ movdqu(Address(to, 0), xmm_result); // store the result
+ __ xorptr(rax, rax); // return 0
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
+
+
+ // Generates the stub that AES-decrypts one 16-byte block; returns the stub's entry address.
+ //
+ // Inputs:
+ // c_rarg0 - source byte array address
+ // c_rarg1 - destination byte array address
+ // c_rarg2 - K (key) in little endian int array
+ // Output: rax is zeroed before the stub returns
+ address generate_aescrypt_decryptBlock() {
+ assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+ Label L_doLast;
+ address start = __ pc();
+
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register key = c_rarg2; // key array address
+ const Register keylen = rax; // scratch: ends up holding the number of key words beyond the minimum (see below)
+
+ const XMMRegister xmm_result = xmm0; // the block being transformed
+ const XMMRegister xmm_temp = xmm1; // most recently loaded round key
+ const XMMRegister xmm_key_shuf_mask = xmm2; // shuffle mask handed to load_key for every key word
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+ __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+ // keylen = # of 32-bit words, convert to 128-bit words
+ __ shrl(keylen, 2);
+ __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more; keylen is now the count of extra words
+
+ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
+
+ // for decryption java expanded key ordering is rotated one position from what we want
+ // so we start from 0x10 here and hit 0x00 last
+ // we don't know if the key is aligned, hence not using load-execute form
+ load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
+ __ pxor (xmm_result, xmm_temp); // initial xor with the key word at offset 0x10
+ for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { // nine aesdec rounds emitted for every key size
+ aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+ }
+ __ cmpl(keylen, 0); // no extra key words: go straight to the last round
+ __ jcc(Assembler::equal, L_doLast);
+ // only in 192 and 256 bit keys
+ aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+ aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+ __ subl(keylen, 2); // exactly two extra key words: go to the last round
+ __ jcc(Assembler::equal, L_doLast);
+ // only in 256 bit keys
+ aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+ aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+ __ BIND(L_doLast);
+ // for decryption the aesdeclast operation is always on key+0x00
+ load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+ __ aesdeclast(xmm_result, xmm_temp);
+
+ __ movdqu(Address(to, 0), xmm_result); // store the result
+
+ __ xorptr(rax, rax); // return 0
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
+
+
+ // Generates the AES/CBC encryption stub (one block per loop iteration, separate loops per key size); returns its entry address.
+ //
+ // Inputs:
+ // c_rarg0 - source byte array address
+ // c_rarg1 - destination byte array address
+ // c_rarg2 - K (key) in little endian int array
+ // c_rarg3 - r vector byte array address
+ // c_rarg4 - input length
+ // Output: rax = 0; the last cipher block produced is written back to the r vector
+ address generate_cipherBlockChaining_encryptAESCrypt() {
+ assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+ address start = __ pc();
+
+ Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register key = c_rarg2; // key array address
+ const Register rvec = c_rarg3; // r byte array initialized from initvector array address
+ // and left with the results of the last encryption block
+#ifndef _WIN64
+ const Register len_reg = c_rarg4; // src len (must be a non-zero multiple of blocksize 16: the loops below only test for zero after a block is done)
+#else
+ const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64
+ const Register len_reg = r10; // pick the first volatile windows register
+#endif
+ const Register pos = rax; // byte offset of the current block within from/to
+
+ // xmm register assignments for the loops below
+ const XMMRegister xmm_result = xmm0; // running CBC value: previous cipher block, then the new cipher block
+ const XMMRegister xmm_temp = xmm1; // input block, or a round key reloaded from memory (192/256-bit paths)
+ // keys 0-10 preloaded into xmm2-xmm12
+ const int XMM_REG_NUM_KEY_FIRST = 2;
+ const int XMM_REG_NUM_KEY_LAST = 12;
+ const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+ const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+ // on win64, fill len_reg from stack position
+ __ movl(len_reg, len_mem);
+ // save the xmm registers which must be preserved 6-12
+ __ subptr(rsp, -rsp_after_call_off * wordSize); // NOTE(review): presumably reserves the stack area that xmm_save() addresses - confirm
+ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+ __ movdqu(xmm_save(i), as_XMMRegister(i));
+ }
+#endif
+
+ const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
+ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ // load up xmm regs 2 thru 12 with key 0x00 - 0xa0
+ for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+ offset += 0x10;
+ }
+
+ __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
+
+ // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, or 60=256))
+ __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+ __ cmpl(rax, 44);
+ __ jcc(Assembler::notEqual, L_key_192_256);
+
+ // 128 bit code follows here
+ __ movptr(pos, 0); // pos shares rax with the key length, which is not needed again on this path
+ __ align(OptoLoopAlignment);
+ __ BIND(L_loopTop_128);
+ __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
+ __ pxor (xmm_result, xmm_temp); // xor with the current r vector
+
+ __ pxor (xmm_result, xmm_key0); // do the aes rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+ __ aesenc(xmm_result, as_XMMRegister(rnum));
+ }
+ __ aesenclast(xmm_result, xmm_key10);
+
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual, L_loopTop_128);
+
+ __ BIND(L_exit); // common exit, also reached by jmp from the 192- and 256-bit loops
+ __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
+
+#ifdef _WIN64
+ // restore xmm regs belonging to calling function
+ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+ __ movdqu(as_XMMRegister(i), xmm_save(i));
+ }
+#endif
+ __ movl(rax, 0); // return 0 (why?)
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ __ BIND(L_key_192_256);
+ // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+ __ cmpl(rax, 52);
+ __ jcc(Assembler::notEqual, L_key_256);
+
+ // 192-bit code follows here (could be changed to use more xmm registers)
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_loopTop_192);
+ __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
+ __ pxor (xmm_result, xmm_temp); // xor with the current r vector
+
+ __ pxor (xmm_result, xmm_key0); // do the aes rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ __ aesenc(xmm_result, as_XMMRegister(rnum));
+ }
+ aes_enc_key(xmm_result, xmm_temp, key, 0xb0); // keys past 0xa0 are not kept in registers: reloaded from memory every iteration
+ load_key(xmm_temp, key, 0xc0);
+ __ aesenclast(xmm_result, xmm_temp);
+
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual, L_loopTop_192);
+ __ jmp(L_exit);
+
+ __ BIND(L_key_256);
+ // 256-bit code follows here (could be changed to use more xmm registers)
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_loopTop_256);
+ __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
+ __ pxor (xmm_result, xmm_temp); // xor with the current r vector
+
+ __ pxor (xmm_result, xmm_key0); // do the aes rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ __ aesenc(xmm_result, as_XMMRegister(rnum));
+ }
+ aes_enc_key(xmm_result, xmm_temp, key, 0xb0); // keys past 0xa0 are not kept in registers: reloaded from memory every iteration
+ aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
+ aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
+ load_key(xmm_temp, key, 0xe0);
+ __ aesenclast(xmm_result, xmm_temp);
+
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual, L_loopTop_256);
+ __ jmp(L_exit);
+
+ return start;
+ }
+
+
+
+ // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
+ // to hide instruction latency (128-bit keys only; the 192- and 256-bit paths below use single-block loops)
+ //
+ // Arguments:
+ //
+ // Inputs:
+ // c_rarg0 - source byte array address
+ // c_rarg1 - destination byte array address
+ // c_rarg2 - K (key) in little endian int array
+ // c_rarg3 - r vector byte array address
+ // c_rarg4 - input length
+ // Output: rax = 0; the last cipher (input) block consumed is written back to the r vector
+
+ address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
+ assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+ address start = __ pc();
+
+ Label L_exit, L_key_192_256, L_key_256;
+ Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
+ Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register key = c_rarg2; // key array address
+ const Register rvec = c_rarg3; // r byte array initialized from initvector array address
+ // and left holding the last cipher (input) block processed
+#ifndef _WIN64
+ const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
+#else
+ const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64
+ const Register len_reg = r10; // pick the first volatile windows register
+#endif
+ const Register pos = rax; // byte offset of the current block within from/to
+
+ // xmm register assignments for the loops below
+ const XMMRegister xmm_result = xmm0;
+ // round keys preloaded into xmm5-xmm15: key offsets 0x10-0xa0 in xmm5-xmm14, key offset 0x00 in xmm15
+ const int XMM_REG_NUM_KEY_FIRST = 5;
+ const int XMM_REG_NUM_KEY_LAST = 15;
+ const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+ const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+ // on win64, fill len_reg from stack position
+ __ movl(len_reg, len_mem);
+ // save the xmm registers which must be preserved 6-15
+ __ subptr(rsp, -rsp_after_call_off * wordSize); // NOTE(review): presumably reserves the stack area that xmm_save() addresses - confirm
+ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+ __ movdqu(xmm_save(i), as_XMMRegister(i));
+ }
+#endif
+ // the java expanded key ordering is rotated one position from what we want
+ // so we start from 0x10 here and hit 0x00 last
+ const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
+ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
+ for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+ if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
+ load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+ offset += 0x10;
+ }
+
+ const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block (reuses xmm1 now that the shuffle mask is no longer needed in a register)
+ // registers holding the four results in the parallelized loop
+ const XMMRegister xmm_result0 = xmm0;
+ const XMMRegister xmm_result1 = xmm2;
+ const XMMRegister xmm_result2 = xmm3;
+ const XMMRegister xmm_result3 = xmm4;
+
+ __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
+
+ // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, or 60=256))
+ __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+ __ cmpl(rax, 44);
+ __ jcc(Assembler::notEqual, L_key_192_256);
+
+
+ // 128-bit code follows here, parallelized
+ __ movptr(pos, 0); // pos shares rax with the key length, which is not needed again on this path
+ __ align(OptoLoopAlignment);
+ __ BIND(L_multiBlock_loopTop_128);
+ __ cmpptr(len_reg, 4*AESBlockSize); // see if at least 4 blocks left
+ __ jcc(Assembler::less, L_singleBlock_loopTop_128);
+
+ __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize)); // get next 4 blocks into xmmresult registers
+ __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
+ __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
+ __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));
+
+#define DoFour(opc, src_reg) \
+ __ opc(xmm_result0, src_reg); \
+ __ opc(xmm_result1, src_reg); \
+ __ opc(xmm_result2, src_reg); \
+ __ opc(xmm_result3, src_reg);
+
+ DoFour(pxor, xmm_key_first);
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+ DoFour(aesdec, as_XMMRegister(rnum));
+ }
+ DoFour(aesdeclast, xmm_key_last);
+ // for each result, xor with the r vector of previous cipher block
+ __ pxor(xmm_result0, xmm_prev_block_cipher);
+ __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize)); // cipher blocks are re-read from the source buffer for chaining
+ __ pxor(xmm_result1, xmm_prev_block_cipher);
+ __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
+ __ pxor(xmm_result2, xmm_prev_block_cipher);
+ __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
+ __ pxor(xmm_result3, xmm_prev_block_cipher);
+ __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize)); // this will carry over to next set of blocks
+
+ __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output
+ __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
+ __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
+ __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);
+
+ __ addptr(pos, 4*AESBlockSize);
+ __ subptr(len_reg, 4*AESBlockSize);
+ __ jmp(L_multiBlock_loopTop_128);
+
+ // registers used in the non-parallelized loops
+ const XMMRegister xmm_prev_block_cipher_save = xmm2; // same register as xmm_result1 of the 4-block loop
+ const XMMRegister xmm_temp = xmm3; // same register as xmm_result2 of the 4-block loop
+
+ __ align(OptoLoopAlignment);
+ __ BIND(L_singleBlock_loopTop_128);
+ __ cmpptr(len_reg, 0); // any blocks left??
+ __ jcc(Assembler::equal, L_exit);
+ __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
+ __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
+ __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+ __ aesdec(xmm_result, as_XMMRegister(rnum));
+ }
+ __ aesdeclast(xmm_result, xmm_key_last);
+ __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
+
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jmp(L_singleBlock_loopTop_128);
+
+
+ __ BIND(L_exit); // common exit, also reached by jmp from the 192- and 256-bit loops
+ __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
+#ifdef _WIN64
+ // restore regs belonging to calling function
+ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+ __ movdqu(as_XMMRegister(i), xmm_save(i));
+ }
+#endif
+ __ movl(rax, 0); // return 0 (why?)
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+
+ __ BIND(L_key_192_256);
+ // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+ __ cmpl(rax, 52);
+ __ jcc(Assembler::notEqual, L_key_256);
+
+ // 192-bit code follows here (could be optimized to use parallelism)
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_singleBlock_loopTop_192); // unlike the 128-bit loops, len_reg is only tested after a block is done, so it must be non-zero on entry
+ __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
+ __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
+ __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+ __ aesdec(xmm_result, as_XMMRegister(rnum));
+ }
+ aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 192-bit key goes up to c0
+ aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+ __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
+ __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
+
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
+ __ jmp(L_exit);
+
+ __ BIND(L_key_256);
+ // 256-bit code follows here (could be optimized to use parallelism)
+ __ movptr(pos, 0);
+ __ align(OptoLoopAlignment);
+ __ BIND(L_singleBlock_loopTop_256); // unlike the 128-bit loops, len_reg is only tested after a block is done, so it must be non-zero on entry
+ __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
+ __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
+ __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
+ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+ __ aesdec(xmm_result, as_XMMRegister(rnum));
+ }
+ aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 256-bit key goes up to e0
+ aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+ aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
+ aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
+ __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0
+ __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector
+ __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+ // no need to store r to memory until we exit
+ __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
+
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
+ __ jmp(L_exit);
+
+ return start;
+ }
+
+
+
#undef __
#define __ masm->
@@ -3135,6 +3677,16 @@ class StubGenerator: public StubCodeGenerator {
generate_arraycopy_stubs();
generate_math_stubs();
+
+ // don't bother generating these AES intrinsic stubs unless global flag is set
+ if (UseAESIntrinsics) {
+ StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others
+
+ StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+ StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+ StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+ StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
+ }
}
public:
diff --git a/src/cpu/x86/vm/stubRoutines_x86_32.cpp b/src/cpu/x86/vm/stubRoutines_x86_32.cpp
index 6ec4121b9..cfd4f33a6 100644
--- a/src/cpu/x86/vm/stubRoutines_x86_32.cpp
+++ b/src/cpu/x86/vm/stubRoutines_x86_32.cpp
@@ -44,3 +44,4 @@
address StubRoutines::x86::_verify_mxcsr_entry = NULL;
address StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = NULL;
+address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
diff --git a/src/cpu/x86/vm/stubRoutines_x86_32.hpp b/src/cpu/x86/vm/stubRoutines_x86_32.hpp
index 64767c8ad..d53124fc6 100644
--- a/src/cpu/x86/vm/stubRoutines_x86_32.hpp
+++ b/src/cpu/x86/vm/stubRoutines_x86_32.hpp
@@ -41,10 +41,14 @@ class x86 {
private:
static address _verify_mxcsr_entry;
static address _verify_fpu_cntrl_wrd_entry;
+ // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers
+ static address _key_shuffle_mask_addr;
public:
static address verify_mxcsr_entry() { return _verify_mxcsr_entry; }
static address verify_fpu_cntrl_wrd_entry() { return _verify_fpu_cntrl_wrd_entry; }
+ static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
+
};
static bool returns_to_call_stub(address return_pc) { return return_pc == _call_stub_return_address; }
diff --git a/src/cpu/x86/vm/stubRoutines_x86_64.cpp b/src/cpu/x86/vm/stubRoutines_x86_64.cpp
index 084bbf8fb..cf8ec5d7b 100644
--- a/src/cpu/x86/vm/stubRoutines_x86_64.cpp
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.cpp
@@ -56,3 +56,4 @@ address StubRoutines::x86::_float_sign_flip = NULL;
address StubRoutines::x86::_double_sign_mask = NULL;
address StubRoutines::x86::_double_sign_flip = NULL;
address StubRoutines::x86::_mxcsr_std = NULL;
+address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
diff --git a/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/src/cpu/x86/vm/stubRoutines_x86_64.hpp
index 9b9cede4f..c3efeecb7 100644
--- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp
@@ -54,6 +54,8 @@ class x86 {
static address _double_sign_mask;
static address _double_sign_flip;
static address _mxcsr_std;
+ // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers
+ static address _key_shuffle_mask_addr;
public:
@@ -116,6 +118,9 @@ class x86 {
{
return _mxcsr_std;
}
+
+ static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
+
};
#endif // CPU_X86_VM_STUBROUTINES_X86_64_HPP
diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp
index bf7b3c213..182b0ab1a 100644
--- a/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/src/cpu/x86/vm/vm_version_x86.cpp
@@ -419,13 +419,16 @@ void VM_Version::get_processor_features() {
if (UseAVX < 1)
_cpuFeatures &= ~CPU_AVX;
+ if (!UseAES && !FLAG_IS_DEFAULT(UseAES))
+ _cpuFeatures &= ~CPU_AES;
+
if (logical_processors_per_package() == 1) {
// HT processor could be installed on a system which doesn't support HT.
_cpuFeatures &= ~CPU_HT;
}
char buf[256];
- jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+ jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""),
@@ -441,6 +444,7 @@ void VM_Version::get_processor_features() {
(supports_popcnt() ? ", popcnt" : ""),
(supports_avx() ? ", avx" : ""),
(supports_avx2() ? ", avx2" : ""),
+ (supports_aes() ? ", aes" : ""),
(supports_mmx_ext() ? ", mmxext" : ""),
(supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
(supports_lzcnt() ? ", lzcnt": ""),
@@ -472,6 +476,29 @@ void VM_Version::get_processor_features() {
if (!supports_avx ()) // Drop to 0 if no AVX support
UseAVX = 0;
+ // Use AES instructions if available.
+ if (supports_aes()) {
+ if (FLAG_IS_DEFAULT(UseAES)) {
+ UseAES = true;
+ }
+ } else if (UseAES) {
+ if (!FLAG_IS_DEFAULT(UseAES))
+ warning("AES instructions not available on this CPU");
+ FLAG_SET_DEFAULT(UseAES, false);
+ }
+
+ // The AES intrinsic stubs require AES instruction support (of course)
+ // but also require AVX mode for misaligned SSE access
+ if (UseAES && (UseAVX > 0)) {
+ if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+ UseAESIntrinsics = true;
+ }
+ } else if (UseAESIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
+ warning("AES intrinsics not available on this CPU");
+ FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+ }
+
#ifdef COMPILER2
if (UseFPUForSpilling) {
if (UseSSE < 2) {
@@ -714,6 +741,9 @@ void VM_Version::get_processor_features() {
if (UseAVX > 0) {
tty->print(" UseAVX=%d",UseAVX);
}
+ if (UseAES) {
+ tty->print(" UseAES=1");
+ }
tty->cr();
tty->print("Allocation");
if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
diff --git a/src/cpu/x86/vm/vm_version_x86.hpp b/src/cpu/x86/vm/vm_version_x86.hpp
index 92cdbd3fd..12bd3b770 100644
--- a/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/src/cpu/x86/vm/vm_version_x86.hpp
@@ -78,7 +78,9 @@ public:
sse4_2 : 1,
: 2,
popcnt : 1,
- : 3,
+ : 1,
+ aes : 1,
+ : 1,
osxsave : 1,
avx : 1,
: 3;
@@ -244,7 +246,8 @@ protected:
CPU_TSC = (1 << 15),
CPU_TSCINV = (1 << 16),
CPU_AVX = (1 << 17),
- CPU_AVX2 = (1 << 18)
+ CPU_AVX2 = (1 << 18),
+ CPU_AES = (1 << 19)
} cpuFeatureFlags;
enum {
@@ -420,6 +423,8 @@ protected:
result |= CPU_TSC;
if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0)
result |= CPU_TSCINV;
+ if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0)
+ result |= CPU_AES;
// AMD features.
if (is_amd()) {
@@ -544,6 +549,7 @@ public:
static bool supports_avx() { return (_cpuFeatures & CPU_AVX) != 0; }
static bool supports_avx2() { return (_cpuFeatures & CPU_AVX2) != 0; }
static bool supports_tsc() { return (_cpuFeatures & CPU_TSC) != 0; }
+ static bool supports_aes() { return (_cpuFeatures & CPU_AES) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&