From ed9b909d1d11f7848769067039d646b4693c3d78 Mon Sep 17 00:00:00 2001 From: kvn Date: Wed, 24 Oct 2012 14:33:22 -0700 Subject: [PATCH] 7184394: add intrinsics to use AES instructions Summary: Use new x86 AES instructions for AESCrypt. Reviewed-by: twisti, kvn, roland Contributed-by: tom.deneau@amd.com --- src/cpu/x86/vm/assembler_x86.cpp | 97 ++++ src/cpu/x86/vm/assembler_x86.hpp | 25 + src/cpu/x86/vm/stubGenerator_x86_32.cpp | 533 ++++++++++++++++++++++ src/cpu/x86/vm/stubGenerator_x86_64.cpp | 552 +++++++++++++++++++++++ src/cpu/x86/vm/stubRoutines_x86_32.cpp | 1 + src/cpu/x86/vm/stubRoutines_x86_32.hpp | 4 + src/cpu/x86/vm/stubRoutines_x86_64.cpp | 1 + src/cpu/x86/vm/stubRoutines_x86_64.hpp | 5 + src/cpu/x86/vm/vm_version_x86.cpp | 32 +- src/cpu/x86/vm/vm_version_x86.hpp | 10 +- src/share/vm/classfile/vmSymbols.hpp | 16 + src/share/vm/oops/method.cpp | 6 +- src/share/vm/opto/callGenerator.cpp | 123 +++++ src/share/vm/opto/callGenerator.hpp | 3 + src/share/vm/opto/doCall.cpp | 12 +- src/share/vm/opto/escape.cpp | 14 +- src/share/vm/opto/library_call.cpp | 369 ++++++++++++++- src/share/vm/opto/runtime.cpp | 42 ++ src/share/vm/opto/runtime.hpp | 3 + src/share/vm/runtime/globals.hpp | 6 + src/share/vm/runtime/stubRoutines.cpp | 4 + src/share/vm/runtime/stubRoutines.hpp | 10 + test/compiler/7184394/TestAESBase.java | 154 +++++++ test/compiler/7184394/TestAESDecode.java | 57 +++ test/compiler/7184394/TestAESEncode.java | 57 +++ test/compiler/7184394/TestAESMain.java | 57 +++ 26 files changed, 2181 insertions(+), 12 deletions(-) create mode 100644 test/compiler/7184394/TestAESBase.java create mode 100644 test/compiler/7184394/TestAESDecode.java create mode 100644 test/compiler/7184394/TestAESEncode.java create mode 100644 test/compiler/7184394/TestAESMain.java diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp index 378c1f8c6..6b9677d30 100644 --- a/src/cpu/x86/vm/assembler_x86.cpp +++ b/src/cpu/x86/vm/assembler_x86.cpp @@ -1007,6 
+1007,67 @@ void Assembler::addss(XMMRegister dst, Address src) { emit_simd_arith(0x58, dst, src, VEX_SIMD_F3); } +void Assembler::aesdec(XMMRegister dst, Address src) { + assert(VM_Version::supports_aes(), ""); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xde); + emit_operand(dst, src); +} + +void Assembler::aesdec(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_aes(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xde); + emit_byte(0xC0 | encode); +} + +void Assembler::aesdeclast(XMMRegister dst, Address src) { + assert(VM_Version::supports_aes(), ""); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdf); + emit_operand(dst, src); +} + +void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_aes(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdf); + emit_byte(0xC0 | encode); +} + +void Assembler::aesenc(XMMRegister dst, Address src) { + assert(VM_Version::supports_aes(), ""); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdc); + emit_operand(dst, src); +} + +void Assembler::aesenc(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_aes(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdc); + emit_byte(0xC0 | encode); +} + +void Assembler::aesenclast(XMMRegister dst, Address src) { + assert(VM_Version::supports_aes(), ""); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0xdd); + emit_operand(dst, src); +} + +void Assembler::aesenclast(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_aes(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + 
emit_byte(0xdd); + emit_byte(0xC0 | encode); +} + + void Assembler::andl(Address dst, int32_t imm32) { InstructionMark im(this); prefix(dst); @@ -2307,6 +2368,22 @@ void Assembler::prefix(Prefix p) { a_byte(p); } +void Assembler::pshufb(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0x00); + emit_byte(0xC0 | encode); +} + +void Assembler::pshufb(XMMRegister dst, Address src) { + assert(VM_Version::supports_ssse3(), ""); + assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); + InstructionMark im(this); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_byte(0x00); + emit_operand(dst, src); +} + void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) { assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); @@ -8067,6 +8144,15 @@ void MacroAssembler::movptr(Address dst, Register src) { LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); } +void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) { + if (reachable(src)) { + Assembler::movdqu(dst, as_Address(src)); + } else { + lea(rscratch1, src); + Assembler::movdqu(dst, Address(rscratch1, 0)); + } +} + void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) { if (reachable(src)) { Assembler::movsd(dst, as_Address(src)); @@ -8357,6 +8443,17 @@ void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) { } } +void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) { + // Used in sign-bit flipping with aligned address. 
+ assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); + if (reachable(src)) { + Assembler::pshufb(dst, as_Address(src)); + } else { + lea(rscratch1, src); + Assembler::pshufb(dst, Address(rscratch1, 0)); + } +} + // AVX 3-operands instructions void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp index c936e13f5..8a9bbaf42 100644 --- a/src/cpu/x86/vm/assembler_x86.hpp +++ b/src/cpu/x86/vm/assembler_x86.hpp @@ -875,6 +875,17 @@ private: void addss(XMMRegister dst, Address src); void addss(XMMRegister dst, XMMRegister src); + // AES instructions + void aesdec(XMMRegister dst, Address src); + void aesdec(XMMRegister dst, XMMRegister src); + void aesdeclast(XMMRegister dst, Address src); + void aesdeclast(XMMRegister dst, XMMRegister src); + void aesenc(XMMRegister dst, Address src); + void aesenc(XMMRegister dst, XMMRegister src); + void aesenclast(XMMRegister dst, Address src); + void aesenclast(XMMRegister dst, XMMRegister src); + + void andl(Address dst, int32_t imm32); void andl(Register dst, int32_t imm32); void andl(Register dst, Address src); @@ -1424,6 +1435,10 @@ private: void prefetcht2(Address src); void prefetchw(Address src); + // Shuffle Bytes + void pshufb(XMMRegister dst, XMMRegister src); + void pshufb(XMMRegister dst, Address src); + // Shuffle Packed Doublewords void pshufd(XMMRegister dst, XMMRegister src, int mode); void pshufd(XMMRegister dst, Address src, int mode); @@ -2611,6 +2626,12 @@ public: void divss(XMMRegister dst, Address src) { Assembler::divss(dst, src); } void divss(XMMRegister dst, AddressLiteral src); + // Move Unaligned Double Quadword + void movdqu(Address dst, XMMRegister src) { Assembler::movdqu(dst, src); } + void movdqu(XMMRegister dst, Address src) { Assembler::movdqu(dst, src); } + void movdqu(XMMRegister dst, XMMRegister src) { Assembler::movdqu(dst, src); 
} + void movdqu(XMMRegister dst, AddressLiteral src); + void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); } void movsd(Address dst, XMMRegister src) { Assembler::movsd(dst, src); } void movsd(XMMRegister dst, Address src) { Assembler::movsd(dst, src); } @@ -2658,6 +2679,10 @@ public: void xorps(XMMRegister dst, Address src) { Assembler::xorps(dst, src); } void xorps(XMMRegister dst, AddressLiteral src); + // Shuffle Bytes + void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); } + void pshufb(XMMRegister dst, Address src) { Assembler::pshufb(dst, src); } + void pshufb(XMMRegister dst, AddressLiteral src); // AVX 3-operands instructions void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); } diff --git a/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/src/cpu/x86/vm/stubGenerator_x86_32.cpp index f149fde83..d8b61e0b2 100644 --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp @@ -2137,6 +2137,529 @@ class StubGenerator: public StubCodeGenerator { } } + // AES intrinsic stubs + enum {AESBlockSize = 16}; + + address generate_key_shuffle_mask() { + __ align(16); + StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask"); + address start = __ pc(); + __ emit_data(0x00010203, relocInfo::none, 0 ); + __ emit_data(0x04050607, relocInfo::none, 0 ); + __ emit_data(0x08090a0b, relocInfo::none, 0 ); + __ emit_data(0x0c0d0e0f, relocInfo::none, 0 ); + return start; + } + + // Utility routine for loading a 128-bit key word in little endian format + // can optionally specify that the shuffle mask is already in an xmmregister + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { + __ movdqu(xmmdst, Address(key, offset)); + if (xmm_shuf_mask != NULL) { + __ pshufb(xmmdst, xmm_shuf_mask); + } else { + __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + } + } + + // aesenc using 
specified key+offset + // can optionally specify that the shuffle mask is already in an xmmregister + void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { + load_key(xmmtmp, key, offset, xmm_shuf_mask); + __ aesenc(xmmdst, xmmtmp); + } + + // aesdec using specified key+offset + // can optionally specify that the shuffle mask is already in an xmmregister + void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { + load_key(xmmtmp, key, offset, xmm_shuf_mask); + __ aesdec(xmmdst, xmmtmp); + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_encryptBlock() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); + Label L_doLast; + address start = __ pc(); + + const Register from = rsi; // source array address + const Register to = rdx; // destination array address + const Register key = rcx; // key array address + const Register keylen = rax; + const Address from_param(rbp, 8+0); + const Address to_param (rbp, 8+4); + const Address key_param (rbp, 8+8); + + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + const XMMRegister xmm_key_shuf_mask = xmm2; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + __ push(rsi); + __ movptr(from , from_param); + __ movptr(to , to_param); + __ movptr(key , key_param); + + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + // keylen = # of 32-bit words, convert to 128-bit words + __ shrl(keylen, 2); + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more + + __ movdqu(xmm_key_shuf_mask, 
ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input + + // For encryption, the java expanded key ordering is just what we need + + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); + __ pxor(xmm_result, xmm_temp); + for (int offset = 0x10; offset <= 0x90; offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); + } + load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); + __ cmpl(keylen, 0); + __ jcc(Assembler::equal, L_doLast); + __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys + aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); + load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); + __ subl(keylen, 2); + __ jcc(Assembler::equal, L_doLast); + __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys + aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); + load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); + + __ BIND(L_doLast); + __ aesenclast(xmm_result, xmm_temp); + __ movdqu(Address(to, 0), xmm_result); // store the result + __ xorptr(rax, rax); // return 0 + __ pop(rsi); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_decryptBlock() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); + Label L_doLast; + address start = __ pc(); + + const Register from = rsi; // source array address + const Register to = rdx; // destination array address + const Register key = rcx; // key array address + const Register keylen = rax; + const Address from_param(rbp, 8+0); + const Address to_param (rbp, 8+4); + const Address key_param (rbp, 
8+8); + + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + const XMMRegister xmm_key_shuf_mask = xmm2; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + __ push(rsi); + __ movptr(from , from_param); + __ movptr(to , to_param); + __ movptr(key , key_param); + + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + // keylen = # of 32-bit words, convert to 128-bit words + __ shrl(keylen, 2); + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more + + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_result, Address(from, 0)); + + // for decryption java expanded key ordering is rotated one position from what we want + // so we start from 0x10 here and hit 0x00 last + // we don't know if the key is aligned, hence not using load-execute form + load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); + __ pxor (xmm_result, xmm_temp); + for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { + aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); + } + __ cmpl(keylen, 0); + __ jcc(Assembler::equal, L_doLast); + // only in 192 and 256 bit keys + aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); + aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); + __ subl(keylen, 2); + __ jcc(Assembler::equal, L_doLast); + // only in 256 bit keys + aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); + aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); + + __ BIND(L_doLast); + // for decryption the aesdeclast operation is always on key+0x00 + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); + __ aesdeclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, 0), xmm_result); // store the result + + __ xorptr(rax, rax); // return 0 + __ pop(rsi); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + 
return start; + } + + void handleSOERegisters(bool saving) { + const int saveFrameSizeInBytes = 4 * wordSize; + const Address saved_rbx (rbp, -3 * wordSize); + const Address saved_rsi (rbp, -2 * wordSize); + const Address saved_rdi (rbp, -1 * wordSize); + + if (saving) { + __ subptr(rsp, saveFrameSizeInBytes); + __ movptr(saved_rsi, rsi); + __ movptr(saved_rdi, rdi); + __ movptr(saved_rbx, rbx); + } else { + // restoring + __ movptr(rsi, saved_rsi); + __ movptr(rdi, saved_rdi); + __ movptr(rbx, saved_rbx); + } + } + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + address generate_cipherBlockChaining_encryptAESCrypt() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); + address start = __ pc(); + + Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; + const Register from = rsi; // source array address + const Register to = rdx; // destination array address + const Register key = rcx; // key array address + const Register rvec = rdi; // r byte array initialized from initvector array address + // and left with the results of the last encryption block + const Register len_reg = rbx; // src len (must be multiple of blocksize 16) + const Register pos = rax; + + // xmm register assignments for the loops below + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + // first 6 keys preloaded into xmm2-xmm7 + const int XMM_REG_NUM_KEY_FIRST = 2; + const int XMM_REG_NUM_KEY_LAST = 7; + const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + + __ enter(); // required for proper stackwalking of RuntimeStub frame + handleSOERegisters(true /*saving*/); + + // load 
registers from incoming parameters + const Address from_param(rbp, 8+0); + const Address to_param (rbp, 8+4); + const Address key_param (rbp, 8+8); + const Address rvec_param (rbp, 8+12); + const Address len_param (rbp, 8+16); + __ movptr(from , from_param); + __ movptr(to , to_param); + __ movptr(key , key_param); + __ movptr(rvec , rvec_param); + __ movptr(len_reg , len_param); + + const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + // load up xmm regs 2 thru 7 with keys 0-5 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); + offset += 0x10; + } + + __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec + + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rax, 44); + __ jcc(Assembler::notEqual, L_key_192_256); + + // 128 bit code follows here + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_128); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0xa0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until 
we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_128); + + __ BIND(L_exit); + __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object + + handleSOERegisters(false /*restoring*/); + __ movl(rax, 0); // return 0 (why?) + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + __ BIND(L_key_192_256); + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + __ cmpl(rax, 52); + __ jcc(Assembler::notEqual, L_key_256); + + // 192-bit code follows here (could be changed to use more xmm registers) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_192); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0xc0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_192); + __ jmp(L_exit); + + __ BIND(L_key_256); + // 256-bit code follows here (could be changed to use more xmm registers) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_256); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = 
XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0xe0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_256); + __ jmp(L_exit); + + return start; + } + + + // CBC AES Decryption. + // In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time. + // + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + + address generate_cipherBlockChaining_decryptAESCrypt() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); + address start = __ pc(); + + Label L_exit, L_key_192_256, L_key_256; + Label L_singleBlock_loopTop_128; + Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; + const Register from = rsi; // source array address + const Register to = rdx; // destination array address + const Register key = rcx; // key array address + const Register rvec = rdi; // r byte array initialized from initvector array address + // and left with the results of the last encryption block + const Register len_reg = rbx; // src len (must be multiple of blocksize 16) + const Register pos = rax; + + // xmm register assignments for the loops below + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + // first 6 keys 
preloaded into xmm2-xmm7 + const int XMM_REG_NUM_KEY_FIRST = 2; + const int XMM_REG_NUM_KEY_LAST = 7; + const int FIRST_NON_REG_KEY_offset = 0x70; + const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + + __ enter(); // required for proper stackwalking of RuntimeStub frame + handleSOERegisters(true /*saving*/); + + // load registers from incoming parameters + const Address from_param(rbp, 8+0); + const Address to_param (rbp, 8+4); + const Address key_param (rbp, 8+8); + const Address rvec_param (rbp, 8+12); + const Address len_param (rbp, 8+16); + __ movptr(from , from_param); + __ movptr(to , to_param); + __ movptr(key , key_param); + __ movptr(rvec , rvec_param); + __ movptr(len_reg , len_param); + + // the java expanded key ordering is rotated one position from what we want + // so we start from 0x10 here and hit 0x00 last + const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + // load up xmm regs 2 thru 6 with first 5 keys + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); + offset += 0x10; + } + + // inside here, use the rvec register to point to previous block cipher + // with which we xor at the end of each newly decrypted block + const Register prev_block_cipher_ptr = rvec; + + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rax, 44); + __ jcc(Assembler::notEqual, L_key_192_256); + + + // 128-bit code follows here, parallelized + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_128); + __ cmpptr(len_reg, 0); // any blocks left?? 
+ __ jcc(Assembler::equal, L_exit); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) { // 128-bit runs up to key offset a0 + aes_dec_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 + __ aesdeclast(xmm_result, xmm_temp); + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jmp(L_singleBlock_loopTop_128); + + + __ BIND(L_exit); + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); + __ movptr(rvec , rvec_param); // restore this since used in loop + __ movdqu(Address(rvec, 0), xmm_temp); // final value of r stored in rvec of CipherBlockChaining object + handleSOERegisters(false /*restoring*/); + __ movl(rax, 0); // return 0 (why?) 
+ __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + + __ BIND(L_key_192_256); + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + __ cmpl(rax, 52); + __ jcc(Assembler::notEqual, L_key_256); + + // 192-bit code follows here (could be optimized to use parallelism) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_192); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) { // 192-bit runs up to key offset c0 + aes_dec_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 + __ aesdeclast(xmm_result, xmm_temp); + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); + __ jmp(L_exit); + + __ BIND(L_key_256); + // 256-bit code follows here (could be optimized to use parallelism) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_256); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ 
aesdec(xmm_result, as_XMMRegister(rnum)); + } + for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) { // 256-bit runs up to key offset e0 + aes_dec_key(xmm_result, xmm_temp, key, key_offset); + } + load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 + __ aesdeclast(xmm_result, xmm_temp); + __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); + __ jmp(L_exit); + + return start; + } + + public: // Information about frame layout at time of blocking runtime call. // Note that we only have to preserve callee-saved registers since @@ -2332,6 +2855,16 @@ class StubGenerator: public StubCodeGenerator { generate_arraycopy_stubs(); generate_math_stubs(); + + // don't bother generating these AES intrinsic stubs unless global flag is set + if (UseAESIntrinsics) { + StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // might be needed by the others + + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); + } } diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp index 8ae595a56..3e223387c 100644 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -2941,6 +2941,548 @@ 
class StubGenerator: public StubCodeGenerator { } } + // AES intrinsic stubs + enum {AESBlockSize = 16}; + + address generate_key_shuffle_mask() { + __ align(16); + StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask"); + address start = __ pc(); + __ emit_data64( 0x0405060700010203, relocInfo::none ); + __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none ); + return start; + } + + // Utility routine for loading a 128-bit key word in little endian format + // can optionally specify that the shuffle mask is already in an xmmregister + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { + __ movdqu(xmmdst, Address(key, offset)); + if (xmm_shuf_mask != NULL) { + __ pshufb(xmmdst, xmm_shuf_mask); + } else { + __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + } + } + + // aesenc using specified key+offset + // can optionally specify that the shuffle mask is already in an xmmregister + void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { + load_key(xmmtmp, key, offset, xmm_shuf_mask); + __ aesenc(xmmdst, xmmtmp); + } + + // aesdec using specified key+offset + // can optionally specify that the shuffle mask is already in an xmmregister + void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { + load_key(xmmtmp, key, offset, xmm_shuf_mask); + __ aesdec(xmmdst, xmmtmp); + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_encryptBlock() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); + Label L_doLast; + address start = __ pc(); + + const Register from = c_rarg0; // source array 
address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register keylen = rax; + + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + const XMMRegister xmm_key_shuf_mask = xmm2; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + // keylen = # of 32-bit words, convert to 128-bit words + __ shrl(keylen, 2); + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more + + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input + + // For encryption, the java expanded key ordering is just what we need + // we don't know if the key is aligned, hence not using load-execute form + + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); + __ pxor(xmm_result, xmm_temp); + for (int offset = 0x10; offset <= 0x90; offset += 0x10) { + aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); + } + load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); + __ cmpl(keylen, 0); + __ jcc(Assembler::equal, L_doLast); + __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys + aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); + load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); + __ subl(keylen, 2); + __ jcc(Assembler::equal, L_doLast); + __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys + aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); + load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); + + __ BIND(L_doLast); + __ aesenclast(xmm_result, xmm_temp); + __ movdqu(Address(to, 0), xmm_result); // store the result + __ xorptr(rax, rax); // return 0 + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + + // Arguments: + // + // 
Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_decryptBlock() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); + Label L_doLast; + address start = __ pc(); + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register keylen = rax; + + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + const XMMRegister xmm_key_shuf_mask = xmm2; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + // keylen = # of 32-bit words, convert to 128-bit words + __ shrl(keylen, 2); + __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more + + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_result, Address(from, 0)); + + // for decryption java expanded key ordering is rotated one position from what we want + // so we start from 0x10 here and hit 0x00 last + // we don't know if the key is aligned, hence not using load-execute form + load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); + __ pxor (xmm_result, xmm_temp); + for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { + aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); + } + __ cmpl(keylen, 0); + __ jcc(Assembler::equal, L_doLast); + // only in 192 and 256 bit keys + aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); + aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); + __ subl(keylen, 2); + __ jcc(Assembler::equal, L_doLast); + // only in 256 bit keys + 
aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); + aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); + + __ BIND(L_doLast); + // for decryption the aesdeclast operation is always on key+0x00 + load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); + __ aesdeclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, 0), xmm_result); // store the result + + __ xorptr(rax, rax); // return 0 + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + address generate_cipherBlockChaining_encryptAESCrypt() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); + address start = __ pc(); + + Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register rvec = c_rarg3; // r byte array initialized from initvector array address + // and left with the results of the last encryption block +#ifndef _WIN64 + const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) +#else + const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 + const Register len_reg = r10; // pick the first volatile windows register +#endif + const Register pos = rax; + + // xmm register assignments for the loops below + const XMMRegister xmm_result = xmm0; + const XMMRegister xmm_temp = xmm1; + // keys 0-10 preloaded into xmm2-xmm12 + const int XMM_REG_NUM_KEY_FIRST = 2; + const int XMM_REG_NUM_KEY_LAST 
= 12; + const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST); + + __ enter(); // required for proper stackwalking of RuntimeStub frame + +#ifdef _WIN64 + // on win64, fill len_reg from stack position + __ movl(len_reg, len_mem); + // save the xmm registers which must be preserved 6-12 + __ subptr(rsp, -rsp_after_call_off * wordSize); + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(xmm_save(i), as_XMMRegister(i)); + } +#endif + + const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + // load up xmm regs 2 thru 12 with key 0x00 - 0xa0 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); + offset += 0x10; + } + + __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec + + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rax, 44); + __ jcc(Assembler::notEqual, L_key_192_256); + + // 128 bit code follows here + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_128); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + __ aesenclast(xmm_result, xmm_key10); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we 
exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_128); + + __ BIND(L_exit); + __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object + +#ifdef _WIN64 + // restore xmm regs belonging to calling function + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(as_XMMRegister(i), xmm_save(i)); + } +#endif + __ movl(rax, 0); // return 0 (why?) + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + __ BIND(L_key_192_256); + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + __ cmpl(rax, 52); + __ jcc(Assembler::notEqual, L_key_256); + + // 192-bit code follows here (could be changed to use more xmm registers) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_192); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + aes_enc_key(xmm_result, xmm_temp, key, 0xb0); + load_key(xmm_temp, key, 0xc0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_192); + __ jmp(L_exit); + + __ BIND(L_key_256); + // 256-bit code follows here (could be changed to use more xmm registers) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_loopTop_256); + __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input + __ pxor (xmm_result, xmm_temp); // xor with the current r vector + + __ pxor (xmm_result, 
xmm_key0); // do the aes rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ aesenc(xmm_result, as_XMMRegister(rnum)); + } + aes_enc_key(xmm_result, xmm_temp, key, 0xb0); + aes_enc_key(xmm_result, xmm_temp, key, 0xc0); + aes_enc_key(xmm_result, xmm_temp, key, 0xd0); + load_key(xmm_temp, key, 0xe0); + __ aesenclast(xmm_result, xmm_temp); + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual, L_loopTop_256); + __ jmp(L_exit); + + return start; + } + + + + // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time + // to hide instruction latency + // + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + + address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { + assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); + address start = __ pc(); + + Label L_exit, L_key_192_256, L_key_256; + Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128; + Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register rvec = c_rarg3; // r byte array initialized from initvector array address + // and left with the results of the last encryption block +#ifndef _WIN64 + const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) +#else + const Address len_mem(rsp, 6 * 
wordSize); // length is on stack on Win64 + const Register len_reg = r10; // pick the first volatile windows register +#endif + const Register pos = rax; + + // xmm register assignments for the loops below + const XMMRegister xmm_result = xmm0; + // round keys preloaded into xmm5-xmm15 (key+0x10..0xa0 in xmm5-xmm14, key+0x00 in xmm15) + const int XMM_REG_NUM_KEY_FIRST = 5; + const int XMM_REG_NUM_KEY_LAST = 15; + const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); + + __ enter(); // required for proper stackwalking of RuntimeStub frame + +#ifdef _WIN64 + // on win64, fill len_reg from stack position + __ movl(len_reg, len_mem); + // save the xmm registers which must be preserved 6-15 + __ subptr(rsp, -rsp_after_call_off * wordSize); + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(xmm_save(i), as_XMMRegister(i)); + } +#endif + // the java expanded key ordering is rotated one position from what we want + // so we start from 0x10 here and hit 0x00 last + const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00; + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); + offset += 0x10; + } + + const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block + // registers holding the four results in the parallelized loop + const XMMRegister xmm_result0 = xmm0; + const XMMRegister xmm_result1 = xmm2; + const XMMRegister xmm_result2 = xmm3; + const XMMRegister xmm_result3 = xmm4; + + __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec + + // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE
array (52=192, or 60=256)) + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rax, 44); + __ jcc(Assembler::notEqual, L_key_192_256); + + + // 128-bit code follows here, parallelized + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_multiBlock_loopTop_128); + __ cmpptr(len_reg, 4*AESBlockSize); // see if at least 4 blocks left + __ jcc(Assembler::less, L_singleBlock_loopTop_128); + + __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize)); // get next 4 blocks into xmmresult registers + __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize)); + __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize)); + __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize)); + +#define DoFour(opc, src_reg) \ + __ opc(xmm_result0, src_reg); \ + __ opc(xmm_result1, src_reg); \ + __ opc(xmm_result2, src_reg); \ + __ opc(xmm_result3, src_reg); + + DoFour(pxor, xmm_key_first); + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { + DoFour(aesdec, as_XMMRegister(rnum)); + } + DoFour(aesdeclast, xmm_key_last); + // for each result, xor with the r vector of previous cipher block + __ pxor(xmm_result0, xmm_prev_block_cipher); + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize)); + __ pxor(xmm_result1, xmm_prev_block_cipher); + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize)); + __ pxor(xmm_result2, xmm_prev_block_cipher); + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize)); + __ pxor(xmm_result3, xmm_prev_block_cipher); + __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize)); // this will carry over to next set of blocks + + __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of 
output + __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1); + __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2); + __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3); + + __ addptr(pos, 4*AESBlockSize); + __ subptr(len_reg, 4*AESBlockSize); + __ jmp(L_multiBlock_loopTop_128); + + // registers used in the non-parallelized loops + const XMMRegister xmm_prev_block_cipher_save = xmm2; + const XMMRegister xmm_temp = xmm3; + + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_128); + __ cmpptr(len_reg, 0); // any blocks left?? + __ jcc(Assembler::equal, L_exit); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + __ aesdeclast(xmm_result, xmm_key_last); + __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block + + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jmp(L_singleBlock_loopTop_128); + + + __ BIND(L_exit); + __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object +#ifdef _WIN64 + // restore regs belonging to calling function + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(as_XMMRegister(i), xmm_save(i)); + } +#endif + __ movl(rax, 0); // return 0 (why?) 
+ __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + + __ BIND(L_key_192_256); + // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + __ cmpl(rax, 52); + __ jcc(Assembler::notEqual, L_key_256); + + // 192-bit code follows here (could be optimized to use parallelism) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_192); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 192-bit key goes up to c0 + aes_dec_key(xmm_result, xmm_temp, key, 0xc0); + __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 + __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block + + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); + __ jmp(L_exit); + + __ BIND(L_key_256); + // 256-bit code follows here (could be optimized to use parallelism) + __ movptr(pos, 0); + __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_256); + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector + __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= 
XMM_REG_NUM_KEY_LAST - 1; rnum++) { + __ aesdec(xmm_result, as_XMMRegister(rnum)); + } + aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 256-bit key goes up to e0 + aes_dec_key(xmm_result, xmm_temp, key, 0xc0); + aes_dec_key(xmm_result, xmm_temp, key, 0xd0); + aes_dec_key(xmm_result, xmm_temp, key, 0xe0); + __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 + __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + // no need to store r to memory until we exit + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block + + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); + __ jmp(L_exit); + + return start; + } + + + #undef __ #define __ masm-> @@ -3135,6 +3677,16 @@ class StubGenerator: public StubCodeGenerator { generate_arraycopy_stubs(); generate_math_stubs(); + + // don't bother generating these AES intrinsic stubs unless global flag is set + if (UseAESIntrinsics) { + StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others + + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); + } } public: diff --git a/src/cpu/x86/vm/stubRoutines_x86_32.cpp b/src/cpu/x86/vm/stubRoutines_x86_32.cpp index 6ec4121b9..cfd4f33a6 100644 --- a/src/cpu/x86/vm/stubRoutines_x86_32.cpp +++ b/src/cpu/x86/vm/stubRoutines_x86_32.cpp @@ -44,3 +44,4 @@ address StubRoutines::x86::_verify_mxcsr_entry = NULL; address StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = NULL; 
+address StubRoutines::x86::_key_shuffle_mask_addr = NULL; diff --git a/src/cpu/x86/vm/stubRoutines_x86_32.hpp b/src/cpu/x86/vm/stubRoutines_x86_32.hpp index 64767c8ad..d53124fc6 100644 --- a/src/cpu/x86/vm/stubRoutines_x86_32.hpp +++ b/src/cpu/x86/vm/stubRoutines_x86_32.hpp @@ -41,10 +41,14 @@ class x86 { private: static address _verify_mxcsr_entry; static address _verify_fpu_cntrl_wrd_entry; + // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers + static address _key_shuffle_mask_addr; public: static address verify_mxcsr_entry() { return _verify_mxcsr_entry; } static address verify_fpu_cntrl_wrd_entry() { return _verify_fpu_cntrl_wrd_entry; } + static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } + }; static bool returns_to_call_stub(address return_pc) { return return_pc == _call_stub_return_address; } diff --git a/src/cpu/x86/vm/stubRoutines_x86_64.cpp b/src/cpu/x86/vm/stubRoutines_x86_64.cpp index 084bbf8fb..cf8ec5d7b 100644 --- a/src/cpu/x86/vm/stubRoutines_x86_64.cpp +++ b/src/cpu/x86/vm/stubRoutines_x86_64.cpp @@ -56,3 +56,4 @@ address StubRoutines::x86::_float_sign_flip = NULL; address StubRoutines::x86::_double_sign_mask = NULL; address StubRoutines::x86::_double_sign_flip = NULL; address StubRoutines::x86::_mxcsr_std = NULL; +address StubRoutines::x86::_key_shuffle_mask_addr = NULL; diff --git a/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/src/cpu/x86/vm/stubRoutines_x86_64.hpp index 9b9cede4f..c3efeecb7 100644 --- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp +++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp @@ -54,6 +54,8 @@ class x86 { static address _double_sign_mask; static address _double_sign_flip; static address _mxcsr_std; + // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers + static address _key_shuffle_mask_addr; public: @@ -116,6 +118,9 @@ class x86 { { return _mxcsr_std; } + + static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } + }; #endif // 
CPU_X86_VM_STUBROUTINES_X86_64_HPP diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp index bf7b3c213..182b0ab1a 100644 --- a/src/cpu/x86/vm/vm_version_x86.cpp +++ b/src/cpu/x86/vm/vm_version_x86.cpp @@ -419,13 +419,16 @@ void VM_Version::get_processor_features() { if (UseAVX < 1) _cpuFeatures &= ~CPU_AVX; + if (!UseAES && !FLAG_IS_DEFAULT(UseAES)) + _cpuFeatures &= ~CPU_AES; + if (logical_processors_per_package() == 1) { // HT processor could be installed on a system which doesn't support HT. _cpuFeatures &= ~CPU_HT; } char buf[256]; - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), cpu_family(), _model, _stepping, (supports_cmov() ? ", cmov" : ""), @@ -441,6 +444,7 @@ void VM_Version::get_processor_features() { (supports_popcnt() ? ", popcnt" : ""), (supports_avx() ? ", avx" : ""), (supports_avx2() ? ", avx2" : ""), + (supports_aes() ? ", aes" : ""), (supports_mmx_ext() ? ", mmxext" : ""), (supports_3dnow_prefetch() ? ", 3dnowpref" : ""), (supports_lzcnt() ? ", lzcnt": ""), @@ -472,6 +476,29 @@ void VM_Version::get_processor_features() { if (!supports_avx ()) // Drop to 0 if no AVX support UseAVX = 0; + // Use AES instructions if available. 
+ if (supports_aes()) { + if (FLAG_IS_DEFAULT(UseAES)) { + UseAES = true; + } + } else if (UseAES) { + if (!FLAG_IS_DEFAULT(UseAES)) + warning("AES instructions not available on this CPU"); + FLAG_SET_DEFAULT(UseAES, false); + } + + // The AES intrinsic stubs require AES instruction support (of course) + // but also require AVX mode for misaligned SSE access + if (UseAES && (UseAVX > 0)) { + if (FLAG_IS_DEFAULT(UseAESIntrinsics)) { + UseAESIntrinsics = true; + } + } else if (UseAESIntrinsics) { + if (!FLAG_IS_DEFAULT(UseAESIntrinsics)) + warning("AES intrinsics not available on this CPU"); + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + #ifdef COMPILER2 if (UseFPUForSpilling) { if (UseSSE < 2) { @@ -714,6 +741,9 @@ void VM_Version::get_processor_features() { if (UseAVX > 0) { tty->print(" UseAVX=%d",UseAVX); } + if (UseAES) { + tty->print(" UseAES=1"); + } tty->cr(); tty->print("Allocation"); if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) { diff --git a/src/cpu/x86/vm/vm_version_x86.hpp b/src/cpu/x86/vm/vm_version_x86.hpp index 92cdbd3fd..12bd3b770 100644 --- a/src/cpu/x86/vm/vm_version_x86.hpp +++ b/src/cpu/x86/vm/vm_version_x86.hpp @@ -78,7 +78,9 @@ public: sse4_2 : 1, : 2, popcnt : 1, - : 3, + : 1, + aes : 1, + : 1, osxsave : 1, avx : 1, : 3; @@ -244,7 +246,8 @@ protected: CPU_TSC = (1 << 15), CPU_TSCINV = (1 << 16), CPU_AVX = (1 << 17), - CPU_AVX2 = (1 << 18) + CPU_AVX2 = (1 << 18), + CPU_AES = (1 << 19) } cpuFeatureFlags; enum { @@ -420,6 +423,8 @@ protected: result |= CPU_TSC; if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0) result |= CPU_TSCINV; + if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0) + result |= CPU_AES; // AMD features. 
if (is_amd()) { @@ -544,6 +549,7 @@ public: static bool supports_avx() { return (_cpuFeatures & CPU_AVX) != 0; } static bool supports_avx2() { return (_cpuFeatures & CPU_AVX2) != 0; } static bool supports_tsc() { return (_cpuFeatures & CPU_TSC) != 0; } + static bool supports_aes() { return (_cpuFeatures & CPU_AES) != 0; } // Intel features static bool is_intel_family_core() { return is_intel() && diff --git a/src/share/vm/classfile/vmSymbols.hpp b/src/share/vm/classfile/vmSymbols.hpp index 06fdb35be..2febc7b56 100644 --- a/src/share/vm/classfile/vmSymbols.hpp +++ b/src/share/vm/classfile/vmSymbols.hpp @@ -110,6 +110,7 @@ template(sun_jkernel_DownloadManager, "sun/jkernel/DownloadManager") \ template(getBootClassPathEntryForClass_name, "getBootClassPathEntryForClass") \ template(sun_misc_PostVMInitHook, "sun/misc/PostVMInitHook") \ + template(sun_misc_Launcher_ExtClassLoader, "sun/misc/Launcher$ExtClassLoader") \ \ /* Java runtime version access */ \ template(sun_misc_Version, "sun/misc/Version") \ @@ -723,6 +724,21 @@ /* java/lang/ref/Reference */ \ do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ \ + /* support for com.sun.crypto.provider.AESCrypt and some of its callers */ \ + do_class(com_sun_crypto_provider_aescrypt, "com/sun/crypto/provider/AESCrypt") \ + do_intrinsic(_aescrypt_encryptBlock, com_sun_crypto_provider_aescrypt, encryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \ + do_intrinsic(_aescrypt_decryptBlock, com_sun_crypto_provider_aescrypt, decryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \ + do_name( encryptBlock_name, "encryptBlock") \ + do_name( decryptBlock_name, "decryptBlock") \ + do_signature(byteArray_int_byteArray_int_signature, "([BI[BI)V") \ + \ + do_class(com_sun_crypto_provider_cipherBlockChaining, "com/sun/crypto/provider/CipherBlockChaining") \ + do_intrinsic(_cipherBlockChaining_encryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, encrypt_name,
byteArray_int_int_byteArray_int_signature, F_R) \ + do_intrinsic(_cipherBlockChaining_decryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, decrypt_name, byteArray_int_int_byteArray_int_signature, F_R) \ + do_name( encrypt_name, "encrypt") \ + do_name( decrypt_name, "decrypt") \ + do_signature(byteArray_int_int_byteArray_int_signature, "([BII[BI)V") \ + \ /* support for sun.misc.Unsafe */ \ do_class(sun_misc_Unsafe, "sun/misc/Unsafe") \ \ diff --git a/src/share/vm/oops/method.cpp b/src/share/vm/oops/method.cpp index 5a1032f77..9849829ea 100644 --- a/src/share/vm/oops/method.cpp +++ b/src/share/vm/oops/method.cpp @@ -1155,8 +1155,12 @@ methodHandle Method::clone_with_new_data(methodHandle m, u_char* new_code, int n vmSymbols::SID Method::klass_id_for_intrinsics(Klass* holder) { // if loader is not the default loader (i.e., != NULL), we can't know the intrinsics // because we are not loading from core libraries - if (InstanceKlass::cast(holder)->class_loader() != NULL) + // exception: the AES intrinsics come from lib/ext/sunjce_provider.jar + // which does not use the class default class loader so we check for its loader here + if ((InstanceKlass::cast(holder)->class_loader() != NULL) && + InstanceKlass::cast(holder)->class_loader()->klass()->name() != vmSymbols::sun_misc_Launcher_ExtClassLoader()) { return vmSymbols::NO_SID; // regardless of name, no intrinsics here + } // see if the klass name is well-known: Symbol* klass_name = InstanceKlass::cast(holder)->name(); diff --git a/src/share/vm/opto/callGenerator.cpp b/src/share/vm/opto/callGenerator.cpp index 547096b3d..93f2b859b 100644 --- a/src/share/vm/opto/callGenerator.cpp +++ b/src/share/vm/opto/callGenerator.cpp @@ -670,6 +670,129 @@ CallGenerator* CallGenerator::for_method_handle_inline(JVMState* jvms, ciMethod* } +//------------------------PredictedIntrinsicGenerator------------------------------ +// Internal class which handles all predicted Intrinsic calls. 
+class PredictedIntrinsicGenerator : public CallGenerator { + CallGenerator* _intrinsic; + CallGenerator* _cg; + +public: + PredictedIntrinsicGenerator(CallGenerator* intrinsic, + CallGenerator* cg) + : CallGenerator(cg->method()) + { + _intrinsic = intrinsic; + _cg = cg; + } + + virtual bool is_virtual() const { return true; } + virtual bool is_inlined() const { return true; } + virtual bool is_intrinsic() const { return true; } + + virtual JVMState* generate(JVMState* jvms); +}; + + +CallGenerator* CallGenerator::for_predicted_intrinsic(CallGenerator* intrinsic, + CallGenerator* cg) { + return new PredictedIntrinsicGenerator(intrinsic, cg); +} + + +JVMState* PredictedIntrinsicGenerator::generate(JVMState* jvms) { + GraphKit kit(jvms); + PhaseGVN& gvn = kit.gvn(); + + CompileLog* log = kit.C->log(); + if (log != NULL) { + log->elem("predicted_intrinsic bci='%d' method='%d'", + jvms->bci(), log->identify(method())); + } + + Node* slow_ctl = _intrinsic->generate_predicate(kit.sync_jvms()); + if (kit.failing()) + return NULL; // might happen because of NodeCountInliningCutoff + + SafePointNode* slow_map = NULL; + JVMState* slow_jvms; + if (slow_ctl != NULL) { + PreserveJVMState pjvms(&kit); + kit.set_control(slow_ctl); + if (!kit.stopped()) { + slow_jvms = _cg->generate(kit.sync_jvms()); + if (kit.failing()) + return NULL; // might happen because of NodeCountInliningCutoff + assert(slow_jvms != NULL, "must be"); + kit.add_exception_states_from(slow_jvms); + kit.set_map(slow_jvms->map()); + if (!kit.stopped()) + slow_map = kit.stop(); + } + } + + if (kit.stopped()) { + // Predicate is always false. + kit.set_jvms(slow_jvms); + return kit.transfer_exceptions_into_jvms(); + } + + // Generate intrinsic code: + JVMState* new_jvms = _intrinsic->generate(kit.sync_jvms()); + if (new_jvms == NULL) { + // Intrinsic failed, so use slow code or make a direct call. 
+ if (slow_map == NULL) { + CallGenerator* cg = CallGenerator::for_direct_call(method()); + new_jvms = cg->generate(kit.sync_jvms()); + } else { + kit.set_jvms(slow_jvms); + return kit.transfer_exceptions_into_jvms(); + } + } + kit.add_exception_states_from(new_jvms); + kit.set_jvms(new_jvms); + + // Need to merge slow and fast? + if (slow_map == NULL) { + // The fast path is the only path remaining. + return kit.transfer_exceptions_into_jvms(); + } + + if (kit.stopped()) { + // Intrinsic method threw an exception, so it's just the slow path after all. + kit.set_jvms(slow_jvms); + return kit.transfer_exceptions_into_jvms(); + } + + // Finish the diamond. + kit.C->set_has_split_ifs(true); // Has chance for split-if optimization + RegionNode* region = new (kit.C) RegionNode(3); + region->init_req(1, kit.control()); + region->init_req(2, slow_map->control()); + kit.set_control(gvn.transform(region)); + Node* iophi = PhiNode::make(region, kit.i_o(), Type::ABIO); + iophi->set_req(2, slow_map->i_o()); + kit.set_i_o(gvn.transform(iophi)); + kit.merge_memory(slow_map->merged_memory(), region, 2); + uint tos = kit.jvms()->stkoff() + kit.sp(); + uint limit = slow_map->req(); + for (uint i = TypeFunc::Parms; i < limit; i++) { + // Skip unused stack slots; fast forward to monoff(); + if (i == tos) { + i = kit.jvms()->monoff(); + if( i >= limit ) break; + } + Node* m = kit.map()->in(i); + Node* n = slow_map->in(i); + if (m != n) { + const Type* t = gvn.type(m)->meet(gvn.type(n)); + Node* phi = PhiNode::make(region, m, t); + phi->set_req(2, n); + kit.map()->set_req(i, gvn.transform(phi)); + } + } + return kit.transfer_exceptions_into_jvms(); +} + //-------------------------UncommonTrapCallGenerator----------------------------- // Internal class which handles all out-of-line calls checking receiver type. 
class UncommonTrapCallGenerator : public CallGenerator { diff --git a/src/share/vm/opto/callGenerator.hpp b/src/share/vm/opto/callGenerator.hpp index 3cfd39df6..ae59173bf 100644 --- a/src/share/vm/opto/callGenerator.hpp +++ b/src/share/vm/opto/callGenerator.hpp @@ -143,6 +143,9 @@ class CallGenerator : public ResourceObj { // Registry for intrinsics: static CallGenerator* for_intrinsic(ciMethod* m); static void register_intrinsic(ciMethod* m, CallGenerator* cg); + static CallGenerator* for_predicted_intrinsic(CallGenerator* intrinsic, + CallGenerator* cg); + virtual Node* generate_predicate(JVMState* jvms) { return NULL; }; static void print_inlining(ciMethod* callee, int inline_level, int bci, const char* msg) { if (PrintInlining) diff --git a/src/share/vm/opto/doCall.cpp b/src/share/vm/opto/doCall.cpp index 30a01f34b..95d148841 100644 --- a/src/share/vm/opto/doCall.cpp +++ b/src/share/vm/opto/doCall.cpp @@ -107,7 +107,17 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool // intrinsics handle strict f.p. correctly. if (allow_inline && allow_intrinsics) { CallGenerator* cg = find_intrinsic(callee, call_is_virtual); - if (cg != NULL) return cg; + if (cg != NULL) { + if (cg->is_predicted()) { + // Code without intrinsic but, hopefully, inlined. + CallGenerator* inline_cg = this->call_generator(callee, + vtable_index, call_is_virtual, jvms, allow_inline, prof_factor, false); + if (inline_cg != NULL) { + cg = CallGenerator::for_predicted_intrinsic(cg, inline_cg); + } + } + return cg; + } } // Do method handle calls. 
diff --git a/src/share/vm/opto/escape.cpp b/src/share/vm/opto/escape.cpp index 9fd318050..2fd6ad1ce 100644 --- a/src/share/vm/opto/escape.cpp +++ b/src/share/vm/opto/escape.cpp @@ -893,12 +893,16 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { arg_has_oops && (i > TypeFunc::Parms); #ifdef ASSERT if (!(is_arraycopy || - call->as_CallLeaf()->_name != NULL && - (strcmp(call->as_CallLeaf()->_name, "g1_wb_pre") == 0 || - strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 )) - ) { + (call->as_CallLeaf()->_name != NULL && + (strcmp(call->as_CallLeaf()->_name, "g1_wb_pre") == 0 || + strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 || + strcmp(call->as_CallLeaf()->_name, "aescrypt_encryptBlock") == 0 || + strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 || + strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0) + ))) { call->dump(); - assert(false, "EA: unexpected CallLeaf"); + fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name)); } #endif // Always process arraycopy's destination object since diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp index 4f24d88f1..6b90061ff 100644 --- a/src/share/vm/opto/library_call.cpp +++ b/src/share/vm/opto/library_call.cpp @@ -44,18 +44,22 @@ class LibraryIntrinsic : public InlineCallGenerator { public: private: bool _is_virtual; + bool _is_predicted; vmIntrinsics::ID _intrinsic_id; public: - LibraryIntrinsic(ciMethod* m, bool is_virtual, vmIntrinsics::ID id) + LibraryIntrinsic(ciMethod* m, bool is_virtual, bool is_predicted, vmIntrinsics::ID id) : InlineCallGenerator(m), _is_virtual(is_virtual), + _is_predicted(is_predicted), _intrinsic_id(id) { } virtual bool is_intrinsic() const { return true; } virtual bool is_virtual() const { return _is_virtual; } + virtual bool is_predicted() const { return _is_predicted; } virtual JVMState* 
generate(JVMState* jvms); + virtual Node* generate_predicate(JVMState* jvms); vmIntrinsics::ID intrinsic_id() const { return _intrinsic_id; } }; @@ -83,6 +87,7 @@ class LibraryCallKit : public GraphKit { int arg_size() const { return callee()->arg_size(); } bool try_to_inline(); + Node* try_to_predicate(); // Helper functions to inline natives void push_result(RegionNode* region, PhiNode* value); @@ -148,6 +153,7 @@ class LibraryCallKit : public GraphKit { CallJavaNode* generate_method_call_virtual(vmIntrinsics::ID method_id) { return generate_method_call(method_id, true, false); } + Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static); Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2); Node* make_string_method_node(int opcode, Node* str1, Node* str2); @@ -253,6 +259,10 @@ class LibraryCallKit : public GraphKit { bool inline_reverseBytes(vmIntrinsics::ID id); bool inline_reference_get(); + bool inline_aescrypt_Block(vmIntrinsics::ID id); + bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id); + Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); + Node* get_key_start_from_aescrypt_object(Node* aescrypt_object); }; @@ -306,6 +316,8 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { } } + bool is_predicted = false; + switch (id) { case vmIntrinsics::_compareTo: if (!SpecialStringCompareTo) return NULL; @@ -413,6 +425,18 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { break; #endif + case vmIntrinsics::_aescrypt_encryptBlock: + case vmIntrinsics::_aescrypt_decryptBlock: + if (!UseAESIntrinsics) return NULL; + break; + + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + if (!UseAESIntrinsics) return NULL; + // these two require the predicated logic + is_predicted = true; + break; + default: 
assert(id <= vmIntrinsics::LAST_COMPILER_INLINE, "caller responsibility"); assert(id != vmIntrinsics::_Object_init && id != vmIntrinsics::_invoke, "enum out of order?"); @@ -444,7 +468,7 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { if (!InlineUnsafeOps) return NULL; } - return new LibraryIntrinsic(m, is_virtual, (vmIntrinsics::ID) id); + return new LibraryIntrinsic(m, is_virtual, is_predicted, (vmIntrinsics::ID) id); } //----------------------register_library_intrinsics----------------------- @@ -496,6 +520,47 @@ JVMState* LibraryIntrinsic::generate(JVMState* jvms) { return NULL; } +Node* LibraryIntrinsic::generate_predicate(JVMState* jvms) { + LibraryCallKit kit(jvms, this); + Compile* C = kit.C; + int nodes = C->unique(); +#ifndef PRODUCT + assert(is_predicted(), "sanity"); + if ((PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) && Verbose) { + char buf[1000]; + const char* str = vmIntrinsics::short_name_as_C_string(intrinsic_id(), buf, sizeof(buf)); + tty->print_cr("Predicate for intrinsic %s", str); + } +#endif + + Node* slow_ctl = kit.try_to_predicate(); + if (!kit.failing()) { + if (C->log()) { + C->log()->elem("predicate_intrinsic id='%s'%s nodes='%d'", + vmIntrinsics::name_at(intrinsic_id()), + (is_virtual() ? " virtual='1'" : ""), + C->unique() - nodes); + } + return slow_ctl; // Could be NULL if the check folds. + } + + // The intrinsic bailed out + if (PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) { + if (jvms->has_method()) { + // Not a root compile. + const char* msg = "failed to generate predicate for intrinsic"; + CompileTask::print_inlining(kit.callee(), jvms->depth() - 1, kit.bci(), msg); + } else { + // Root compile + tty->print("Did not generate predicate for intrinsic %s%s at bci:%d in", + vmIntrinsics::name_at(intrinsic_id()), + (is_virtual() ? 
" (virtual)" : ""), kit.bci()); + } + } + C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_failed); + return NULL; +} + bool LibraryCallKit::try_to_inline() { // Handle symbolic names for otherwise undistinguished boolean switches: const bool is_store = true; @@ -767,6 +832,14 @@ bool LibraryCallKit::try_to_inline() { case vmIntrinsics::_Reference_get: return inline_reference_get(); + case vmIntrinsics::_aescrypt_encryptBlock: + case vmIntrinsics::_aescrypt_decryptBlock: + return inline_aescrypt_Block(intrinsic_id()); + + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt(intrinsic_id()); + default: // If you get here, it may be that someone has added a new intrinsic // to the list in vmSymbols.hpp without implementing it here. @@ -780,6 +853,36 @@ bool LibraryCallKit::try_to_inline() { } } +Node* LibraryCallKit::try_to_predicate() { + if (!jvms()->has_method()) { + // Root JVMState has a null method. + assert(map()->memory()->Opcode() == Op_Parm, ""); + // Insert the memory aliasing node + set_all_memory(reset_memory()); + } + assert(merged_memory(), ""); + + switch (intrinsic_id()) { + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt_predicate(false); + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt_predicate(true); + + default: + // If you get here, it may be that someone has added a new intrinsic + // to the list in vmSymbols.hpp without implementing it here. 
+#ifndef PRODUCT + if ((PrintMiscellaneous && (Verbose || WizardMode)) || PrintOpto) { + tty->print_cr("*** Warning: Unimplemented predicate for intrinsic %s(%d)", + vmIntrinsics::name_at(intrinsic_id()), intrinsic_id()); + } +#endif + Node* slow_ctl = control(); + set_control(top()); // No fast path intrinsic + return slow_ctl; + } +} + //------------------------------push_result------------------------------ // Helper function for finishing intrinsics. void LibraryCallKit::push_result(RegionNode* region, PhiNode* value) { @@ -5613,3 +5716,265 @@ bool LibraryCallKit::inline_reference_get() { push(result); return true; } + + +Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, + bool is_exact=true, bool is_static=false) { + + const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr(); + assert(tinst != NULL, "obj is null"); + assert(tinst->klass()->is_loaded(), "obj is not loaded"); + assert(!is_exact || tinst->klass_is_exact(), "klass not exact"); + + ciField* field = tinst->klass()->as_instance_klass()->get_field_by_name(ciSymbol::make(fieldName), + ciSymbol::make(fieldTypeString), + is_static); + if (field == NULL) return (Node *) NULL; + assert (field != NULL, "undefined field"); + + // Next code copied from Parse::do_get_xxx(): + + // Compute address and memory type. + int offset = field->offset_in_bytes(); + bool is_vol = field->is_volatile(); + ciType* field_klass = field->type(); + assert(field_klass->is_loaded(), "should be loaded"); + const TypePtr* adr_type = C->alias_type(field)->adr_type(); + Node *adr = basic_plus_adr(fromObj, fromObj, offset); + BasicType bt = field->layout_type(); + + // Build the resultant type of the load + const Type *type = TypeOopPtr::make_from_klass(field_klass->as_klass()); + + // Build the load. 
+ Node* loadedField = make_load(NULL, adr, type, bt, adr_type, is_vol); + return loadedField; +} + + +//------------------------------inline_aescrypt_Block----------------------- +bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) { + address stubAddr; + const char *stubName; + assert(UseAES, "need AES instruction support"); + + switch(id) { + case vmIntrinsics::_aescrypt_encryptBlock: + stubAddr = StubRoutines::aescrypt_encryptBlock(); + stubName = "aescrypt_encryptBlock"; + break; + case vmIntrinsics::_aescrypt_decryptBlock: + stubAddr = StubRoutines::aescrypt_decryptBlock(); + stubName = "aescrypt_decryptBlock"; + break; + } + if (stubAddr == NULL) return false; + + // Restore the stack and pop off the arguments. + int nargs = 5; // this + 2 oop/offset combos + assert(callee()->signature()->size() == nargs-1, "encryptBlock has 4 arguments"); + + Node *aescrypt_object = argument(0); + Node *src = argument(1); + Node *src_offset = argument(2); + Node *dest = argument(3); + Node *dest_offset = argument(4); + + // (1) src and dest are arrays. + const Type* src_type = src->Value(&_gvn); + const Type* dest_type = dest->Value(&_gvn); + const TypeAryPtr* top_src = src_type->isa_aryptr(); + const TypeAryPtr* top_dest = dest_type->isa_aryptr(); + assert (top_src != NULL && top_src->klass() != NULL && top_dest != NULL && top_dest->klass() != NULL, "args are strange"); + + // for the quick and dirty code we will skip all the checks. + // we are just trying to get the call to be generated. 
+ Node* src_start = src; + Node* dest_start = dest; + if (src_offset != NULL || dest_offset != NULL) { + assert(src_offset != NULL && dest_offset != NULL, ""); + src_start = array_element_address(src, src_offset, T_BYTE); + dest_start = array_element_address(dest, dest_offset, T_BYTE); + } + + // now need to get the start of its expanded key array + // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java + Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object); + if (k_start == NULL) return false; + + // Call the stub. + make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start); + + return true; +} + +//------------------------------inline_cipherBlockChaining_AESCrypt----------------------- +bool LibraryCallKit::inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id) { + address stubAddr; + const char *stubName; + + assert(UseAES, "need AES instruction support"); + + switch(id) { + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + stubAddr = StubRoutines::cipherBlockChaining_encryptAESCrypt(); + stubName = "cipherBlockChaining_encryptAESCrypt"; + break; + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + stubAddr = StubRoutines::cipherBlockChaining_decryptAESCrypt(); + stubName = "cipherBlockChaining_decryptAESCrypt"; + break; + } + if (stubAddr == NULL) return false; + + + // Restore the stack and pop off the arguments. + int nargs = 6; // this + oop/offset + len + oop/offset + assert(callee()->signature()->size() == nargs-1, "wrong number of arguments"); + Node *cipherBlockChaining_object = argument(0); + Node *src = argument(1); + Node *src_offset = argument(2); + Node *len = argument(3); + Node *dest = argument(4); + Node *dest_offset = argument(5); + + // (1) src and dest are arrays. 
+ const Type* src_type = src->Value(&_gvn); + const Type* dest_type = dest->Value(&_gvn); + const TypeAryPtr* top_src = src_type->isa_aryptr(); + const TypeAryPtr* top_dest = dest_type->isa_aryptr(); + assert (top_src != NULL && top_src->klass() != NULL + && top_dest != NULL && top_dest->klass() != NULL, "args are strange"); + + // checks are the responsibility of the caller + Node* src_start = src; + Node* dest_start = dest; + if (src_offset != NULL || dest_offset != NULL) { + assert(src_offset != NULL && dest_offset != NULL, ""); + src_start = array_element_address(src, src_offset, T_BYTE); + dest_start = array_element_address(dest, dest_offset, T_BYTE); + } + + // if we are in this set of code, we "know" the embeddedCipher is an AESCrypt object + // (because of the predicated logic executed earlier). + // so we cast it here safely. + // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java + + Node* embeddedCipherObj = load_field_from_object(cipherBlockChaining_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); + if (embeddedCipherObj == NULL) return false; + + // cast it to what we know it will be at runtime + const TypeInstPtr* tinst = _gvn.type(cipherBlockChaining_object)->isa_instptr(); + assert(tinst != NULL, "CBC obj is null"); + assert(tinst->klass()->is_loaded(), "CBC obj is not loaded"); + ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); + if (!klass_AESCrypt->is_loaded()) return false; + + ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); + const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt); + const TypeOopPtr* xtype = aklass->as_instance_type(); + Node* aescrypt_object = new(C) CheckCastPPNode(control(), embeddedCipherObj, xtype); + aescrypt_object = _gvn.transform(aescrypt_object); + + // we need to get the start of the aescrypt_object's 
expanded key array + Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object); + if (k_start == NULL) return false; + + // similarly, get the start address of the r vector + Node* objRvec = load_field_from_object(cipherBlockChaining_object, "r", "[B", /*is_exact*/ false); + if (objRvec == NULL) return false; + Node* r_start = array_element_address(objRvec, intcon(0), T_BYTE); + + // Call the stub, passing src_start, dest_start, k_start, r_start and src_len + make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::cipherBlockChaining_aescrypt_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start, r_start, len); + + // return is void so no result needs to be pushed + + return true; +} + +//------------------------------get_key_start_from_aescrypt_object----------------------- +Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) { + Node* objAESCryptKey = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false); + assert (objAESCryptKey != NULL, "wrong version of com.sun.crypto.provider.AESCrypt"); + if (objAESCryptKey == NULL) return (Node *) NULL; + + // now have the array, need to get the start address of the K array + Node* k_start = array_element_address(objAESCryptKey, intcon(0), T_INT); + return k_start; +} + +//----------------------------inline_cipherBlockChaining_AESCrypt_predicate---------------------------- +// Return node representing slow path of predicate check. +// the pseudo code we want to emulate with this predicate is: +// for encryption: +// if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath +// for decryption: +// if ((embeddedCipherObj instanceof AESCrypt) && (cipher!=plain)) do_intrinsic, else do_javapath +// note cipher==plain is more conservative than the original java code but that's OK +// +Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting) { + // First, check receiver for NULL since it is virtual method. 
+ int nargs = arg_size(); + Node* objCBC = argument(0); + _sp += nargs; + objCBC = do_null_check(objCBC, T_OBJECT); + _sp -= nargs; + + if (stopped()) return NULL; // Always NULL + + // Load embeddedCipher field of CipherBlockChaining object. + Node* embeddedCipherObj = load_field_from_object(objCBC, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); + + // get AESCrypt klass for instanceOf check + // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point + // will have same classloader as CipherBlockChaining object + const TypeInstPtr* tinst = _gvn.type(objCBC)->isa_instptr(); + assert(tinst != NULL, "CBCobj is null"); + assert(tinst->klass()->is_loaded(), "CBCobj is not loaded"); + + // we want to do an instanceof comparison against the AESCrypt class + ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); + if (!klass_AESCrypt->is_loaded()) { + // if AESCrypt is not even loaded, we never take the intrinsic fast path + Node* ctrl = control(); + set_control(top()); // no regular fast path + return ctrl; + } + ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); + + _sp += nargs; // gen_instanceof might do an uncommon trap + Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt))); + _sp -= nargs; + Node* cmp_instof = _gvn.transform(new (C) CmpINode(instof, intcon(1))); + Node* bool_instof = _gvn.transform(new (C) BoolNode(cmp_instof, BoolTest::ne)); + + Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN); + + // for encryption, we are done + if (!decrypting) + return instof_false; // even if it is NULL + + // for decryption, we need to add a further check to avoid + // taking the intrinsic path when cipher and plain are the same + // see the original java code for why. 
+ RegionNode* region = new(C) RegionNode(3); + region->init_req(1, instof_false); + Node* src = argument(1); + Node *dest = argument(4); + Node* cmp_src_dest = _gvn.transform(new (C) CmpPNode(src, dest)); + Node* bool_src_dest = _gvn.transform(new (C) BoolNode(cmp_src_dest, BoolTest::eq)); + Node* src_dest_conjoint = generate_guard(bool_src_dest, NULL, PROB_MIN); + region->init_req(2, src_dest_conjoint); + + record_for_igvn(region); + return _gvn.transform(region); + +} + + diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp index bb050533d..51987e25e 100644 --- a/src/share/vm/opto/runtime.cpp +++ b/src/share/vm/opto/runtime.cpp @@ -811,6 +811,48 @@ const TypeFunc* OptoRuntime::array_fill_Type() { return TypeFunc::make(domain, range); } +// for aescrypt encrypt/decrypt operations, just three pointers returning void (length is constant) +const TypeFunc* OptoRuntime::aescrypt_block_Type() { + // create input type (domain) + int num_args = 3; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // src + fields[argp++] = TypePtr::NOTNULL; // dest + fields[argp++] = TypePtr::NOTNULL; // k array + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + +// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning void +const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { + // create input type (domain) + int num_args = 5; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // src + fields[argp++] = TypePtr::NOTNULL; // dest 
+ fields[argp++] = TypePtr::NOTNULL; // k array + fields[argp++] = TypePtr::NOTNULL; // r array + fields[argp++] = TypeInt::INT; // src len + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + //------------- Interpreter state access for on stack replacement const TypeFunc* OptoRuntime::osr_end_Type() { // create input type (domain) diff --git a/src/share/vm/opto/runtime.hpp b/src/share/vm/opto/runtime.hpp index c70777267..13da255b7 100644 --- a/src/share/vm/opto/runtime.hpp +++ b/src/share/vm/opto/runtime.hpp @@ -280,6 +280,9 @@ private: static const TypeFunc* array_fill_Type(); + static const TypeFunc* aescrypt_block_Type(); + static const TypeFunc* cipherBlockChaining_aescrypt_Type(); + // leaf on stack replacement interpreter accessor types static const TypeFunc* osr_end_Type(); diff --git a/src/share/vm/runtime/globals.hpp b/src/share/vm/runtime/globals.hpp index e073cbcd0..42c3cdf6d 100644 --- a/src/share/vm/runtime/globals.hpp +++ b/src/share/vm/runtime/globals.hpp @@ -533,6 +533,9 @@ class CommandLineFlags { product(intx, UseSSE, 99, \ "Highest supported SSE instructions set on x86/x64") \ \ + product(bool, UseAES, false, \ + "Control whether AES instructions can be used on x86/x64") \ + \ product(uintx, LargePageSizeInBytes, 0, \ "Large page size (0 to let VM choose the page size") \ \ @@ -635,6 +638,9 @@ class CommandLineFlags { product(bool, UseSSE42Intrinsics, false, \ "SSE4.2 versions of intrinsics") \ \ + product(bool, UseAESIntrinsics, false, \ + "use intrinsics for AES versions of crypto") \ + \ develop(bool, TraceCallFixup, false, \ "traces all call fixups") \ \ diff --git a/src/share/vm/runtime/stubRoutines.cpp 
b/src/share/vm/runtime/stubRoutines.cpp index 5ca4ba599..98d428abd 100644 --- a/src/share/vm/runtime/stubRoutines.cpp +++ b/src/share/vm/runtime/stubRoutines.cpp @@ -120,6 +120,10 @@ address StubRoutines::_arrayof_jbyte_fill; address StubRoutines::_arrayof_jshort_fill; address StubRoutines::_arrayof_jint_fill; +address StubRoutines::_aescrypt_encryptBlock = NULL; +address StubRoutines::_aescrypt_decryptBlock = NULL; +address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL; +address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL; double (* StubRoutines::_intrinsic_log )(double) = NULL; double (* StubRoutines::_intrinsic_log10 )(double) = NULL; diff --git a/src/share/vm/runtime/stubRoutines.hpp b/src/share/vm/runtime/stubRoutines.hpp index 0e583aea0..91f273e65 100644 --- a/src/share/vm/runtime/stubRoutines.hpp +++ b/src/share/vm/runtime/stubRoutines.hpp @@ -199,6 +199,11 @@ class StubRoutines: AllStatic { // zero heap space aligned to jlong (8 bytes) static address _zero_aligned_words; + static address _aescrypt_encryptBlock; + static address _aescrypt_decryptBlock; + static address _cipherBlockChaining_encryptAESCrypt; + static address _cipherBlockChaining_decryptAESCrypt; + // These are versions of the java.lang.Math methods which perform // the same operations as the intrinsic version. They are used for // constant folding in the compiler to ensure equivalence. 
If the @@ -330,6 +335,11 @@ class StubRoutines: AllStatic { static address arrayof_jshort_fill() { return _arrayof_jshort_fill; } static address arrayof_jint_fill() { return _arrayof_jint_fill; } + static address aescrypt_encryptBlock() { return _aescrypt_encryptBlock; } + static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; } + static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; } + static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; } + static address select_fill_function(BasicType t, bool aligned, const char* &name); static address zero_aligned_words() { return _zero_aligned_words; } diff --git a/test/compiler/7184394/TestAESBase.java b/test/compiler/7184394/TestAESBase.java new file mode 100644 index 000000000..ad6c835cc --- /dev/null +++ b/test/compiler/7184394/TestAESBase.java @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @author Tom Deneau + */ + +import javax.crypto.Cipher; +import javax.crypto.KeyGenerator; +import javax.crypto.SecretKey; +import javax.crypto.spec.IvParameterSpec; +import javax.crypto.spec.SecretKeySpec; +import java.security.AlgorithmParameters; + +import java.util.Random; +import java.util.Arrays; + +abstract public class TestAESBase { + int msgSize = Integer.getInteger("msgSize", 646); + boolean checkOutput = Boolean.getBoolean("checkOutput"); + boolean noReinit = Boolean.getBoolean("noReinit"); + int keySize = Integer.getInteger("keySize", 128); + String algorithm = System.getProperty("algorithm", "AES"); + String mode = System.getProperty("mode", "CBC"); + byte[] input; + byte[] encode; + byte[] expectedEncode; + byte[] decode; + byte[] expectedDecode; + Random random = new Random(0); + Cipher cipher; + Cipher dCipher; + String paddingStr = "PKCS5Padding"; + AlgorithmParameters algParams; + SecretKey key; + int ivLen; + + static int numThreads = 0; + int threadId; + static synchronized int getThreadId() { + int id = numThreads; + numThreads++; + return id; + } + + abstract public void run(); + + public void prepare() { + try { + System.out.println("\nmsgSize=" + msgSize + ", key size=" + keySize + ", reInit=" + !noReinit + ", checkOutput=" + checkOutput); + + int keyLenBytes = (keySize == 0 ? 16 : keySize/8); + byte keyBytes[] = new byte[keyLenBytes]; + if (keySize == 128) + keyBytes = new byte[] {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}; + else + random.nextBytes(keyBytes); + + key = new SecretKeySpec(keyBytes, algorithm); + if (threadId == 0) { + System.out.println("Algorithm: " + key.getAlgorithm() + "(" + + key.getEncoded().length * 8 + "bit)"); + } + input = new byte[msgSize]; + for (int i=0; i 0 ? Integer.valueOf(args[0]) : 1000000); + System.out.println(iters + " iterations"); + TestAESEncode etest = new TestAESEncode(); + etest.prepare(); + long start = System.nanoTime(); + for (int i=0; i