提交 32ef56e0 编写于 作者: K kvn

8004835: Improve AES intrinsics on x86

Summary: Enable AES intrinsics on non-AVX CPUs and group the AES instructions together in the crypto stubs.
Reviewed-by: roland, twisti
上级 073672b9
......@@ -2393,7 +2393,6 @@ void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
void Assembler::pshufb(XMMRegister dst, Address src) {
assert(VM_Version::supports_ssse3(), "");
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
InstructionMark im(this);
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
emit_byte(0x00);
......
......@@ -3085,7 +3085,8 @@ void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
// Used in sign-bit flipping with aligned address.
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
if (reachable(src)) {
Assembler::pshufb(dst, as_Address(src));
} else {
......
......@@ -2174,13 +2174,13 @@ class StubGenerator: public StubCodeGenerator {
// c_rarg2 - K (key) in little endian int array
//
address generate_aescrypt_encryptBlock() {
assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
Label L_doLast;
address start = __ pc();
const Register from = rsi; // source array address
const Register from = rdx; // source array address
const Register to = rdx; // destination array address
const Register key = rcx; // key array address
const Register keylen = rax;
......@@ -2189,47 +2189,74 @@ class StubGenerator: public StubCodeGenerator {
const Address key_param (rbp, 8+8);
const XMMRegister xmm_result = xmm0;
const XMMRegister xmm_temp = xmm1;
const XMMRegister xmm_key_shuf_mask = xmm2;
const XMMRegister xmm_key_shuf_mask = xmm1;
const XMMRegister xmm_temp1 = xmm2;
const XMMRegister xmm_temp2 = xmm3;
const XMMRegister xmm_temp3 = xmm4;
const XMMRegister xmm_temp4 = xmm5;
__ enter(); // required for proper stackwalking of RuntimeStub frame
__ push(rsi);
__ movptr(from , from_param);
__ movptr(to , to_param);
__ movptr(key , key_param);
__ enter(); // required for proper stackwalking of RuntimeStub frame
__ movptr(from, from_param);
__ movptr(key, key_param);
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// keylen = # of 32-bit words, convert to 128-bit words
__ shrl(keylen, 2);
__ subl(keylen, 11); // every key has at least 11 128-bit words, some have more
__ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
__ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
__ movptr(to, to_param);
// For encryption, the java expanded key ordering is just what we need
load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
__ pxor(xmm_result, xmm_temp);
for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
}
load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
__ cmpl(keylen, 0);
__ jcc(Assembler::equal, L_doLast);
__ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys
aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
__ subl(keylen, 2);
__ jcc(Assembler::equal, L_doLast);
__ aesenc(xmm_result, xmm_temp); // only in 256 bit keys
aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
__ pxor(xmm_result, xmm_temp1);
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
__ aesenc(xmm_result, xmm_temp1);
__ aesenc(xmm_result, xmm_temp2);
__ aesenc(xmm_result, xmm_temp3);
__ aesenc(xmm_result, xmm_temp4);
load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
__ aesenc(xmm_result, xmm_temp1);
__ aesenc(xmm_result, xmm_temp2);
__ aesenc(xmm_result, xmm_temp3);
__ aesenc(xmm_result, xmm_temp4);
load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
__ cmpl(keylen, 44);
__ jccb(Assembler::equal, L_doLast);
__ aesenc(xmm_result, xmm_temp1);
__ aesenc(xmm_result, xmm_temp2);
load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
__ cmpl(keylen, 52);
__ jccb(Assembler::equal, L_doLast);
__ aesenc(xmm_result, xmm_temp1);
__ aesenc(xmm_result, xmm_temp2);
load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
__ BIND(L_doLast);
__ aesenclast(xmm_result, xmm_temp);
__ aesenc(xmm_result, xmm_temp1);
__ aesenclast(xmm_result, xmm_temp2);
__ movdqu(Address(to, 0), xmm_result); // store the result
__ xorptr(rax, rax); // return 0
__ pop(rsi);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
......@@ -2245,13 +2272,13 @@ class StubGenerator: public StubCodeGenerator {
// c_rarg2 - K (key) in little endian int array
//
address generate_aescrypt_decryptBlock() {
assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
Label L_doLast;
address start = __ pc();
const Register from = rsi; // source array address
const Register from = rdx; // source array address
const Register to = rdx; // destination array address
const Register key = rcx; // key array address
const Register keylen = rax;
......@@ -2260,51 +2287,76 @@ class StubGenerator: public StubCodeGenerator {
const Address key_param (rbp, 8+8);
const XMMRegister xmm_result = xmm0;
const XMMRegister xmm_temp = xmm1;
const XMMRegister xmm_key_shuf_mask = xmm2;
const XMMRegister xmm_key_shuf_mask = xmm1;
const XMMRegister xmm_temp1 = xmm2;
const XMMRegister xmm_temp2 = xmm3;
const XMMRegister xmm_temp3 = xmm4;
const XMMRegister xmm_temp4 = xmm5;
__ enter(); // required for proper stackwalking of RuntimeStub frame
__ push(rsi);
__ movptr(from , from_param);
__ movptr(to , to_param);
__ movptr(key , key_param);
__ movptr(from, from_param);
__ movptr(key, key_param);
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// keylen = # of 32-bit words, convert to 128-bit words
__ shrl(keylen, 2);
__ subl(keylen, 11); // every key has at least 11 128-bit words, some have more
__ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
__ movdqu(xmm_result, Address(from, 0));
__ movptr(to, to_param);
// for decryption java expanded key ordering is rotated one position from what we want
// so we start from 0x10 here and hit 0x00 last
// we don't know if the key is aligned, hence not using load-execute form
load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
__ pxor (xmm_result, xmm_temp);
for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
}
__ cmpl(keylen, 0);
__ jcc(Assembler::equal, L_doLast);
// only in 192 and 256 bit keys
aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
__ subl(keylen, 2);
__ jcc(Assembler::equal, L_doLast);
// only in 256 bit keys
aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
__ pxor (xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
__ aesdec(xmm_result, xmm_temp3);
__ aesdec(xmm_result, xmm_temp4);
load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
__ aesdec(xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
__ aesdec(xmm_result, xmm_temp3);
__ aesdec(xmm_result, xmm_temp4);
load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
__ cmpl(keylen, 44);
__ jccb(Assembler::equal, L_doLast);
__ aesdec(xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
__ cmpl(keylen, 52);
__ jccb(Assembler::equal, L_doLast);
__ aesdec(xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
__ BIND(L_doLast);
// for decryption the aesdeclast operation is always on key+0x00
load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
__ aesdeclast(xmm_result, xmm_temp);
__ aesdec(xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
// for decryption the aesdeclast operation is always on key+0x00
__ aesdeclast(xmm_result, xmm_temp3);
__ movdqu(Address(to, 0), xmm_result); // store the result
__ xorptr(rax, rax); // return 0
__ pop(rsi);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
......@@ -2340,7 +2392,7 @@ class StubGenerator: public StubCodeGenerator {
// c_rarg4 - input length
//
address generate_cipherBlockChaining_encryptAESCrypt() {
assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
address start = __ pc();
......@@ -2393,7 +2445,7 @@ class StubGenerator: public StubCodeGenerator {
__ jcc(Assembler::notEqual, L_key_192_256);
// 128 bit code follows here
__ movptr(pos, 0);
__ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_128);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
......@@ -2423,15 +2475,15 @@ class StubGenerator: public StubCodeGenerator {
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
__ BIND(L_key_192_256);
// here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
__ BIND(L_key_192_256);
// here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
__ cmpl(rax, 52);
__ jcc(Assembler::notEqual, L_key_256);
// 192-bit code follows here (could be changed to use more xmm registers)
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_192);
__ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_192);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
......@@ -2452,11 +2504,11 @@ class StubGenerator: public StubCodeGenerator {
__ jcc(Assembler::notEqual, L_loopTop_192);
__ jmp(L_exit);
__ BIND(L_key_256);
__ BIND(L_key_256);
// 256-bit code follows here (could be changed to use more xmm registers)
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_256);
__ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_256);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
......@@ -2495,7 +2547,7 @@ class StubGenerator: public StubCodeGenerator {
//
address generate_cipherBlockChaining_decryptAESCrypt() {
assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
address start = __ pc();
......@@ -2556,9 +2608,9 @@ class StubGenerator: public StubCodeGenerator {
// 128-bit code follows here, parallelized
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop_128);
__ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop_128);
__ cmpptr(len_reg, 0); // any blocks left??
__ jcc(Assembler::equal, L_exit);
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
......@@ -2597,7 +2649,7 @@ class StubGenerator: public StubCodeGenerator {
__ jcc(Assembler::notEqual, L_key_256);
// 192-bit code follows here (could be optimized to use parallelism)
__ movptr(pos, 0);
__ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop_192);
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
......@@ -2622,7 +2674,7 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_key_256);
// 256-bit code follows here (could be optimized to use parallelism)
__ movptr(pos, 0);
__ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop_256);
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
......
......@@ -489,8 +489,8 @@ void VM_Version::get_processor_features() {
}
// The AES intrinsic stubs require AES instruction support (of course)
// but also require AVX and SSE3 modes for the instructions they use.
if (UseAES && (UseAVX > 0) && (UseSSE > 2)) {
// but also require SSE3 mode for the instructions they use.
if (UseAES && (UseSSE > 2)) {
if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
UseAESIntrinsics = true;
}
......
......@@ -54,7 +54,6 @@ abstract public class TestAESBase {
String paddingStr = "PKCS5Padding";
AlgorithmParameters algParams;
SecretKey key;
int ivLen;
static int numThreads = 0;
int threadId;
......@@ -68,7 +67,7 @@ abstract public class TestAESBase {
public void prepare() {
try {
System.out.println("\nmsgSize=" + msgSize + ", key size=" + keySize + ", reInit=" + !noReinit + ", checkOutput=" + checkOutput);
System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput);
int keyLenBytes = (keySize == 0 ? 16 : keySize/8);
byte keyBytes[] = new byte[keyLenBytes];
......@@ -90,10 +89,14 @@ abstract public class TestAESBase {
cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
if (mode.equals("CBC")) {
int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
} else {
algParams = cipher.getParameters();
cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
}
algParams = cipher.getParameters();
dCipher.init(Cipher.DECRYPT_MODE, key, algParams);
if (threadId == 0) {
......
......@@ -27,7 +27,8 @@
* @bug 7184394
* @summary add intrinsics to use AES instructions
*
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB TestAESMain
*
* @author Tom Deneau
*/
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册