Commit de15436a authored by asaha

Merge

......@@ -810,6 +810,12 @@ da43260704c28b9f19cb652090ae65c258220fd6 jdk8u72-b31
c1031a924f2c910fad078838b88a2f0146f2de98 jdk8u74-b01
ca9cae9aa9e989bbe6713c91d55c913edeaecce4 jdk8u74-b02
a5b78b56841e97ce00463874f1b7f63c54d84934 jdk8u74-b31
94ec11846b18111e73929b6caa9fbe7262e142c1 jdk8u74-b32
1b6d4fd2730e58f17820930f797938dc182117c4 jdk8u77-b00
ddd297e340b1170d3cec011ee64e729f8b493c86 jdk8u77-b01
1b4072e4bb3ad54c4e894998486a8b33f0689160 jdk8u77-b02
223b64a19e94222dd97b92bb40abcfbc0bf6ef1f jdk8u77-b03
dd8507f51d786572dae18af8ffdc5a1ea34c755e jdk8u77-b31
94ec11846b18111e73929b6caa9fbe7262e142c1 jdk8u74-b32
da43260704c28b9f19cb652090ae65c258220fd6 jdk8u72-b31
......@@ -832,6 +838,7 @@ ddd297e340b1170d3cec011ee64e729f8b493c86 jdk8u77-b01
223b64a19e94222dd97b92bb40abcfbc0bf6ef1f jdk8u77-b03
bbbb05e91c629f8d9eef2ba43933767f68a898b0 jdk8u91-b00
e36b6ade0499eadfd8673fe62ef0a613af2e6d67 jdk8u91-b13
fa8991ccf6e5b74890a0b5672440b3c09d8d8732 jdk8u91-b14
d7b01fb81aa8a5437cb03bc36afe15cf0e55fb89 jdk8u76-b00
c1679cc87ba045219169cabb6b9b378c2b5cc578 jdk8u76-b01
218483967e52b419d885d34af4488a81c5133804 jdk8u76-b02
......@@ -847,5 +854,6 @@ a2b0ee820059a44be558a2d435b7d85ed5a8b63a jdk8u76-b10
9a87701e22b3cae79fdfd8cdb732051e02a710fa jdk8u76-b12
481dcde745b6aec035781ed9f6797cfc93719f71 jdk8u92-b00
f3e1e734e2d29101a9537ddeb71ecad413fcd352 jdk8u92-b13
24a09407d71bb2cc4848bfa21660c890b4d722b1 jdk8u92-b14
b374548dcb4834eb8731a06b52faddd0f10bd45d jdk8u81-b00
ead07188d11107e877e8e4ad215ff6cb238a8a92 jdk8u101-b01
......@@ -2318,6 +2318,13 @@ void Assembler::orl(Register dst, Register src) {
emit_arith(0x0B, 0xC0, dst, src);
}
void Assembler::orl(Address dst, Register src) {
InstructionMark im(this);
prefix(dst, src);
emit_int8(0x09);
emit_operand(src, dst);
}
void Assembler::packuswb(XMMRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
......@@ -5613,6 +5620,19 @@ void Assembler::rclq(Register dst, int imm8) {
}
}
void Assembler::rcrq(Register dst, int imm8) {
assert(isShiftCount(imm8 >> 1), "illegal shift count");
int encode = prefixq_and_encode(dst->encoding());
if (imm8 == 1) {
emit_int8((unsigned char)0xD1);
emit_int8((unsigned char)(0xD8 | encode));
} else {
emit_int8((unsigned char)0xC1);
emit_int8((unsigned char)(0xD8 | encode));
emit_int8(imm8);
}
}
void Assembler::rorq(Register dst, int imm8) {
assert(isShiftCount(imm8 >> 1), "illegal shift count");
int encode = prefixq_and_encode(dst->encoding());
......
......@@ -1455,6 +1455,7 @@ private:
void orl(Register dst, int32_t imm32);
void orl(Register dst, Address src);
void orl(Register dst, Register src);
void orl(Address dst, Register src);
void orq(Address dst, int32_t imm32);
void orq(Register dst, int32_t imm32);
......@@ -1555,6 +1556,8 @@ private:
void rclq(Register dst, int imm8);
void rcrq(Register dst, int imm8);
void rdtsc();
void ret(int imm16);
......
......@@ -7769,6 +7769,503 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Regi
pop(tmp2);
pop(tmp1);
}
//Helper functions for square_to_len()
/**
* Store the squares of x[], right shifted one bit (divided by 2), into z[].
* Preserves x and z and modifies the rest of the registers.
*/
void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
// Perform square and right shift by 1
// Handle odd xlen case first, then for even xlen do the following
// jlong carry = 0;
// for (int j=0, i=0; j < xlen; j+=2, i+=4) {
// huge_128 product = x[j:j+1] * x[j:j+1];
// z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
// z[i+2:i+3] = (jlong)(product >>> 1);
// carry = (jlong)product;
// }
xorq(tmp5, tmp5); // carry
xorq(rdxReg, rdxReg);
xorl(tmp1, tmp1); // index for x
xorl(tmp4, tmp4); // index for z
Label L_first_loop, L_first_loop_exit;
testl(xlen, 1);
jccb(Assembler::zero, L_first_loop); //jump if xlen is even
// Square and right shift by 1 the odd element using 32 bit multiply
movl(raxReg, Address(x, tmp1, Address::times_4, 0));
imulq(raxReg, raxReg);
shrq(raxReg, 1);
adcq(tmp5, 0);
movq(Address(z, tmp4, Address::times_4, 0), raxReg);
incrementl(tmp1);
addl(tmp4, 2);
// Square and right shift by 1 the rest using 64 bit multiply
bind(L_first_loop);
cmpptr(tmp1, xlen);
jccb(Assembler::equal, L_first_loop_exit);
// Square
movq(raxReg, Address(x, tmp1, Address::times_4, 0));
rorq(raxReg, 32); // convert big-endian to little-endian
mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
// Right shift by 1 and save carry
shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
rcrq(rdxReg, 1);
rcrq(raxReg, 1);
adcq(tmp5, 0);
// Store result in z
movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
movq(Address(z, tmp4, Address::times_4, 8), raxReg);
// Update indices for x and z
addl(tmp1, 2);
addl(tmp4, 4);
jmp(L_first_loop);
bind(L_first_loop_exit);
}
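For readers following the pseudocode in the comment above, here is a rough plain-Java rendering of the same loop. This is an illustrative sketch, not HotSpot or BigInteger source: the class and method names are invented, BigInteger stands in for the 128-bit huge_128 product, each long holds one 64-bit chunk of x in most-significant-first order, and the odd-length prefix handled separately by the assembly is ignored.
import java.math.BigInteger;
class SquareRshiftSketch {
    static final BigInteger U64 = BigInteger.ONE.shiftLeft(64).subtract(BigInteger.ONE);
    // z must have 2*x.length elements; both arrays are most-significant-first.
    static void squareRshift(long[] x, long[] z) {
        long carry = 0;                                       // bit shifted out of the previous product
        for (int j = 0, i = 0; j < x.length; j++, i += 2) {
            BigInteger v = BigInteger.valueOf(x[j]).and(U64); // treat the 64-bit chunk as unsigned
            BigInteger product = v.multiply(v);               // the 128-bit square
            z[i]     = (carry << 63) | product.shiftRight(65).longValue();
            z[i + 1] = product.shiftRight(1).longValue();
            carry    = product.longValue() & 1;               // low bit feeds the next pair
        }
    }
}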
/**
* Perform the following multiply add operation using BMI2 instructions
* carry:sum = sum + op1*op2 + carry
* op2 should be in rdx
* op2 is preserved, all other registers are modified
*/
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
// assert op2 is rdx
mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
addq(sum, carry);
adcq(tmp2, 0);
addq(sum, op1);
adcq(tmp2, 0);
movq(carry, tmp2);
}
/**
* Perform the following multiply add operation:
* carry:sum = sum + op1*op2 + carry
* Preserves op1 and op2; modifies the rest of the registers
*/
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
// rdx:rax = op1 * op2
movq(raxReg, op2);
mulq(op1);
// rdx:rax = sum + carry + rdx:rax
addq(sum, carry);
adcq(rdxReg, 0);
addq(sum, raxReg);
adcq(rdxReg, 0);
// carry:sum = rdx:sum
movq(carry, rdxReg);
}
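Both multiply_add_64 variants implement the same word-level step, carry:sum = sum + op1*op2 + carry. A minimal plain-Java model of that step, using BigInteger to emulate the unsigned 128-bit intermediate (the class and method names are invented for illustration):
import java.math.BigInteger;
class MultiplyAdd64Sketch {
    static final BigInteger U64 = BigInteger.ONE.shiftLeft(64).subtract(BigInteger.ONE);
    // Returns {newSum, newCarry} such that newCarry:newSum == sum + op1*op2 + carry,
    // with all 64-bit inputs treated as unsigned. The result always fits in 128 bits.
    static long[] multiplyAdd64(long sum, long op1, long op2, long carry) {
        BigInteger t = BigInteger.valueOf(op1).and(U64)
                .multiply(BigInteger.valueOf(op2).and(U64))
                .add(BigInteger.valueOf(sum).and(U64))
                .add(BigInteger.valueOf(carry).and(U64));
        return new long[] { t.longValue(), t.shiftRight(64).longValue() };
    }
}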
/**
* Add 64 bit long carry into z[] with carry propagation.
* Preserves z and carry register values and modifies the rest of the registers.
*
*/
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
Label L_fourth_loop, L_fourth_loop_exit;
movl(tmp1, 1);
subl(zlen, 2);
addq(Address(z, zlen, Address::times_4, 0), carry);
bind(L_fourth_loop);
jccb(Assembler::carryClear, L_fourth_loop_exit);
subl(zlen, 2);
jccb(Assembler::negative, L_fourth_loop_exit);
addq(Address(z, zlen, Address::times_4, 0), tmp1);
jmp(L_fourth_loop);
bind(L_fourth_loop_exit);
}
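The same carry ripple, sketched in Java over a most-significant-first array of 64-bit words; the unsigned-overflow tests stand in for the x86 carry flag (names are invented for illustration):
class AddOne64Sketch {
    // Add an unsigned 64-bit carry at the least significant word of z (stored
    // most-significant-first) and ripple it upward while the addition overflows.
    static void addOne64(long[] z, long carry) {
        int i = z.length - 1;
        z[i] += carry;
        boolean overflow = Long.compareUnsigned(z[i], carry) < 0;  // carry out of the add?
        while (overflow && --i >= 0) {
            z[i] += 1;
            overflow = (z[i] == 0);                                // wrapped around to zero?
        }
    }
}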
/**
* Shift z[] left by 1 bit.
* Preserves the x, len, z and zlen registers and modifies the rest of the registers.
*
*/
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
Label L_fifth_loop, L_fifth_loop_exit;
// Fifth loop
// Perform primitiveLeftShift(z, zlen, 1)
const Register prev_carry = tmp1;
const Register new_carry = tmp4;
const Register value = tmp2;
const Register zidx = tmp3;
// int zidx, carry;
// long value;
// carry = 0;
// for (zidx = zlen-2; zidx >=0; zidx -= 2) {
// (carry:value) = (z[i] << 1) | carry ;
// z[i] = value;
// }
movl(zidx, zlen);
xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
bind(L_fifth_loop);
decl(zidx); // Use decl to preserve carry flag
decl(zidx);
jccb(Assembler::negative, L_fifth_loop_exit);
if (UseBMI2Instructions) {
movq(value, Address(z, zidx, Address::times_4, 0));
rclq(value, 1);
rorxq(value, value, 32);
movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
}
else {
// clear new_carry
xorl(new_carry, new_carry);
// Shift z[i] by 1, or in previous carry and save new carry
movq(value, Address(z, zidx, Address::times_4, 0));
shlq(value, 1);
adcl(new_carry, 0);
orq(value, prev_carry);
rorq(value, 0x20);
movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
// Set previous carry = new carry
movl(prev_carry, new_carry);
}
jmp(L_fifth_loop);
bind(L_fifth_loop_exit);
}
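A Java equivalent of this fifth loop, shifting a most-significant-first array of 64-bit words left by one bit; the hardware version additionally has to juggle the 32-bit word order inside each 64-bit load (names are invented for illustration):
class LeftShiftSketch {
    // Shift a most-significant-first array of 64-bit words left by one bit.
    static void leftShiftBy1(long[] z) {
        long carryIn = 0;                   // bit shifted out of the next less significant word
        for (int i = z.length - 1; i >= 0; i--) {
            long v = z[i];
            z[i] = (v << 1) | carryIn;
            carryIn = v >>> 63;             // the top bit moves up into the next word
        }
    }
}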
/**
* Code for BigInteger::squareToLen() intrinsic
*
* rdi: x
* rsi: len
* r8: z
* rcx: zlen
* r12: tmp1
* r13: tmp2
* r14: tmp3
* r15: tmp4
* rbx: tmp5
*
*/
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
push(tmp1);
push(tmp2);
push(tmp3);
push(tmp4);
push(tmp5);
// First loop
// Store the squares, right shifted one bit (i.e., divided by 2).
square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
// Add in off-diagonal sums.
//
// Second, third (nested) and fourth loops.
// zlen +=2;
// for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
// carry = 0;
// long op2 = x[xidx:xidx+1];
// for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
// k -= 2;
// long op1 = x[j:j+1];
// long sum = z[k:k+1];
// carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
// z[k:k+1] = sum;
// }
// add_one_64(z, k, carry, tmp_regs);
// }
const Register carry = tmp5;
const Register sum = tmp3;
const Register op1 = tmp4;
Register op2 = tmp2;
push(zlen);
push(len);
addl(zlen,2);
bind(L_second_loop);
xorq(carry, carry);
subl(zlen, 4);
subl(len, 2);
push(zlen);
push(len);
cmpl(len, 0);
jccb(Assembler::lessEqual, L_second_loop_exit);
// Multiply an array by one 64 bit long.
if (UseBMI2Instructions) {
op2 = rdxReg;
movq(op2, Address(x, len, Address::times_4, 0));
rorxq(op2, op2, 32);
}
else {
movq(op2, Address(x, len, Address::times_4, 0));
rorq(op2, 32);
}
bind(L_third_loop);
decrementl(len);
jccb(Assembler::negative, L_third_loop_exit);
decrementl(len);
jccb(Assembler::negative, L_last_x);
movq(op1, Address(x, len, Address::times_4, 0));
rorq(op1, 32);
bind(L_multiply);
subl(zlen, 2);
movq(sum, Address(z, zlen, Address::times_4, 0));
// Multiply 64 bit by 64 bit; add the lower 64 bits of the product into sum and keep the upper 64 bits as the carry.
if (UseBMI2Instructions) {
multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
}
else {
multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
}
movq(Address(z, zlen, Address::times_4, 0), sum);
jmp(L_third_loop);
bind(L_third_loop_exit);
// Fourth loop
// Add 64 bit long carry into z with carry propagation.
// Uses offsetted zlen.
add_one_64(z, zlen, carry, tmp1);
pop(len);
pop(zlen);
jmp(L_second_loop);
// Next infrequent code is moved outside loops.
bind(L_last_x);
movl(op1, Address(x, 0));
jmp(L_multiply);
bind(L_second_loop_exit);
pop(len);
pop(zlen);
pop(len);
pop(zlen);
// Fifth loop
// Shift z left 1 bit.
lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
// z[zlen-1] |= x[len-1] & 1;
movl(tmp3, Address(x, len, Address::times_4, -4));
andl(tmp3, 1);
orl(Address(z, zlen, Address::times_4, -4), tmp3);
pop(tmp5);
pop(tmp4);
pop(tmp3);
pop(tmp2);
pop(tmp1);
}
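A self-contained BigInteger sanity check of the decomposition the five phases rely on: phase one stores the halved sum of the aligned squares, phases two to four add the off-diagonal products, and phase five doubles the result and restores the low bit. This is an illustrative sketch, assuming 64-bit most-significant-first limbs; the class and variable names are invented.
import java.math.BigInteger;
import java.util.Random;
public class SquareDecompositionCheck {
    public static void main(String[] args) {
        BigInteger B = BigInteger.ONE.shiftLeft(64);
        int n = 4;                                             // number of 64-bit limbs
        BigInteger x = new BigInteger(64 * n - 1, new Random(42));
        // Split x into most-significant-first 64-bit limbs.
        BigInteger[] limb = new BigInteger[n];
        BigInteger t = x;
        for (int i = n - 1; i >= 0; i--) { limb[i] = t.mod(B); t = t.shiftRight(64); }
        // diag = sum of aligned squares, off = sum of distinct cross products.
        BigInteger diag = BigInteger.ZERO, off = BigInteger.ZERO;
        for (int i = 0; i < n; i++) {
            diag = diag.add(limb[i].multiply(limb[i]).shiftLeft(128 * (n - 1 - i)));
            for (int j = i + 1; j < n; j++) {
                off = off.add(limb[i].multiply(limb[j])
                        .shiftLeft(64 * ((n - 1 - i) + (n - 1 - j))));
            }
        }
        // (diag >> 1) + off, doubled, with the low bit of x restored, equals x*x.
        BigInteger rebuilt = diag.shiftRight(1).add(off).shiftLeft(1)
                                 .or(x.and(BigInteger.ONE));
        System.out.println(rebuilt.equals(x.multiply(x)));     // expect: true
    }
}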
/**
* Helper function for mul_add()
* Multiply in[] by int k and add to out[] starting at offset offs, using
* 128 bit by 32 bit multiplies, and return the carry in tmp5.
* Only the quad-int-aligned portion of in[] is processed by this function.
* k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
* This function preserves the out, in and k registers.
* len and offset point to the appropriate indices in "in" and "out" respectively.
* tmp5 holds the carry.
* The other registers are temporaries and are modified.
*
*/
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
Label L_first_loop, L_first_loop_exit;
movl(tmp1, len);
shrl(tmp1, 2);
bind(L_first_loop);
subl(tmp1, 1);
jccb(Assembler::negative, L_first_loop_exit);
subl(len, 4);
subl(offset, 4);
Register op2 = tmp2;
const Register sum = tmp3;
const Register op1 = tmp4;
const Register carry = tmp5;
if (UseBMI2Instructions) {
op2 = rdxReg;
}
movq(op1, Address(in, len, Address::times_4, 8));
rorq(op1, 32);
movq(sum, Address(out, offset, Address::times_4, 8));
rorq(sum, 32);
if (UseBMI2Instructions) {
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
}
else {
multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
}
// Store back in big endian from little endian
rorq(sum, 0x20);
movq(Address(out, offset, Address::times_4, 8), sum);
movq(op1, Address(in, len, Address::times_4, 0));
rorq(op1, 32);
movq(sum, Address(out, offset, Address::times_4, 0));
rorq(sum, 32);
if (UseBMI2Instructions) {
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
}
else {
multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
}
// Store back in big endian from little endian
rorq(sum, 0x20);
movq(Address(out, offset, Address::times_4, 0), sum);
jmp(L_first_loop);
bind(L_first_loop_exit);
}
/**
* Code for BigInteger::mulAdd() intrinsic
*
* rdi: out
* rsi: in
* r11: offs (out.length - offset)
* rcx: len
* r8: k
* r12: tmp1
* r13: tmp2
* r14: tmp3
* r15: tmp4
* rbx: tmp5
* Multiply in[] by word k and add to out[]; return the carry in rax
*/
void MacroAssembler::mul_add(Register out, Register in, Register offs,
Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
Label L_carry, L_last_in, L_done;
// carry = 0;
// for (int j=len-1; j >= 0; j--) {
// long product = (in[j] & LONG_MASK) * kLong +
// (out[offs] & LONG_MASK) + carry;
// out[offs--] = (int)product;
// carry = product >>> 32;
// }
//
push(tmp1);
push(tmp2);
push(tmp3);
push(tmp4);
push(tmp5);
Register op2 = tmp2;
const Register sum = tmp3;
const Register op1 = tmp4;
const Register carry = tmp5;
if (UseBMI2Instructions) {
op2 = rdxReg;
movl(op2, k);
}
else {
movl(op2, k);
}
xorq(carry, carry);
//First loop
//Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
//The carry is in tmp5
mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
//Multiply the trailing in[] entry using 64 bit by 32 bit, if any
decrementl(len);
jccb(Assembler::negative, L_carry);
decrementl(len);
jccb(Assembler::negative, L_last_in);
movq(op1, Address(in, len, Address::times_4, 0));
rorq(op1, 32);
subl(offs, 2);
movq(sum, Address(out, offs, Address::times_4, 0));
rorq(sum, 32);
if (UseBMI2Instructions) {
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
}
else {
multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
}
// Store back in big endian from little endian
rorq(sum, 0x20);
movq(Address(out, offs, Address::times_4, 0), sum);
testl(len, len);
jccb(Assembler::zero, L_carry);
//Multiply the last in[] entry, if any
bind(L_last_in);
movl(op1, Address(in, 0));
movl(sum, Address(out, offs, Address::times_4, -4));
movl(raxReg, k);
mull(op1); //tmp4 * eax -> edx:eax
addl(sum, carry);
adcl(rdxReg, 0);
addl(sum, raxReg);
adcl(rdxReg, 0);
movl(carry, rdxReg);
movl(Address(out, offs, Address::times_4, -4), sum);
bind(L_carry);
//return tmp5/carry as carry in rax
movl(rax, carry);
bind(L_done);
pop(tmp5);
pop(tmp4);
pop(tmp3);
pop(tmp2);
pop(tmp1);
}
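The scalar loop described in the comment at the top of mul_add (and unrolled four words at a time by mul_add_128_x_32_loop) corresponds to the following plain-Java rendering, written only to make the 32-bit word semantics explicit; mulAddRef and its indexing follow the pseudocode above and are not claimed to be the BigInteger source.
class MulAddSketch {
    private static final long LONG_MASK = 0xffffffffL;
    // out[offs-len+1 .. offs] += in[0 .. len-1] * k over 32-bit words
    // (most significant first); returns the final carry word.
    static int mulAddRef(int[] out, int[] in, int offs, int len, int k) {
        long kLong = k & LONG_MASK;
        long carry = 0;
        for (int j = len - 1; j >= 0; j--) {
            long product = (in[j] & LONG_MASK) * kLong + (out[offs] & LONG_MASK) + carry;
            out[offs--] = (int) product;
            carry = product >>> 32;
        }
        return (int) carry;
    }
}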
#endif
/**
......
......@@ -1241,6 +1241,25 @@ public:
Register carry2);
void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
Register tmp2);
void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
Register rdxReg, Register raxReg);
void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
Register tmp3, Register tmp4);
void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
Register raxReg);
void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
Register raxReg);
#endif
// CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
......
......@@ -23,6 +23,9 @@
*/
#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
......@@ -3966,6 +3969,256 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
}
//------------------------------Montgomery multiplication------------------------
//
#ifndef _WINDOWS
#define ASM_SUBTRACT
#ifdef ASM_SUBTRACT
// Subtract 0:b from carry:a. Return carry.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
long i = 0, cnt = len;
unsigned long tmp;
asm volatile("clc; "
"0: ; "
"mov (%[b], %[i], 8), %[tmp]; "
"sbb %[tmp], (%[a], %[i], 8); "
"inc %[i]; dec %[cnt]; "
"jne 0b; "
"mov %[carry], %[tmp]; sbb $0, %[tmp]; "
: [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
: [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
: "memory");
return tmp;
}
#else // ASM_SUBTRACT
typedef int __attribute__((mode(TI))) int128;
// Subtract 0:b from carry:a. Return carry.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
int128 tmp = 0;
int i;
for (i = 0; i < len; i++) {
tmp += a[i];
tmp -= b[i];
a[i] = tmp;
tmp >>= 64;
assert(-1 <= tmp && tmp <= 0, "invariant");
}
return tmp + carry;
}
#endif // ! ASM_SUBTRACT
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2) \
do { \
unsigned long hi, lo; \
asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
: "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
: "r"(A), "a"(B) : "cc"); \
} while(0)
// As above, but add twice the double-length result into the
// accumulator.
#define MACC2(A, B, T0, T1, T2) \
do { \
unsigned long hi, lo; \
asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4;" \
"add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
: "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
: "r"(A), "a"(B) : "cc"); \
} while(0)
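A plain-Java model of what one MACC/MACC2 step does to the triple-precision accumulator (t0, t1, t2), using BigInteger in place of the mul/adc chain; class and helper names are invented for illustration:
import java.math.BigInteger;
class MaccModel {
    static final BigInteger U64 = BigInteger.ONE.shiftLeft(64).subtract(BigInteger.ONE);
    static BigInteger u64(long v) { return BigInteger.valueOf(v).and(U64); }
    // t2:t1:t0 += factor * a * b, with factor 1 for MACC and 2 for MACC2.
    // Returns the new {t0, t1, t2}; any carry out of t2 is dropped, as in the asm.
    static long[] macc(long a, long b, long t0, long t1, long t2, int factor) {
        BigInteger acc = u64(t2).shiftLeft(128).or(u64(t1).shiftLeft(64)).or(u64(t0));
        acc = acc.add(u64(a).multiply(u64(b)).multiply(BigInteger.valueOf(factor)));
        return new long[] { acc.longValue(),
                            acc.shiftRight(64).longValue(),
                            acc.shiftRight(128).longValue() };
    }
}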
// Fast Montgomery multiplication. The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
static void __attribute__((noinline))
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
unsigned long m[], unsigned long inv, int len) {
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
for (i = 0; i < len; i++) {
int j;
for (j = 0; j < i; j++) {
MACC(a[j], b[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
MACC(a[i], b[0], t0, t1, t2);
m[i] = t0 * inv;
MACC(m[i], n[0], t0, t1, t2);
assert(t0 == 0, "broken Montgomery multiply");
t0 = t1; t1 = t2; t2 = 0;
}
for (i = len; i < 2*len; i++) {
int j;
for (j = i-len+1; j < len; j++) {
MACC(a[j], b[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
m[i-len] = t0;
t0 = t1; t1 = t2; t2 = 0;
}
while (t0)
t0 = sub(m, n, t0, len);
}
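For orientation, the value this routine produces (up to the final conditional subtraction) and the precondition the assert checks can be stated in plain BigInteger arithmetic. The sketch below uses arbitrary numbers and only assumes an odd modulus; it is an illustration, not part of the intrinsic.
import java.math.BigInteger;
public class MontgomeryInvariantDemo {
    public static void main(String[] args) {
        BigInteger W = BigInteger.ONE.shiftLeft(64);           // word size
        int len = 2;
        BigInteger R = BigInteger.ONE.shiftLeft(64 * len);     // R = 2^(64*len)
        BigInteger n = new BigInteger("f123456789abcdef0123456789abcdef", 16); // odd modulus
        BigInteger a = new BigInteger("0fedcba9876543210fedcba987654321", 16);
        BigInteger b = new BigInteger("123456789abcdef0123456789abcdef", 16);
        // inv = -n^-1 mod 2^64, i.e. inv * n == -1 (mod 2^64): the assert's precondition.
        BigInteger inv = n.modInverse(W).negate().mod(W);
        System.out.println(inv.multiply(n).mod(W).equals(W.subtract(BigInteger.ONE))); // true
        // The Montgomery product is congruent to a * b * R^-1 mod n.
        BigInteger m = a.multiply(b).multiply(R.modInverse(n)).mod(n);
        System.out.println(m.toString(16));
    }
}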
// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
static void __attribute__((noinline))
montgomery_square(unsigned long a[], unsigned long n[],
unsigned long m[], unsigned long inv, int len) {
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
for (i = 0; i < len; i++) {
int j;
int end = (i+1)/2;
for (j = 0; j < end; j++) {
MACC2(a[j], a[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
if ((i & 1) == 0) {
MACC(a[j], a[j], t0, t1, t2);
}
for (; j < i; j++) {
MACC(m[j], n[i-j], t0, t1, t2);
}
m[i] = t0 * inv;
MACC(m[i], n[0], t0, t1, t2);
assert(t0 == 0, "broken Montgomery square");
t0 = t1; t1 = t2; t2 = 0;
}
for (i = len; i < 2*len; i++) {
int start = i-len+1;
int end = start + (len - start)/2;
int j;
for (j = start; j < end; j++) {
MACC2(a[j], a[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
if ((i & 1) == 0) {
MACC(a[j], a[j], t0, t1, t2);
}
for (; j < len; j++) {
MACC(m[j], n[i-j], t0, t1, t2);
}
m[i-len] = t0;
t0 = t1; t1 = t2; t2 = 0;
}
while (t0)
t0 = sub(m, n, t0, len);
}
// Swap words in a longword.
static unsigned long swap(unsigned long x) {
return (x << 32) | (x >> 32);
}
// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
static void reverse_words(unsigned long *s, unsigned long *d, int len) {
d += len;
while(len-- > 0) {
d--;
*d = swap(*s);
s++;
}
}
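Seen from the Java side, reverse_words converts BigInteger's big-endian int[] magnitude into the little-endian array of 64-bit words the routines above consume. A rough Java equivalent, assuming an even-length mag array as the callers guarantee (names invented for illustration):
class ReverseWordsSketch {
    // mag: big-endian 32-bit words (most significant first, as in BigInteger.mag).
    // Returns little-endian 64-bit words (least significant first).
    static long[] reverseWords(int[] mag) {
        long[] out = new long[mag.length / 2];
        for (int i = 0; i < out.length; i++) {
            long hi = mag[2 * i] & 0xffffffffL;      // more significant half of the pair
            long lo = mag[2 * i + 1] & 0xffffffffL;
            out[out.length - 1 - i] = (hi << 32) | lo;
        }
        return out;
    }
}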
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
#define MONTGOMERY_SQUARING_THRESHOLD 64
void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
jint len, jlong inv,
jint *m_ints) {
assert(len % 2 == 0, "array length in montgomery_multiply must be even");
int longwords = len/2;
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints corresponds to a 16384-bit integer and
// will use a total of 8k bytes of stack space here.
int total_allocation = longwords * sizeof (unsigned long) * 4;
guarantee(total_allocation <= 8192, "must be");
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
// Local scratch arrays
unsigned long
*a = scratch + 0 * longwords,
*b = scratch + 1 * longwords,
*n = scratch + 2 * longwords,
*m = scratch + 3 * longwords;
reverse_words((unsigned long *)a_ints, a, longwords);
reverse_words((unsigned long *)b_ints, b, longwords);
reverse_words((unsigned long *)n_ints, n, longwords);
::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
reverse_words(m, (unsigned long *)m_ints, longwords);
}
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
jint len, jlong inv,
jint *m_ints) {
assert(len % 2 == 0, "array length in montgomery_square must be even");
int longwords = len/2;
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints corresponds to a 16384-bit integer and
// will use a total of 6k bytes of stack space here.
int total_allocation = longwords * sizeof (unsigned long) * 3;
guarantee(total_allocation <= 8192, "must be");
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
// Local scratch arrays
unsigned long
*a = scratch + 0 * longwords,
*n = scratch + 1 * longwords,
*m = scratch + 2 * longwords;
reverse_words((unsigned long *)a_ints, a, longwords);
reverse_words((unsigned long *)n_ints, n, longwords);
// montgomery_square fails to pass BigIntegerTest on Solaris amd64
// on jdk7 and jdk8.
#ifndef SOLARIS
if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
#else
if (0) {
#endif
::montgomery_square(a, n, m, (unsigned long)inv, longwords);
} else {
::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
}
reverse_words(m, (unsigned long *)m_ints, longwords);
}
#endif // WINDOWS
#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
......
......@@ -3743,6 +3743,107 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
/**
* Arguments:
*
* Input:
*   c_rarg0 - x address
*   c_rarg1 - x length
*   c_rarg2 - z address
*   c_rarg3 - z length
*
*/
address generate_squareToLen() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "squareToLen");
address start = __ pc();
// Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
// Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
const Register x = rdi;
const Register len = rsi;
const Register z = r8;
const Register zlen = rcx;
const Register tmp1 = r12;
const Register tmp2 = r13;
const Register tmp3 = r14;
const Register tmp4 = r15;
const Register tmp5 = rbx;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
// zlen => rcx
// r9 and r10 may be used to save non-volatile registers
__ movptr(r8, rdx);
__ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
restore_arg_regs();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
/**
* Arguments:
*
* Input:
* c_rarg0 - out address
* c_rarg1 - in address
* c_rarg2 - offset
* c_rarg3 - len
* not Win64
* c_rarg4 - k
* Win64
* rsp+40 - k
*/
address generate_mulAdd() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "mulAdd");
address start = __ pc();
// Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
// Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
const Register out = rdi;
const Register in = rsi;
const Register offset = r11;
const Register len = rcx;
const Register k = r8;
// Next registers will be saved on stack in mul_add().
const Register tmp1 = r12;
const Register tmp2 = r13;
const Register tmp3 = r14;
const Register tmp4 = r15;
const Register tmp5 = rbx;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
// len => rcx, k => r8
// r9 and r10 may be used to save non-volatile registers
#ifdef _WIN64
// last argument is on stack on Win64
__ movl(k, Address(rsp, 6 * wordSize));
#endif
__ movptr(r11, rdx); // move offset in rdx to offset(r11)
__ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
restore_arg_regs();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
#undef __
#define __ masm->
......@@ -3987,7 +4088,24 @@ class StubGenerator: public StubCodeGenerator {
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
#endif
if (UseSquareToLenIntrinsic) {
StubRoutines::_squareToLen = generate_squareToLen();
}
if (UseMulAddIntrinsic) {
StubRoutines::_mulAdd = generate_mulAdd();
}
#ifndef _WINDOWS
if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
}
if (UseMontgomerySquareIntrinsic) {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
}
#endif // WINDOWS
#endif // COMPILER2
}
public:
......
......@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _
enum platform_dependent_constants {
code_size1 = 19000, // simply increase if too small (assembler will crash if too small)
code_size2 = 22000 // simply increase if too small (assembler will crash if too small)
code_size2 = 23000 // simply increase if too small (assembler will crash if too small)
};
class x86 {
......
......@@ -703,6 +703,18 @@ void VM_Version::get_processor_features() {
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
UseMultiplyToLenIntrinsic = true;
}
if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
UseSquareToLenIntrinsic = false;
}
if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
UseMulAddIntrinsic = false;
}
if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
UseMontgomeryMultiplyIntrinsic = false;
}
if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
UseMontgomerySquareIntrinsic = false;
}
#else
if (UseMultiplyToLenIntrinsic) {
if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
......@@ -710,6 +722,30 @@ void VM_Version::get_processor_features() {
}
FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
}
if (UseSquareToLenIntrinsic) {
if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
warning("squareToLen intrinsic is not available in 32-bit VM");
}
FLAG_SET_DEFAULT(UseSquareToLenIntrinsic, false);
}
if (UseMulAddIntrinsic) {
if (!FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
warning("mulAdd intrinsic is not available in 32-bit VM");
}
FLAG_SET_DEFAULT(UseMulAddIntrinsic, false);
}
if (UseMontgomeryMultiplyIntrinsic) {
if (!FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
warning("montgomeryMultiply intrinsic is not available in 32-bit VM");
}
FLAG_SET_DEFAULT(UseMontgomeryMultiplyIntrinsic, false);
}
if (UseMontgomerySquareIntrinsic) {
if (!FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
warning("montgomerySquare intrinsic is not available in 32-bit VM");
}
FLAG_SET_DEFAULT(UseMontgomerySquareIntrinsic, false);
}
#endif
#endif // COMPILER2
......
......@@ -793,10 +793,26 @@
do_signature(encodeISOArray_signature, "([CI[BII)I") \
\
do_class(java_math_BigInteger, "java/math/BigInteger") \
do_intrinsic(_multiplyToLen, java_math_BigInteger, multiplyToLen_name, multiplyToLen_signature, F_R) \
do_intrinsic(_multiplyToLen, java_math_BigInteger, multiplyToLen_name, multiplyToLen_signature, F_S) \
do_name( multiplyToLen_name, "multiplyToLen") \
do_signature(multiplyToLen_signature, "([II[II[I)[I") \
\
do_intrinsic(_squareToLen, java_math_BigInteger, squareToLen_name, squareToLen_signature, F_S) \
do_name( squareToLen_name, "implSquareToLen") \
do_signature(squareToLen_signature, "([II[II)[I") \
\
do_intrinsic(_mulAdd, java_math_BigInteger, mulAdd_name, mulAdd_signature, F_S) \
do_name( mulAdd_name, "implMulAdd") \
do_signature(mulAdd_signature, "([I[IIII)I") \
\
do_intrinsic(_montgomeryMultiply, java_math_BigInteger, montgomeryMultiply_name, montgomeryMultiply_signature, F_S) \
do_name( montgomeryMultiply_name, "implMontgomeryMultiply") \
do_signature(montgomeryMultiply_signature, "([I[I[IIJ[I)[I") \
\
do_intrinsic(_montgomerySquare, java_math_BigInteger, montgomerySquare_name, montgomerySquare_signature, F_S) \
do_name( montgomerySquare_name, "implMontgomerySquare") \
do_signature(montgomerySquare_signature, "([I[IIJ[I)[I") \
\
/* java/lang/ref/Reference */ \
do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \
\
......
......@@ -662,6 +662,18 @@
product(bool, UseMultiplyToLenIntrinsic, false, \
"Enables intrinsification of BigInteger.multiplyToLen()") \
\
product(bool, UseSquareToLenIntrinsic, false, \
"Enables intrinsification of BigInteger.squareToLen()") \
\
product(bool, UseMulAddIntrinsic, false, \
"Enables intrinsification of BigInteger.mulAdd()") \
\
product(bool, UseMontgomeryMultiplyIntrinsic, false, \
"Enables intrinsification of BigInteger.montgomeryMultiply()") \
\
product(bool, UseMontgomerySquareIntrinsic, false, \
"Enables intrinsification of BigInteger.montgomerySquare()") \
\
product(bool, UseTypeSpeculation, true, \
"Speculatively propagate types from profiles") \
\
......
......@@ -958,8 +958,12 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 ||
strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0)
))) {
strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 ||
strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0)
))) {
call->dump();
fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
}
......
......@@ -324,6 +324,10 @@ class LibraryCallKit : public GraphKit {
bool inline_updateBytesCRC32();
bool inline_updateByteBufferCRC32();
bool inline_multiplyToLen();
bool inline_squareToLen();
bool inline_mulAdd();
bool inline_montgomeryMultiply();
bool inline_montgomerySquare();
bool inline_profileBoolean();
};
......@@ -527,6 +531,21 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
if (!UseMultiplyToLenIntrinsic) return NULL;
break;
case vmIntrinsics::_squareToLen:
if (!UseSquareToLenIntrinsic) return NULL;
break;
case vmIntrinsics::_mulAdd:
if (!UseMulAddIntrinsic) return NULL;
break;
case vmIntrinsics::_montgomeryMultiply:
if (!UseMontgomeryMultiplyIntrinsic) return NULL;
break;
case vmIntrinsics::_montgomerySquare:
if (!UseMontgomerySquareIntrinsic) return NULL;
break;
case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
if (!UseAESIntrinsics) return NULL;
......@@ -927,6 +946,17 @@ bool LibraryCallKit::try_to_inline(int predicate) {
case vmIntrinsics::_multiplyToLen:
return inline_multiplyToLen();
case vmIntrinsics::_squareToLen:
return inline_squareToLen();
case vmIntrinsics::_mulAdd:
return inline_mulAdd();
case vmIntrinsics::_montgomeryMultiply:
return inline_montgomeryMultiply();
case vmIntrinsics::_montgomerySquare:
return inline_montgomerySquare();
case vmIntrinsics::_encodeISOArray:
return inline_encodeISOArray();
......@@ -5767,11 +5797,12 @@ bool LibraryCallKit::inline_multiplyToLen() {
assert(callee()->signature()->size() == 5, "multiplyToLen has 5 parameters");
Node* x = argument(1);
Node* xlen = argument(2);
Node* y = argument(3);
Node* ylen = argument(4);
Node* z = argument(5);
// no receiver because it is a static method
Node* x = argument(0);
Node* xlen = argument(1);
Node* y = argument(2);
Node* ylen = argument(3);
Node* z = argument(4);
const Type* x_type = x->Value(&_gvn);
const Type* y_type = y->Value(&_gvn);
......@@ -5856,6 +5887,215 @@ bool LibraryCallKit::inline_multiplyToLen() {
return true;
}
//-------------inline_squareToLen------------------------------------
bool LibraryCallKit::inline_squareToLen() {
assert(UseSquareToLenIntrinsic, "not implemented on this platform");
address stubAddr = StubRoutines::squareToLen();
if (stubAddr == NULL) {
return false; // Intrinsic's stub is not implemented on this platform
}
const char* stubName = "squareToLen";
assert(callee()->signature()->size() == 4, "implSquareToLen has 4 parameters");
Node* x = argument(0);
Node* len = argument(1);
Node* z = argument(2);
Node* zlen = argument(3);
const Type* x_type = x->Value(&_gvn);
const Type* z_type = z->Value(&_gvn);
const TypeAryPtr* top_x = x_type->isa_aryptr();
const TypeAryPtr* top_z = z_type->isa_aryptr();
if (top_x == NULL || top_x->klass() == NULL ||
top_z == NULL || top_z->klass() == NULL) {
// failed array check
return false;
}
BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
BasicType z_elem = z_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
if (x_elem != T_INT || z_elem != T_INT) {
return false;
}
Node* x_start = array_element_address(x, intcon(0), x_elem);
Node* z_start = array_element_address(z, intcon(0), z_elem);
Node* call = make_runtime_call(RC_LEAF|RC_NO_FP,
OptoRuntime::squareToLen_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
x_start, len, z_start, zlen);
set_result(z);
return true;
}
//-------------inline_mulAdd------------------------------------------
bool LibraryCallKit::inline_mulAdd() {
assert(UseMulAddIntrinsic, "not implemented on this platform");
address stubAddr = StubRoutines::mulAdd();
if (stubAddr == NULL) {
return false; // Intrinsic's stub is not implemented on this platform
}
const char* stubName = "mulAdd";
assert(callee()->signature()->size() == 5, "mulAdd has 5 parameters");
Node* out = argument(0);
Node* in = argument(1);
Node* offset = argument(2);
Node* len = argument(3);
Node* k = argument(4);
const Type* out_type = out->Value(&_gvn);
const Type* in_type = in->Value(&_gvn);
const TypeAryPtr* top_out = out_type->isa_aryptr();
const TypeAryPtr* top_in = in_type->isa_aryptr();
if (top_out == NULL || top_out->klass() == NULL ||
top_in == NULL || top_in->klass() == NULL) {
// failed array check
return false;
}
BasicType out_elem = out_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
BasicType in_elem = in_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
if (out_elem != T_INT || in_elem != T_INT) {
return false;
}
Node* outlen = load_array_length(out);
Node* new_offset = _gvn.transform(new (C) SubINode(outlen, offset));
Node* out_start = array_element_address(out, intcon(0), out_elem);
Node* in_start = array_element_address(in, intcon(0), in_elem);
Node* call = make_runtime_call(RC_LEAF|RC_NO_FP,
OptoRuntime::mulAdd_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
out_start,in_start, new_offset, len, k);
Node* result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms));
set_result(result);
return true;
}
//-------------inline_montgomeryMultiply-----------------------------------
bool LibraryCallKit::inline_montgomeryMultiply() {
address stubAddr = StubRoutines::montgomeryMultiply();
if (stubAddr == NULL) {
return false; // Intrinsic's stub is not implemented on this platform
}
assert(UseMontgomeryMultiplyIntrinsic, "not implemented on this platform");
const char* stubName = "montgomery_multiply";
assert(callee()->signature()->size() == 7, "montgomeryMultiply has 7 parameters");
Node* a = argument(0);
Node* b = argument(1);
Node* n = argument(2);
Node* len = argument(3);
Node* inv = argument(4);
Node* m = argument(6);
const Type* a_type = a->Value(&_gvn);
const TypeAryPtr* top_a = a_type->isa_aryptr();
const Type* b_type = b->Value(&_gvn);
const TypeAryPtr* top_b = b_type->isa_aryptr();
const Type* n_type = a->Value(&_gvn);
const TypeAryPtr* top_n = n_type->isa_aryptr();
const Type* m_type = a->Value(&_gvn);
const TypeAryPtr* top_m = m_type->isa_aryptr();
if (top_a == NULL || top_a->klass() == NULL ||
top_b == NULL || top_b->klass() == NULL ||
top_n == NULL || top_n->klass() == NULL ||
top_m == NULL || top_m->klass() == NULL) {
// failed array check
return false;
}
BasicType a_elem = a_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
BasicType b_elem = b_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
BasicType n_elem = n_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
BasicType m_elem = m_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
if (a_elem != T_INT || b_elem != T_INT || n_elem != T_INT || m_elem != T_INT) {
return false;
}
// Make the call
{
Node* a_start = array_element_address(a, intcon(0), a_elem);
Node* b_start = array_element_address(b, intcon(0), b_elem);
Node* n_start = array_element_address(n, intcon(0), n_elem);
Node* m_start = array_element_address(m, intcon(0), m_elem);
Node* call = make_runtime_call(RC_LEAF,
OptoRuntime::montgomeryMultiply_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
a_start, b_start, n_start, len, inv, top(),
m_start);
set_result(m);
}
return true;
}
bool LibraryCallKit::inline_montgomerySquare() {
address stubAddr = StubRoutines::montgomerySquare();
if (stubAddr == NULL) {
return false; // Intrinsic's stub is not implemented on this platform
}
assert(UseMontgomerySquareIntrinsic, "not implemented on this platform");
const char* stubName = "montgomery_square";
assert(callee()->signature()->size() == 6, "montgomerySquare has 6 parameters");
Node* a = argument(0);
Node* n = argument(1);
Node* len = argument(2);
Node* inv = argument(3);
Node* m = argument(5);
const Type* a_type = a->Value(&_gvn);
const TypeAryPtr* top_a = a_type->isa_aryptr();
const Type* n_type = a->Value(&_gvn);
const TypeAryPtr* top_n = n_type->isa_aryptr();
const Type* m_type = a->Value(&_gvn);
const TypeAryPtr* top_m = m_type->isa_aryptr();
if (top_a == NULL || top_a->klass() == NULL ||
top_n == NULL || top_n->klass() == NULL ||
top_m == NULL || top_m->klass() == NULL) {
// failed array check
return false;
}
BasicType a_elem = a_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
BasicType n_elem = n_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
BasicType m_elem = m_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
if (a_elem != T_INT || n_elem != T_INT || m_elem != T_INT) {
return false;
}
// Make the call
{
Node* a_start = array_element_address(a, intcon(0), a_elem);
Node* n_start = array_element_address(n, intcon(0), n_elem);
Node* m_start = array_element_address(m, intcon(0), m_elem);
Node* call = make_runtime_call(RC_LEAF,
OptoRuntime::montgomerySquare_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
a_start, n_start, len, inv, top(),
m_start);
set_result(m);
}
return true;
}
/**
* Calculate CRC32 for byte.
......
......@@ -956,6 +956,94 @@ const TypeFunc* OptoRuntime::multiplyToLen_Type() {
return TypeFunc::make(domain, range);
}
const TypeFunc* OptoRuntime::squareToLen_Type() {
// create input type (domain)
int num_args = 4;
int argcnt = num_args;
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypePtr::NOTNULL; // x
fields[argp++] = TypeInt::INT; // len
fields[argp++] = TypePtr::NOTNULL; // z
fields[argp++] = TypeInt::INT; // zlen
assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
// no result type needed
fields = TypeTuple::fields(1);
fields[TypeFunc::Parms+0] = NULL;
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
return TypeFunc::make(domain, range);
}
// for mulAdd calls, 2 pointers and 3 ints, returning int
const TypeFunc* OptoRuntime::mulAdd_Type() {
// create input type (domain)
int num_args = 5;
int argcnt = num_args;
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypePtr::NOTNULL; // out
fields[argp++] = TypePtr::NOTNULL; // in
fields[argp++] = TypeInt::INT; // offset
fields[argp++] = TypeInt::INT; // len
fields[argp++] = TypeInt::INT; // k
assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
// returning carry (int)
fields = TypeTuple::fields(1);
fields[TypeFunc::Parms+0] = TypeInt::INT;
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields);
return TypeFunc::make(domain, range);
}
const TypeFunc* OptoRuntime::montgomeryMultiply_Type() {
// create input type (domain)
int num_args = 7;
int argcnt = num_args;
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypePtr::NOTNULL; // a
fields[argp++] = TypePtr::NOTNULL; // b
fields[argp++] = TypePtr::NOTNULL; // n
fields[argp++] = TypeInt::INT; // len
fields[argp++] = TypeLong::LONG; // inv
fields[argp++] = Type::HALF;
fields[argp++] = TypePtr::NOTNULL; // result
assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
// result type needed
fields = TypeTuple::fields(1);
fields[TypeFunc::Parms+0] = TypePtr::NOTNULL;
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
return TypeFunc::make(domain, range);
}
const TypeFunc* OptoRuntime::montgomerySquare_Type() {
// create input type (domain)
int num_args = 6;
int argcnt = num_args;
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypePtr::NOTNULL; // a
fields[argp++] = TypePtr::NOTNULL; // n
fields[argp++] = TypeInt::INT; // len
fields[argp++] = TypeLong::LONG; // inv
fields[argp++] = Type::HALF;
fields[argp++] = TypePtr::NOTNULL; // result
assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
// result type needed
fields = TypeTuple::fields(1);
fields[TypeFunc::Parms+0] = TypePtr::NOTNULL;
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
return TypeFunc::make(domain, range);
}
//------------- Interpreter state access for on stack replacement
......
......@@ -305,6 +305,12 @@ private:
static const TypeFunc* multiplyToLen_Type();
static const TypeFunc* squareToLen_Type();
static const TypeFunc* mulAdd_Type();
static const TypeFunc* montgomeryMultiply_Type();
static const TypeFunc* montgomerySquare_Type();
static const TypeFunc* updateBytesCRC32_Type();
// leaf on stack replacement interpreter accessor types
......
......@@ -145,6 +145,12 @@ class SharedRuntime: AllStatic {
static double dsqrt(double f);
#endif
// Montgomery multiplication
static void montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
jint len, jlong inv, jint *m_ints);
static void montgomery_square(jint *a_ints, jint *n_ints,
jint len, jlong inv, jint *m_ints);
#ifdef __SOFTFP__
// C++ compiler generates soft float instructions as well as passing
// float and double in registers.
......
......@@ -136,6 +136,10 @@ address StubRoutines::_updateBytesCRC32 = NULL;
address StubRoutines::_crc_table_adr = NULL;
address StubRoutines::_multiplyToLen = NULL;
address StubRoutines::_squareToLen = NULL;
address StubRoutines::_mulAdd = NULL;
address StubRoutines::_montgomeryMultiply = NULL;
address StubRoutines::_montgomerySquare = NULL;
double (* StubRoutines::_intrinsic_log )(double) = NULL;
double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
......
......@@ -209,6 +209,10 @@ class StubRoutines: AllStatic {
static address _crc_table_adr;
static address _multiplyToLen;
static address _squareToLen;
static address _mulAdd;
static address _montgomeryMultiply;
static address _montgomerySquare;
// These are versions of the java.lang.Math methods which perform
// the same operations as the intrinsic version. They are used for
......@@ -367,6 +371,10 @@ class StubRoutines: AllStatic {
static address crc_table_addr() { return _crc_table_adr; }
static address multiplyToLen() {return _multiplyToLen; }
static address squareToLen() {return _squareToLen; }
static address mulAdd() {return _mulAdd; }
static address montgomeryMultiply() { return _montgomeryMultiply; }
static address montgomerySquare() { return _montgomerySquare; }
static address select_fill_function(BasicType t, bool aligned, const char* &name);
......
......@@ -813,6 +813,8 @@ typedef TwoOopHashtable<Symbol*, mtClass> SymbolTwoOopHashtable;
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _multiplyToLen, address) \
static_field(StubRoutines, _squareToLen, address) \
static_field(StubRoutines, _mulAdd, address) \
\
/*****************/ \
/* SharedRuntime */ \
......
//
// Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2015, Red Hat Inc. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//
import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Random;
/**
* @test
* @bug 8130150
* @library /testlibrary
* @requires (os.simpleArch == "x64") & (os.family != "windows")
* @summary Verify that the Montgomery multiply intrinsic works and correctly checks its arguments.
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UseMontgomerySquareIntrinsic
* -XX:+UseMontgomeryMultiplyIntrinsic MontgomeryMultiplyTest
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UseMontgomerySquareIntrinsic
* -XX:-UseMontgomeryMultiplyIntrinsic MontgomeryMultiplyTest
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-UseMontgomerySquareIntrinsic
* -XX:+UseMontgomeryMultiplyIntrinsic MontgomeryMultiplyTest
*/
public class MontgomeryMultiplyTest {
static final MethodHandles.Lookup lookup = MethodHandles.lookup();
static final MethodHandle montgomeryMultiplyHandle, montgomerySquareHandle;
static final MethodHandle bigIntegerConstructorHandle;
static final Field bigIntegerMagField;
static {
// Use reflection to gain access to the methods we want to test.
try {
Method m = BigInteger.class.getDeclaredMethod("montgomeryMultiply",
/*a*/int[].class, /*b*/int[].class, /*n*/int[].class, /*len*/int.class,
/*inv*/long.class, /*product*/int[].class);
m.setAccessible(true);
montgomeryMultiplyHandle = lookup.unreflect(m);
m = BigInteger.class.getDeclaredMethod("montgomerySquare",
/*a*/int[].class, /*n*/int[].class, /*len*/int.class,
/*inv*/long.class, /*product*/int[].class);
m.setAccessible(true);
montgomerySquareHandle = lookup.unreflect(m);
Constructor c
= BigInteger.class.getDeclaredConstructor(int.class, int[].class);
c.setAccessible(true);
bigIntegerConstructorHandle = lookup.unreflectConstructor(c);
bigIntegerMagField = BigInteger.class.getDeclaredField("mag");
bigIntegerMagField.setAccessible(true);
} catch (Throwable ex) {
throw new RuntimeException(ex);
}
}
// Invoke either BigInteger.montgomeryMultiply or BigInteger.montgomerySquare.
int[] montgomeryMultiply(int[] a, int[] b, int[] n, int len, long inv,
int[] product) throws Throwable {
int[] result =
(a == b) ? (int[]) montgomerySquareHandle.invokeExact(a, n, len, inv, product)
: (int[]) montgomeryMultiplyHandle.invokeExact(a, b, n, len, inv, product);
return Arrays.copyOf(result, len);
}
// Invoke the private constructor BigInteger(int[]).
BigInteger newBigInteger(int[] val) throws Throwable {
return (BigInteger) bigIntegerConstructorHandle.invokeExact(1, val);
}
// Get the private field BigInteger.mag
int[] mag(BigInteger n) {
try {
return (int[]) bigIntegerMagField.get(n);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
// Montgomery multiplication
// Calculate a * b * r^-1 mod n
//
// R is a power of the word size
// N' = -N^-1 mod R
//
// T := ab
// m := (T mod R)N' mod R [so 0 <= m < R]
// t := (T + mN)/R
// if t >= N then return t - N else return t
//
BigInteger montgomeryMultiply(BigInteger a, BigInteger b, BigInteger N,
int len, BigInteger n_prime)
throws Throwable {
BigInteger T = a.multiply(b);
BigInteger R = BigInteger.ONE.shiftLeft(len*32);
BigInteger mask = R.subtract(BigInteger.ONE);
BigInteger m = (T.and(mask)).multiply(n_prime);
m = m.and(mask); // i.e. m.mod(R)
T = T.add(m.multiply(N));
T = T.shiftRight(len*32); // i.e. T.divide(R)
if (T.compareTo(N) > 0) {
T = T.subtract(N);
}
return T;
}
// Call the Montgomery multiply intrinsic.
BigInteger montgomeryMultiply(int[] a_words, int[] b_words, int[] n_words,
int len, BigInteger inv)
throws Throwable {
BigInteger t = montgomeryMultiply(
newBigInteger(a_words),
newBigInteger(b_words),
newBigInteger(n_words),
len, inv);
return t;
}
// Check that the Montgomery multiply intrinsic returns the same
// result as the longhand calculation.
void check(int[] a_words, int[] b_words, int[] n_words, int len, BigInteger inv)
throws Throwable {
BigInteger n = newBigInteger(n_words);
BigInteger slow = montgomeryMultiply(a_words, b_words, n_words, len, inv);
BigInteger fast
= newBigInteger(montgomeryMultiply
(a_words, b_words, n_words, len, inv.longValue(), null));
// The intrinsic may not return the same value as the longhand
// calculation but they must have the same residue mod N.
if (!slow.mod(n).equals(fast.mod(n))) {
throw new RuntimeException();
}
}
Random rnd = new Random(0);
// Return a random value of length <= bits in an array of even length
int[] random_val(int bits) {
int len = (bits+63)/64; // i.e. length in longs
int[] val = new int[len*2];
for (int i = 0; i < val.length; i++)
val[i] = rnd.nextInt();
int leadingZeros = 64 - (bits & 64);
if (leadingZeros >= 32) {
val[0] = 0;
val[1] &= ~(-1l << (leadingZeros & 31));
} else {
val[0] &= ~(-1l << leadingZeros);
}
return val;
}
void testOneLength(int lenInBits, int lenInInts) throws Throwable {
BigInteger mod = new BigInteger(lenInBits, 2, rnd);
BigInteger r = BigInteger.ONE.shiftLeft(lenInInts * 32);
BigInteger n_prime = mod.modInverse(r).negate();
// Make n.length even, padding with a zero if necessary
int[] n = mag(mod);
if (n.length < lenInInts) {
int[] x = new int[lenInInts];
System.arraycopy(n, 0, x, lenInInts-n.length, n.length);
n = x;
}
for (int i = 0; i < 10000; i++) {
// multiply
check(random_val(lenInBits), random_val(lenInBits), n, lenInInts, n_prime);
// square
int[] tmp = random_val(lenInBits);
check(tmp, tmp, n, lenInInts, n_prime);
}
}
// Test the Montgomery multiply intrinsic with a bunch of random
// values of varying lengths. Do this for long enough that the
// caller of the intrinsic is C2-compiled.
void testResultValues() throws Throwable {
// Test a couple of interesting edge cases.
testOneLength(1024, 32);
testOneLength(1025, 34);
for (int j = 10; j > 0; j--) {
// Construct a random prime whose length in words is even
int lenInBits = rnd.nextInt(2048) + 64;
int lenInInts = (lenInBits + 63)/64*2;
testOneLength(lenInBits, lenInInts);
}
}
// Range checks
void testOneMontgomeryMultiplyCheck(int[] a, int[] b, int[] n, int len, long inv,
int[] product, Class klass) {
try {
montgomeryMultiply(a, b, n, len, inv, product);
} catch (Throwable ex) {
if (klass.isAssignableFrom(ex.getClass()))
return;
throw new RuntimeException(klass + " expected, " + ex + " was thrown");
}
throw new RuntimeException(klass + " expected, was not thrown");
}
void testOneMontgomeryMultiplyCheck(int[] a, int[] b, BigInteger n, int len, BigInteger inv,
Class klass) {
testOneMontgomeryMultiplyCheck(a, b, mag(n), len, inv.longValue(), null, klass);
}
void testOneMontgomeryMultiplyCheck(int[] a, int[] b, BigInteger n, int len, BigInteger inv,
int[] product, Class klass) {
testOneMontgomeryMultiplyCheck(a, b, mag(n), len, inv.longValue(), product, klass);
}
void testMontgomeryMultiplyChecks() {
int[] blah = random_val(40);
int[] small = random_val(39);
BigInteger mod = new BigInteger(40*32 , 2, rnd);
BigInteger r = BigInteger.ONE.shiftLeft(40*32);
BigInteger n_prime = mod.modInverse(r).negate();
// Length out of range: square
testOneMontgomeryMultiplyCheck(blah, blah, mod, 41, n_prime, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(blah, blah, mod, 0, n_prime, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(blah, blah, mod, -1, n_prime, IllegalArgumentException.class);
// As above, but for multiply
testOneMontgomeryMultiplyCheck(blah, blah.clone(), mod, 41, n_prime, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(blah, blah.clone(), mod, 0, n_prime, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(blah, blah.clone(), mod, 0, n_prime, IllegalArgumentException.class);
// Length odd
testOneMontgomeryMultiplyCheck(small, small, mod, 39, n_prime, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(small, small, mod, 0, n_prime, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(small, small, mod, -1, n_prime, IllegalArgumentException.class);
// As above, but for multiply
testOneMontgomeryMultiplyCheck(small, small.clone(), mod, 39, n_prime, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(small, small.clone(), mod, 0, n_prime, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(small, small.clone(), mod, -1, n_prime, IllegalArgumentException.class);
// array too small
testOneMontgomeryMultiplyCheck(blah, blah, mod, 40, n_prime, small, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(blah, blah.clone(), mod, 40, n_prime, small, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(small, blah, mod, 40, n_prime, blah, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(blah, small, mod, 40, n_prime, blah, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(blah, blah, mod, 40, n_prime, small, IllegalArgumentException.class);
testOneMontgomeryMultiplyCheck(small, small, mod, 40, n_prime, blah, IllegalArgumentException.class);
}
public static void main(String args[]) {
try {
new MontgomeryMultiplyTest().testMontgomeryMultiplyChecks();
new MontgomeryMultiplyTest().testResultValues();
} catch (Throwable ex) {
throw new RuntimeException(ex);
}
}
}
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/**
* @test
* @bug 8081778
* @summary Add C2 x86 intrinsic for BigInteger::mulAdd() method
*
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
* -XX:+IgnoreUnrecognizedVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic
* -XX:+UseMulAddIntrinsic
* -XX:CompileCommand=dontinline,TestMulAdd::main
* -XX:CompileCommand=option,TestMulAdd::base_multiply,ccstr,DisableIntrinsic,_mulAdd
* -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_mulAdd
* -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_mulAdd
* -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_mulAdd
* -XX:CompileCommand=option,java.math.BigInteger::mulAdd,ccstr,DisableIntrinsic,_mulAdd
* -XX:CompileCommand=inline,java.math.BigInteger::multiply
* -XX:CompileCommand=inline,java.math.BigInteger::square
* -XX:CompileCommand=inline,java.math.BigInteger::squareToLen
* -XX:CompileCommand=inline,java.math.BigInteger::mulAdd TestMulAdd
*/
import java.util.Random;
import java.math.*;
public class TestMulAdd {
// Avoid intrinsic by preventing inlining multiply() and mulAdd().
public static BigInteger base_multiply(BigInteger op1) {
return op1.multiply(op1);
}
// Generate mulAdd() intrinsic by inlining multiply().
public static BigInteger new_multiply(BigInteger op1) {
return op1.multiply(op1);
}
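// The @run options above realize this split: DisableIntrinsic,_mulAdd is
// applied to base_multiply() and to the BigInteger methods themselves, while
// multiply()/square()/squareToLen()/mulAdd() are force-inlined, so only the
// compiled call chain reached through new_multiply() can use the mulAdd
// intrinsic.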
public static boolean bytecompare(BigInteger b1, BigInteger b2) {
byte[] data1 = b1.toByteArray();
byte[] data2 = b2.toByteArray();
if (data1.length != data2.length)
return false;
for (int i = 0; i < data1.length; i++) {
if (data1[i] != data2[i])
return false;
}
return true;
}
public static String stringify(BigInteger b) {
String strout = "";
byte[] data = b.toByteArray();
for (int i = 0; i < data.length; i++) {
strout += (String.format("%02x",data[i]) + " ");
}
return strout;
}
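// The main loop squares 100000 random operands of 32..3167 bits through both
// paths, compares each pair of results byte-for-byte, and finally compares
// the accumulated sums as an end-to-end cross-check.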
public static void main(String args[]) throws Exception {
BigInteger oldsum = new BigInteger("0");
BigInteger newsum = new BigInteger("0");
BigInteger b1, b2, oldres, newres;
Random rand = new Random();
long seed = System.nanoTime();
Random rand1 = new Random();
long seed1 = System.nanoTime();
rand.setSeed(seed);
rand1.setSeed(seed1);
for (int j = 0; j < 100000; j++) {
int rand_int = rand1.nextInt(3136)+32;
b1 = new BigInteger(rand_int, rand);
oldres = base_multiply(b1);
newres = new_multiply(b1);
oldsum = oldsum.add(oldres);
newsum = newsum.add(newres);
if (!bytecompare(oldres,newres)) {
System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
System.out.println(b1);
throw new Exception("Failed");
}
}
if (!bytecompare(oldsum,newsum)) {
System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
throw new Exception("Failed");
} else {
System.out.println("Success");
}
}
}
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/**
* @test
* @bug 8081778
* @summary Add C2 x86 intrinsic for BigInteger::squareToLen() method
*
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
* -XX:+IgnoreUnrecognizedVMOptions
* -XX:+UseSquareToLenIntrinsic
* -XX:CompileCommand=exclude,TestSquareToLen::main
* -XX:CompileCommand=option,TestSquareToLen::base_multiply,ccstr,DisableIntrinsic,_squareToLen
* -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_squareToLen
* -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_squareToLen
* -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_squareToLen
* -XX:CompileCommand=inline,java.math.BigInteger::multiply
* -XX:CompileCommand=inline,java.math.BigInteger::square
* -XX:CompileCommand=inline,java.math.BigInteger::squareToLen TestSquareToLen
*/
import java.util.Random;
import java.math.*;
public class TestSquareToLen {
// Avoid intrinsic by preventing inlining multiply() and squareToLen().
public static BigInteger base_multiply(BigInteger op1) {
return op1.multiply(op1);
}
// Generate squareToLen() intrinsic by inlining multiply().
public static BigInteger new_multiply(BigInteger op1) {
return op1.multiply(op1);
}
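// The @run options above disable the _squareToLen intrinsic in base_multiply()
// and in the BigInteger methods themselves, force-inline those methods, and
// keep main() interpreted via CompileCommand=exclude, so only new_multiply()'s
// compiled call chain exercises the squareToLen intrinsic.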
public static boolean bytecompare(BigInteger b1, BigInteger b2) {
byte[] data1 = b1.toByteArray();
byte[] data2 = b2.toByteArray();
if (data1.length != data2.length)
return false;
for (int i = 0; i < data1.length; i++) {
if (data1[i] != data2[i])
return false;
}
return true;
}
public static String stringify(BigInteger b) {
String strout = "";
byte[] data = b.toByteArray();
for (int i = 0; i < data.length; i++) {
strout += (String.format("%02x",data[i]) + " ");
}
return strout;
}
public static void main(String args[]) throws Exception {
BigInteger oldsum = new BigInteger("0");
BigInteger newsum = new BigInteger("0");
BigInteger b1, b2, oldres, newres;
Random rand = new Random();
long seed = System.nanoTime();
Random rand1 = new Random();
long seed1 = System.nanoTime();
rand.setSeed(seed);
rand1.setSeed(seed1);
for (int j = 0; j < 100000; j++) {
int rand_int = rand1.nextInt(3136)+32;
b1 = new BigInteger(rand_int, rand);
oldres = base_multiply(b1);
newres = new_multiply(b1);
oldsum = oldsum.add(oldres);
newsum = newsum.add(newres);
if (!bytecompare(oldres,newres)) {
System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
System.out.println(b1);
throw new Exception("Failed");
}
}
if (!bytecompare(oldsum,newsum)) {
System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
throw new Exception("Failed");
} else {
System.out.println("Success");
}
}
}