From c5837c061bfc07a971f14524ceaaba72bfe5ab77 Mon Sep 17 00:00:00 2001 From: igerasim Date: Wed, 17 Feb 2016 13:40:12 +0300 Subject: [PATCH] 8081778: Use Intel x64 CPU instructions for RSA acceleration Summary: Add intrinsics for BigInteger squareToLen and mulAdd methods. Reviewed-by: kvn, jrose --- src/cpu/x86/vm/assembler_x86.cpp | 20 + src/cpu/x86/vm/assembler_x86.hpp | 3 + src/cpu/x86/vm/macroAssembler_x86.cpp | 497 ++++++++++++++++++ src/cpu/x86/vm/macroAssembler_x86.hpp | 19 + src/cpu/x86/vm/stubGenerator_x86_64.cpp | 107 ++++ src/cpu/x86/vm/stubRoutines_x86_64.hpp | 2 +- src/cpu/x86/vm/vm_version_x86.cpp | 18 + src/share/vm/classfile/vmSymbols.hpp | 8 + src/share/vm/opto/c2_globals.hpp | 6 + src/share/vm/opto/escape.cpp | 4 +- src/share/vm/opto/library_call.cpp | 110 ++++ src/share/vm/opto/runtime.cpp | 42 ++ src/share/vm/opto/runtime.hpp | 4 + src/share/vm/runtime/stubRoutines.cpp | 2 + src/share/vm/runtime/stubRoutines.hpp | 4 + src/share/vm/runtime/vmStructs.cpp | 2 + .../intrinsics/muladd/TestMulAdd.java | 117 +++++ .../squaretolen/TestSquareToLen.java | 114 ++++ 18 files changed, 1077 insertions(+), 2 deletions(-) create mode 100644 test/compiler/intrinsics/muladd/TestMulAdd.java create mode 100644 test/compiler/intrinsics/squaretolen/TestSquareToLen.java diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp index 8098e889b..7cbc47d60 100644 --- a/src/cpu/x86/vm/assembler_x86.cpp +++ b/src/cpu/x86/vm/assembler_x86.cpp @@ -2318,6 +2318,13 @@ void Assembler::orl(Register dst, Register src) { emit_arith(0x0B, 0xC0, dst, src); } +void Assembler::orl(Address dst, Register src) { + InstructionMark im(this); + prefix(dst, src); + emit_int8(0x09); + emit_operand(src, dst); +} + void Assembler::packuswb(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); @@ -5613,6 +5620,19 @@ void Assembler::rclq(Register dst, int imm8) { } } +void Assembler::rcrq(Register dst, int imm8) { + assert(isShiftCount(imm8 >> 1), "illegal shift count"); + int encode = prefixq_and_encode(dst->encoding()); + if (imm8 == 1) { + emit_int8((unsigned char)0xD1); + emit_int8((unsigned char)(0xD8 | encode)); + } else { + emit_int8((unsigned char)0xC1); + emit_int8((unsigned char)(0xD8 | encode)); + emit_int8(imm8); + } +} + void Assembler::rorq(Register dst, int imm8) { assert(isShiftCount(imm8 >> 1), "illegal shift count"); int encode = prefixq_and_encode(dst->encoding()); diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp index fc270dbbd..341d9e39b 100644 --- a/src/cpu/x86/vm/assembler_x86.hpp +++ b/src/cpu/x86/vm/assembler_x86.hpp @@ -1455,6 +1455,7 @@ private: void orl(Register dst, int32_t imm32); void orl(Register dst, Address src); void orl(Register dst, Register src); + void orl(Address dst, Register src); void orq(Address dst, int32_t imm32); void orq(Register dst, int32_t imm32); @@ -1555,6 +1556,8 @@ private: void rclq(Register dst, int imm8); + void rcrq(Register dst, int imm8); + void rdtsc(); void ret(int imm16); diff --git a/src/cpu/x86/vm/macroAssembler_x86.cpp b/src/cpu/x86/vm/macroAssembler_x86.cpp index d6f179770..764cf1bcf 100644 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp @@ -7767,6 +7767,503 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Regi pop(tmp2); pop(tmp1); } + +//Helper functions for square_to_len() + +/** + * Store the squares of x[], right shifted one bit 
(divided by 2) into z[]
+ * Preserves x and z and modifies rest of the registers.
+ */
+
+void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+  // Perform square and right shift by 1
+  // Handle odd xlen case first, then for even xlen do the following
+  // jlong carry = 0;
+  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
+  //    huge_128 product = x[j:j+1] * x[j:j+1];
+  //    z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
+  //    z[i+2:i+3] = (jlong)(product >>> 1);
+  //    carry = (jlong)product;
+  // }
+
+  xorq(tmp5, tmp5);     // carry
+  xorq(rdxReg, rdxReg);
+  xorl(tmp1, tmp1);     // index for x
+  xorl(tmp4, tmp4);     // index for z
+
+  Label L_first_loop, L_first_loop_exit;
+
+  testl(xlen, 1);
+  jccb(Assembler::zero, L_first_loop); // jump if xlen is even
+
+  // Square and right shift by 1 the odd element using 32 bit multiply
+  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
+  imulq(raxReg, raxReg);
+  shrq(raxReg, 1);
+  adcq(tmp5, 0);
+  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
+  incrementl(tmp1);
+  addl(tmp4, 2);
+
+  // Square and right shift by 1 the rest using 64 bit multiply
+  bind(L_first_loop);
+  cmpptr(tmp1, xlen);
+  jccb(Assembler::equal, L_first_loop_exit);
+
+  // Square
+  movq(raxReg, Address(x, tmp1, Address::times_4, 0));
+  rorq(raxReg, 32);    // convert big-endian to little-endian
+  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
+
+  // Right shift by 1 and save carry
+  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
+  rcrq(rdxReg, 1);
+  rcrq(raxReg, 1);
+  adcq(tmp5, 0);
+
+  // Store result in z
+  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
+  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
+
+  // Update indices for x and z
+  addl(tmp1, 2);
+  addl(tmp4, 4);
+  jmp(L_first_loop);
+
+  bind(L_first_loop_exit);
+}
+
+
+/**
+ * Perform the following multiply add operation using BMI2 instructions
+ * carry:sum = sum + op1*op2 + carry
+ * op2 should be in rdx
+ * op2 is preserved, all other registers are modified
+ */
+void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
+  // assert op2 is rdx
+  mulxq(tmp2, op1, op1);  // op1 * op2 -> tmp2:op1
+  addq(sum, carry);
+  adcq(tmp2, 0);
+  addq(sum, op1);
+  adcq(tmp2, 0);
+  movq(carry, tmp2);
+}
+
+/**
+ * Perform the following multiply add operation:
+ * carry:sum = sum + op1*op2 + carry
+ * Preserves op1, op2 and modifies rest of registers
+ */
+void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
+  // rdx:rax = op1 * op2
+  movq(raxReg, op2);
+  mulq(op1);
+
+  // rdx:rax = sum + carry + rdx:rax
+  addq(sum, carry);
+  adcq(rdxReg, 0);
+  addq(sum, raxReg);
+  adcq(rdxReg, 0);
+
+  // carry:sum = rdx:sum
+  movq(carry, rdxReg);
+}
+
+/**
+ * Add 64 bit long carry into z[] with carry propagation.
+ * Preserves z and carry register values and modifies rest of registers.
+ * + */ +void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { + Label L_fourth_loop, L_fourth_loop_exit; + + movl(tmp1, 1); + subl(zlen, 2); + addq(Address(z, zlen, Address::times_4, 0), carry); + + bind(L_fourth_loop); + jccb(Assembler::carryClear, L_fourth_loop_exit); + subl(zlen, 2); + jccb(Assembler::negative, L_fourth_loop_exit); + addq(Address(z, zlen, Address::times_4, 0), tmp1); + jmp(L_fourth_loop); + bind(L_fourth_loop_exit); +} + +/** + * Shift z[] left by 1 bit. + * Preserves x, len, z and zlen registers and modifies rest of the registers. + * + */ +void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { + + Label L_fifth_loop, L_fifth_loop_exit; + + // Fifth loop + // Perform primitiveLeftShift(z, zlen, 1) + + const Register prev_carry = tmp1; + const Register new_carry = tmp4; + const Register value = tmp2; + const Register zidx = tmp3; + + // int zidx, carry; + // long value; + // carry = 0; + // for (zidx = zlen-2; zidx >=0; zidx -= 2) { + // (carry:value) = (z[i] << 1) | carry ; + // z[i] = value; + // } + + movl(zidx, zlen); + xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register + + bind(L_fifth_loop); + decl(zidx); // Use decl to preserve carry flag + decl(zidx); + jccb(Assembler::negative, L_fifth_loop_exit); + + if (UseBMI2Instructions) { + movq(value, Address(z, zidx, Address::times_4, 0)); + rclq(value, 1); + rorxq(value, value, 32); + movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form + } + else { + // clear new_carry + xorl(new_carry, new_carry); + + // Shift z[i] by 1, or in previous carry and save new carry + movq(value, Address(z, zidx, Address::times_4, 0)); + shlq(value, 1); + adcl(new_carry, 0); + + orq(value, prev_carry); + rorq(value, 0x20); + movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form + + // Set previous carry = new carry + movl(prev_carry, new_carry); + } + jmp(L_fifth_loop); + + bind(L_fifth_loop_exit); +} + + +/** + * Code for BigInteger::squareToLen() intrinsic + * + * rdi: x + * rsi: len + * r8: z + * rcx: zlen + * r12: tmp1 + * r13: tmp2 + * r14: tmp3 + * r15: tmp4 + * rbx: tmp5 + * + */ +void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + + Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply; + push(tmp1); + push(tmp2); + push(tmp3); + push(tmp4); + push(tmp5); + + // First loop + // Store the squares, right shifted one bit (i.e., divided by 2). + square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); + + // Add in off-diagonal sums. + // + // Second, third (nested) and fourth loops. 
// zlen +=2;
+  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
+  //    carry = 0;
+  //    long op2 = x[xidx:xidx+1];
+  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
+  //      k -= 2;
+  //      long op1 = x[j:j+1];
+  //      long sum = z[k:k+1];
+  //      carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
+  //      z[k:k+1] = sum;
+  //    }
+  //    add_one_64(z, k, carry, tmp_regs);
+  // }
+
+  const Register carry = tmp5;
+  const Register sum = tmp3;
+  const Register op1 = tmp4;
+  Register op2 = tmp2;
+
+  push(zlen);
+  push(len);
+  addl(zlen, 2);
+  bind(L_second_loop);
+  xorq(carry, carry);
+  subl(zlen, 4);
+  subl(len, 2);
+  push(zlen);
+  push(len);
+  cmpl(len, 0);
+  jccb(Assembler::lessEqual, L_second_loop_exit);
+
+  // Multiply an array by one 64 bit long.
+  if (UseBMI2Instructions) {
+    op2 = rdxReg;
+    movq(op2, Address(x, len, Address::times_4, 0));
+    rorxq(op2, op2, 32);
+  }
+  else {
+    movq(op2, Address(x, len, Address::times_4, 0));
+    rorq(op2, 32);
+  }
+
+  bind(L_third_loop);
+  decrementl(len);
+  jccb(Assembler::negative, L_third_loop_exit);
+  decrementl(len);
+  jccb(Assembler::negative, L_last_x);
+
+  movq(op1, Address(x, len, Address::times_4, 0));
+  rorq(op1, 32);
+
+  bind(L_multiply);
+  subl(zlen, 2);
+  movq(sum, Address(z, zlen, Address::times_4, 0));
+
+  // Multiply 64 bit by 64 bit; add the lower 64 bits of the product into sum and keep the upper 64 bits as carry.
+  if (UseBMI2Instructions) {
+    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
+  }
+  else {
+    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+  }
+
+  movq(Address(z, zlen, Address::times_4, 0), sum);
+
+  jmp(L_third_loop);
+  bind(L_third_loop_exit);
+
+  // Fourth loop
+  // Add 64 bit long carry into z with carry propagation.
+  // Uses the already decremented zlen.
+  add_one_64(z, zlen, carry, tmp1);
+
+  pop(len);
+  pop(zlen);
+  jmp(L_second_loop);
+
+  // Next infrequent code is moved outside loops.
+  bind(L_last_x);
+  movl(op1, Address(x, 0));
+  jmp(L_multiply);
+
+  bind(L_second_loop_exit);
+  pop(len);
+  pop(zlen);
+  pop(len);
+  pop(zlen);
+
+  // Fifth loop
+  // Shift z left 1 bit.
+  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
+
+  // z[zlen-1] |= x[len-1] & 1;
+  movl(tmp3, Address(x, len, Address::times_4, -4));
+  andl(tmp3, 1);
+  orl(Address(z, zlen, Address::times_4, -4), tmp3);
+
+  pop(tmp5);
+  pop(tmp4);
+  pop(tmp3);
+  pop(tmp2);
+  pop(tmp1);
+}
+
+/**
+ * Helper function for mul_add()
+ * Multiply in[] by int k and add to out[] starting at offset offs using
+ * 128 bit by 32 bit multiply and return the carry in tmp5.
+ * Only the quad-int-aligned part of in[] is processed in this function.
+ * k is in rdxReg when BMI2 instructions are used, otherwise in tmp2.
+ * This function preserves out, in and k registers.
+ * len and offset point to the appropriate index into "in" and "out" respectively.
+ * tmp5 has the carry.
+ * other registers are temporary and are modified.
+ * + */ +void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, + Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, + Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + + Label L_first_loop, L_first_loop_exit; + + movl(tmp1, len); + shrl(tmp1, 2); + + bind(L_first_loop); + subl(tmp1, 1); + jccb(Assembler::negative, L_first_loop_exit); + + subl(len, 4); + subl(offset, 4); + + Register op2 = tmp2; + const Register sum = tmp3; + const Register op1 = tmp4; + const Register carry = tmp5; + + if (UseBMI2Instructions) { + op2 = rdxReg; + } + + movq(op1, Address(in, len, Address::times_4, 8)); + rorq(op1, 32); + movq(sum, Address(out, offset, Address::times_4, 8)); + rorq(sum, 32); + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + // Store back in big endian from little endian + rorq(sum, 0x20); + movq(Address(out, offset, Address::times_4, 8), sum); + + movq(op1, Address(in, len, Address::times_4, 0)); + rorq(op1, 32); + movq(sum, Address(out, offset, Address::times_4, 0)); + rorq(sum, 32); + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + // Store back in big endian from little endian + rorq(sum, 0x20); + movq(Address(out, offset, Address::times_4, 0), sum); + + jmp(L_first_loop); + bind(L_first_loop_exit); +} + +/** + * Code for BigInteger::mulAdd() intrinsic + * + * rdi: out + * rsi: in + * r11: offs (out.length - offset) + * rcx: len + * r8: k + * r12: tmp1 + * r13: tmp2 + * r14: tmp3 + * r15: tmp4 + * rbx: tmp5 + * Multiply the in[] by word k and add to out[], return the carry in rax + */ +void MacroAssembler::mul_add(Register out, Register in, Register offs, + Register len, Register k, Register tmp1, Register tmp2, Register tmp3, + Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + + Label L_carry, L_last_in, L_done; + +// carry = 0; +// for (int j=len-1; j >= 0; j--) { +// long product = (in[j] & LONG_MASK) * kLong + +// (out[offs] & LONG_MASK) + carry; +// out[offs--] = (int)product; +// carry = product >>> 32; +// } +// + push(tmp1); + push(tmp2); + push(tmp3); + push(tmp4); + push(tmp5); + + Register op2 = tmp2; + const Register sum = tmp3; + const Register op1 = tmp4; + const Register carry = tmp5; + + if (UseBMI2Instructions) { + op2 = rdxReg; + movl(op2, k); + } + else { + movl(op2, k); + } + + xorq(carry, carry); + + //First loop + + //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply + //The carry is in tmp5 + mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); + + //Multiply the trailing in[] entry using 64 bit by 32 bit, if any + decrementl(len); + jccb(Assembler::negative, L_carry); + decrementl(len); + jccb(Assembler::negative, L_last_in); + + movq(op1, Address(in, len, Address::times_4, 0)); + rorq(op1, 32); + + subl(offs, 2); + movq(sum, Address(out, offs, Address::times_4, 0)); + rorq(sum, 32); + + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + + // Store back in big endian from little endian + rorq(sum, 0x20); + movq(Address(out, offs, Address::times_4, 0), sum); + + testl(len, len); + jccb(Assembler::zero, L_carry); + + //Multiply the last in[] entry, if any + bind(L_last_in); + movl(op1, Address(in, 0)); + 
movl(sum, Address(out, offs, Address::times_4, -4));
+
+  movl(raxReg, k);
+  mull(op1); // tmp4 * eax -> edx:eax
+  addl(sum, carry);
+  adcl(rdxReg, 0);
+  addl(sum, raxReg);
+  adcl(rdxReg, 0);
+  movl(carry, rdxReg);
+
+  movl(Address(out, offs, Address::times_4, -4), sum);
+
+  bind(L_carry);
+  // return tmp5/carry as carry in rax
+  movl(rax, carry);
+
+  bind(L_done);
+  pop(tmp5);
+  pop(tmp4);
+  pop(tmp3);
+  pop(tmp2);
+  pop(tmp1);
+}
 #endif
 
 /**
diff --git a/src/cpu/x86/vm/macroAssembler_x86.hpp b/src/cpu/x86/vm/macroAssembler_x86.hpp
index 69c9e8aa3..cc5e5856a 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp
@@ -1241,6 +1241,25 @@ public:
         Register carry2);
   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
                        Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
+
+  void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
+                     Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
+  void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
+                            Register tmp2);
+  void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
+                       Register rdxReg, Register raxReg);
+  void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
+  void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
+                   Register tmp3, Register tmp4);
+  void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
+                     Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
+
+  void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
+                             Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
+                             Register raxReg);
+  void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
+               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
+               Register raxReg);
 #endif
 
   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 0bb6118d7..a9f4e9448 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -3743,6 +3743,107 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  /**
+   * Arguments:
+   *
+   * Input:
+   *   c_rarg0   - x address
+   *   c_rarg1   - x length
+   *   c_rarg2   - z address
+   *   c_rarg3   - z length
+   *
+   */
+  address generate_squareToLen() {
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "squareToLen");
+
+    address start = __ pc();
+    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+    // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
+ const Register x = rdi; + const Register len = rsi; + const Register z = r8; + const Register zlen = rcx; + + const Register tmp1 = r12; + const Register tmp2 = r13; + const Register tmp3 = r14; + const Register tmp4 = r15; + const Register tmp5 = rbx; + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of RuntimeStub frame + + setup_arg_regs(4); // x => rdi, len => rsi, z => rdx + // zlen => rcx + // r9 and r10 may be used to save non-volatile registers + __ movptr(r8, rdx); + __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); + + restore_arg_regs(); + + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + /** + * Arguments: + * + * Input: + * c_rarg0 - out address + * c_rarg1 - in address + * c_rarg2 - offset + * c_rarg3 - len + * not Win64 + * c_rarg4 - k + * Win64 + * rsp+40 - k + */ + address generate_mulAdd() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "mulAdd"); + + address start = __ pc(); + // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) + // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) + const Register out = rdi; + const Register in = rsi; + const Register offset = r11; + const Register len = rcx; + const Register k = r8; + + // Next registers will be saved on stack in mul_add(). + const Register tmp1 = r12; + const Register tmp2 = r13; + const Register tmp3 = r14; + const Register tmp4 = r15; + const Register tmp5 = rbx; + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of RuntimeStub frame + + setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx + // len => rcx, k => r8 + // r9 and r10 may be used to save non-volatile registers +#ifdef _WIN64 + // last argument is on stack on Win64 + __ movl(k, Address(rsp, 6 * wordSize)); +#endif + __ movptr(r11, rdx); // move offset in rdx to offset(r11) + __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); + + restore_arg_regs(); + + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + #undef __ #define __ masm-> @@ -3987,6 +4088,12 @@ class StubGenerator: public StubCodeGenerator { if (UseMultiplyToLenIntrinsic) { StubRoutines::_multiplyToLen = generate_multiplyToLen(); } + if (UseSquareToLenIntrinsic) { + StubRoutines::_squareToLen = generate_squareToLen(); + } + if (UseMulAddIntrinsic) { + StubRoutines::_mulAdd = generate_mulAdd(); + } #endif } diff --git a/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/src/cpu/x86/vm/stubRoutines_x86_64.hpp index 205bce4eb..15922b8ae 100644 --- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp +++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp @@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _ enum platform_dependent_constants { code_size1 = 19000, // simply increase if too small (assembler will crash if too small) - code_size2 = 22000 // simply increase if too small (assembler will crash if too small) + code_size2 = 23000 // simply increase if too small (assembler will crash if too small) }; class x86 { diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp index adfdfd67e..807f33efd 100644 --- a/src/cpu/x86/vm/vm_version_x86.cpp +++ b/src/cpu/x86/vm/vm_version_x86.cpp @@ -703,6 +703,12 @@ void VM_Version::get_processor_features() { if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { UseMultiplyToLenIntrinsic = true; } + if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { + 
UseSquareToLenIntrinsic = true; + } + if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { + UseMulAddIntrinsic = true; + } #else if (UseMultiplyToLenIntrinsic) { if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { @@ -710,6 +716,18 @@ void VM_Version::get_processor_features() { } FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false); } + if (UseSquareToLenIntrinsic) { + if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { + warning("squareToLen intrinsic is not available in 32-bit VM"); + } + FLAG_SET_DEFAULT(UseSquareToLenIntrinsic, false); + } + if (UseMulAddIntrinsic) { + if (!FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { + warning("mulAdd intrinsic is not available in 32-bit VM"); + } + FLAG_SET_DEFAULT(UseMulAddIntrinsic, false); + } #endif #endif // COMPILER2 diff --git a/src/share/vm/classfile/vmSymbols.hpp b/src/share/vm/classfile/vmSymbols.hpp index 0a84cb074..96032fbe9 100644 --- a/src/share/vm/classfile/vmSymbols.hpp +++ b/src/share/vm/classfile/vmSymbols.hpp @@ -793,6 +793,14 @@ do_name( multiplyToLen_name, "multiplyToLen") \ do_signature(multiplyToLen_signature, "([II[II[I)[I") \ \ + do_intrinsic(_squareToLen, java_math_BigInteger, squareToLen_name, squareToLen_signature, F_S) \ + do_name( squareToLen_name, "implSquareToLen") \ + do_signature(squareToLen_signature, "([II[II)[I") \ + \ + do_intrinsic(_mulAdd, java_math_BigInteger, mulAdd_name, mulAdd_signature, F_S) \ + do_name( mulAdd_name, "implMulAdd") \ + do_signature(mulAdd_signature, "([I[IIII)I") \ + \ /* java/lang/ref/Reference */ \ do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ \ diff --git a/src/share/vm/opto/c2_globals.hpp b/src/share/vm/opto/c2_globals.hpp index adeee7a83..5a7cdf51f 100644 --- a/src/share/vm/opto/c2_globals.hpp +++ b/src/share/vm/opto/c2_globals.hpp @@ -659,6 +659,12 @@ product(bool, UseMultiplyToLenIntrinsic, false, \ "Enables intrinsification of BigInteger.multiplyToLen()") \ \ + product(bool, UseSquareToLenIntrinsic, false, \ + "Enables intrinsification of BigInteger.squareToLen()") \ + \ + product(bool, UseMulAddIntrinsic, false, \ + "Enables intrinsification of BigInteger.mulAdd()") \ + \ product(bool, UseTypeSpeculation, true, \ "Speculatively propagate types from profiles") \ \ diff --git a/src/share/vm/opto/escape.cpp b/src/share/vm/opto/escape.cpp index b5e1f6b72..779af14fa 100644 --- a/src/share/vm/opto/escape.cpp +++ b/src/share/vm/opto/escape.cpp @@ -958,7 +958,9 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 || strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 || strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 || - strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0) + strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 || + strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 || + strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0) ))) { call->dump(); fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name)); diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp index b234d8bb6..461bcc966 100644 --- a/src/share/vm/opto/library_call.cpp +++ b/src/share/vm/opto/library_call.cpp @@ -324,6 +324,8 @@ class LibraryCallKit : public GraphKit { bool inline_updateBytesCRC32(); bool inline_updateByteBufferCRC32(); bool inline_multiplyToLen(); + bool inline_squareToLen(); + bool inline_mulAdd(); bool inline_profileBoolean(); }; @@ -527,6 +529,14 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, 
bool is_virtual) {
     if (!UseMultiplyToLenIntrinsic) return NULL;
     break;
+  case vmIntrinsics::_squareToLen:
+    if (!UseSquareToLenIntrinsic) return NULL;
+    break;
+
+  case vmIntrinsics::_mulAdd:
+    if (!UseMulAddIntrinsic) return NULL;
+    break;
+
   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     if (!UseAESIntrinsics) return NULL;
@@ -927,6 +937,12 @@ bool LibraryCallKit::try_to_inline(int predicate) {
   case vmIntrinsics::_multiplyToLen:
     return inline_multiplyToLen();
 
+  case vmIntrinsics::_squareToLen:
+    return inline_squareToLen();
+
+  case vmIntrinsics::_mulAdd:
+    return inline_mulAdd();
+
   case vmIntrinsics::_encodeISOArray:
     return inline_encodeISOArray();
 
@@ -5856,6 +5872,100 @@ bool LibraryCallKit::inline_multiplyToLen() {
   return true;
 }
 
+//-------------inline_squareToLen------------------------------------
+bool LibraryCallKit::inline_squareToLen() {
+  assert(UseSquareToLenIntrinsic, "not implemented on this platform");
+
+  address stubAddr = StubRoutines::squareToLen();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+  const char* stubName = "squareToLen";
+
+  assert(callee()->signature()->size() == 4, "implSquareToLen has 4 parameters");
+
+  Node* x    = argument(0);
+  Node* len  = argument(1);
+  Node* z    = argument(2);
+  Node* zlen = argument(3);
+
+  const Type* x_type = x->Value(&_gvn);
+  const Type* z_type = z->Value(&_gvn);
+  const TypeAryPtr* top_x = x_type->isa_aryptr();
+  const TypeAryPtr* top_z = z_type->isa_aryptr();
+  if (top_x == NULL || top_x->klass() == NULL ||
+      top_z == NULL || top_z->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType z_elem = z_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (x_elem != T_INT || z_elem != T_INT) {
+    return false;
+  }
+
+  Node* x_start = array_element_address(x, intcon(0), x_elem);
+  Node* z_start = array_element_address(z, intcon(0), z_elem);
+
+  Node* call = make_runtime_call(RC_LEAF|RC_NO_FP,
+                                 OptoRuntime::squareToLen_Type(),
+                                 stubAddr, stubName, TypePtr::BOTTOM,
+                                 x_start, len, z_start, zlen);
+
+  set_result(z);
+  return true;
+}
+
+//-------------inline_mulAdd------------------------------------------
+bool LibraryCallKit::inline_mulAdd() {
+  assert(UseMulAddIntrinsic, "not implemented on this platform");
+
+  address stubAddr = StubRoutines::mulAdd();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+  const char* stubName = "mulAdd";
+
+  assert(callee()->signature()->size() == 5, "mulAdd has 5 parameters");
+
+  Node* out    = argument(0);
+  Node* in     = argument(1);
+  Node* offset = argument(2);
+  Node* len    = argument(3);
+  Node* k      = argument(4);
+
+  const Type* out_type = out->Value(&_gvn);
+  const Type* in_type = in->Value(&_gvn);
+  const TypeAryPtr* top_out = out_type->isa_aryptr();
+  const TypeAryPtr* top_in = in_type->isa_aryptr();
+  if (top_out == NULL || top_out->klass() == NULL ||
+      top_in == NULL || top_in->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType out_elem = out_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType in_elem = in_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (out_elem != T_INT || in_elem != T_INT) {
+    return false;
+  }
+
+  Node* outlen = load_array_length(out);
+  Node* new_offset =
_gvn.transform(new (C) SubINode(outlen, offset)); + Node* out_start = array_element_address(out, intcon(0), out_elem); + Node* in_start = array_element_address(in, intcon(0), in_elem); + + Node* call = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::mulAdd_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + out_start,in_start, new_offset, len, k); + Node* result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms)); + set_result(result); + return true; +} + /** * Calculate CRC32 for byte. diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp index 2b62a9961..4f96e3ea0 100644 --- a/src/share/vm/opto/runtime.cpp +++ b/src/share/vm/opto/runtime.cpp @@ -956,6 +956,48 @@ const TypeFunc* OptoRuntime::multiplyToLen_Type() { return TypeFunc::make(domain, range); } +const TypeFunc* OptoRuntime::squareToLen_Type() { + // create input type (domain) + int num_args = 4; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // x + fields[argp++] = TypeInt::INT; // len + fields[argp++] = TypePtr::NOTNULL; // z + fields[argp++] = TypeInt::INT; // zlen + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + +// for mulAdd calls, 2 pointers and 3 ints, returning int +const TypeFunc* OptoRuntime::mulAdd_Type() { + // create input type (domain) + int num_args = 5; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // out + fields[argp++] = TypePtr::NOTNULL; // in + fields[argp++] = TypeInt::INT; // offset + fields[argp++] = TypeInt::INT; // len + fields[argp++] = TypeInt::INT; // k + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // returning carry (int) + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = TypeInt::INT; + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields); + return TypeFunc::make(domain, range); +} + //------------- Interpreter state access for on stack replacement diff --git a/src/share/vm/opto/runtime.hpp b/src/share/vm/opto/runtime.hpp index 3258c36ea..aae3c2b17 100644 --- a/src/share/vm/opto/runtime.hpp +++ b/src/share/vm/opto/runtime.hpp @@ -305,6 +305,10 @@ private: static const TypeFunc* multiplyToLen_Type(); + static const TypeFunc* squareToLen_Type(); + + static const TypeFunc* mulAdd_Type(); + static const TypeFunc* updateBytesCRC32_Type(); // leaf on stack replacement interpreter accessor types diff --git a/src/share/vm/runtime/stubRoutines.cpp b/src/share/vm/runtime/stubRoutines.cpp index f35cce9f5..9fc123a73 100644 --- a/src/share/vm/runtime/stubRoutines.cpp +++ b/src/share/vm/runtime/stubRoutines.cpp @@ -136,6 +136,8 @@ address StubRoutines::_updateBytesCRC32 = NULL; address StubRoutines::_crc_table_adr = NULL; address StubRoutines::_multiplyToLen = NULL; +address StubRoutines::_squareToLen = NULL; +address StubRoutines::_mulAdd = NULL; double (* StubRoutines::_intrinsic_log )(double) = NULL; double (* StubRoutines::_intrinsic_log10 )(double) = NULL; diff --git a/src/share/vm/runtime/stubRoutines.hpp b/src/share/vm/runtime/stubRoutines.hpp index 819b9c42e..56ae9ae45 
100644 --- a/src/share/vm/runtime/stubRoutines.hpp +++ b/src/share/vm/runtime/stubRoutines.hpp @@ -209,6 +209,8 @@ class StubRoutines: AllStatic { static address _crc_table_adr; static address _multiplyToLen; + static address _squareToLen; + static address _mulAdd; // These are versions of the java.lang.Math methods which perform // the same operations as the intrinsic version. They are used for @@ -367,6 +369,8 @@ class StubRoutines: AllStatic { static address crc_table_addr() { return _crc_table_adr; } static address multiplyToLen() {return _multiplyToLen; } + static address squareToLen() {return _squareToLen; } + static address mulAdd() {return _mulAdd; } static address select_fill_function(BasicType t, bool aligned, const char* &name); diff --git a/src/share/vm/runtime/vmStructs.cpp b/src/share/vm/runtime/vmStructs.cpp index f1200e339..80b01150b 100644 --- a/src/share/vm/runtime/vmStructs.cpp +++ b/src/share/vm/runtime/vmStructs.cpp @@ -813,6 +813,8 @@ typedef TwoOopHashtable SymbolTwoOopHashtable; static_field(StubRoutines, _updateBytesCRC32, address) \ static_field(StubRoutines, _crc_table_adr, address) \ static_field(StubRoutines, _multiplyToLen, address) \ + static_field(StubRoutines, _squareToLen, address) \ + static_field(StubRoutines, _mulAdd, address) \ \ /*****************/ \ /* SharedRuntime */ \ diff --git a/test/compiler/intrinsics/muladd/TestMulAdd.java b/test/compiler/intrinsics/muladd/TestMulAdd.java new file mode 100644 index 000000000..4d7b274c2 --- /dev/null +++ b/test/compiler/intrinsics/muladd/TestMulAdd.java @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @test + * @bug 8081778 + * @summary Add C2 x86 intrinsic for BigInteger::mulAdd() method + * + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch + * -XX:+IgnoreUnrecognizedVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic + * -XX:CompileCommand=dontinline,TestMulAdd::main + * -XX:CompileCommand=option,TestMulAdd::base_multiply,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=option,java.math.BigInteger::mulAdd,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=inline,java.math.BigInteger::multiply + * -XX:CompileCommand=inline,java.math.BigInteger::square + * -XX:CompileCommand=inline,java.math.BigInteger::squareToLen + * -XX:CompileCommand=inline,java.math.BigInteger::mulAdd TestMulAdd + */ + +import java.util.Random; +import java.math.*; + +public class TestMulAdd { + + // Avoid intrinsic by preventing inlining multiply() and mulAdd(). + public static BigInteger base_multiply(BigInteger op1) { + return op1.multiply(op1); + } + + // Generate mulAdd() intrinsic by inlining multiply(). + public static BigInteger new_multiply(BigInteger op1) { + return op1.multiply(op1); + } + + public static boolean bytecompare(BigInteger b1, BigInteger b2) { + byte[] data1 = b1.toByteArray(); + byte[] data2 = b2.toByteArray(); + if (data1.length != data2.length) + return false; + for (int i = 0; i < data1.length; i++) { + if (data1[i] != data2[i]) + return false; + } + return true; + } + + public static String stringify(BigInteger b) { + String strout= ""; + byte [] data = b.toByteArray(); + for (int i = 0; i < data.length; i++) { + strout += (String.format("%02x",data[i]) + " "); + } + return strout; + } + + public static void main(String args[]) throws Exception { + + BigInteger oldsum = new BigInteger("0"); + BigInteger newsum = new BigInteger("0"); + + BigInteger b1, b2, oldres, newres; + + Random rand = new Random(); + long seed = System.nanoTime(); + Random rand1 = new Random(); + long seed1 = System.nanoTime(); + rand.setSeed(seed); + rand1.setSeed(seed1); + + for (int j = 0; j < 100000; j++) { + int rand_int = rand1.nextInt(3136)+32; + b1 = new BigInteger(rand_int, rand); + + oldres = base_multiply(b1); + newres = new_multiply(b1); + + oldsum = oldsum.add(oldres); + newsum = newsum.add(newres); + + if (!bytecompare(oldres,newres)) { + System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres)); + System.out.println(b1); + throw new Exception("Failed"); + } + } + if (!bytecompare(oldsum,newsum)) { + System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum)); + throw new Exception("Failed"); + } else { + System.out.println("Success"); + } + } +} diff --git a/test/compiler/intrinsics/squaretolen/TestSquareToLen.java b/test/compiler/intrinsics/squaretolen/TestSquareToLen.java new file mode 100644 index 000000000..79ebbe3af --- /dev/null +++ b/test/compiler/intrinsics/squaretolen/TestSquareToLen.java @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/** + * @test + * @bug 8081778 + * @summary Add C2 x86 intrinsic for BigInteger::squareToLen() method + * + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch + * -XX:CompileCommand=exclude,TestSquareToLen::main + * -XX:CompileCommand=option,TestSquareToLen::base_multiply,ccstr,DisableIntrinsic,_squareToLen + * -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_squareToLen + * -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_squareToLen + * -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_squareToLen + * -XX:CompileCommand=inline,java.math.BigInteger::multiply + * -XX:CompileCommand=inline,java.math.BigInteger::square + * -XX:CompileCommand=inline,java.math.BigInteger::squareToLen TestSquareToLen + */ + +import java.util.Random; +import java.math.*; + +public class TestSquareToLen { + + // Avoid intrinsic by preventing inlining multiply() and squareToLen(). + public static BigInteger base_multiply(BigInteger op1) { + return op1.multiply(op1); + } + + // Generate squareToLen() intrinsic by inlining multiply(). 
+ public static BigInteger new_multiply(BigInteger op1) { + return op1.multiply(op1); + } + + public static boolean bytecompare(BigInteger b1, BigInteger b2) { + byte[] data1 = b1.toByteArray(); + byte[] data2 = b2.toByteArray(); + if (data1.length != data2.length) + return false; + for (int i = 0; i < data1.length; i++) { + if (data1[i] != data2[i]) + return false; + } + return true; + } + + public static String stringify(BigInteger b) { + String strout= ""; + byte [] data = b.toByteArray(); + for (int i = 0; i < data.length; i++) { + strout += (String.format("%02x",data[i]) + " "); + } + return strout; + } + + public static void main(String args[]) throws Exception { + + BigInteger oldsum = new BigInteger("0"); + BigInteger newsum = new BigInteger("0"); + + BigInteger b1, b2, oldres, newres; + + Random rand = new Random(); + long seed = System.nanoTime(); + Random rand1 = new Random(); + long seed1 = System.nanoTime(); + rand.setSeed(seed); + rand1.setSeed(seed1); + + for (int j = 0; j < 100000; j++) { + int rand_int = rand1.nextInt(3136)+32; + b1 = new BigInteger(rand_int, rand); + + oldres = base_multiply(b1); + newres = new_multiply(b1); + + oldsum = oldsum.add(oldres); + newsum = newsum.add(newres); + + if (!bytecompare(oldres,newres)) { + System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres)); + System.out.println(b1); + throw new Exception("Failed"); + } + } + if (!bytecompare(oldsum,newsum)) { + System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum)); + throw new Exception("Failed"); + } else { + System.out.println("Success"); + } + } +} -- GitLab
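For reference, the Java-level contract the two new stubs must reproduce can be written out in a few dozen lines. The sketch below restates the implSquareToLen/implMulAdd semantics in plain Java, following the pseudo-code embedded in the macro-assembler comments above: a first loop that stores the squares right-shifted one bit, off-diagonal products added in via mulAdd with carry propagation, then a final left shift with the low bit restored. The class name and the cross-check harness are illustrative, not JDK source; the helper names mirror BigInteger's private helpers.

import java.math.BigInteger;
import java.util.Random;

public class BigIntContractSketch {
    private static final long LONG_MASK = 0xffffffffL;

    // mulAdd contract: add in[0..len-1] * k into out[], where offset counts
    // 32-bit words back from the most significant end of out. The returned
    // carry word is the value the stub must leave in rax.
    static int mulAdd(int[] out, int[] in, int offset, int len, int k) {
        long kLong = k & LONG_MASK;
        long carry = 0;
        offset = out.length - offset - 1; // index of least significant word written
        for (int j = len - 1; j >= 0; j--) {
            long product = (in[j] & LONG_MASK) * kLong
                         + (out[offset] & LONG_MASK) + carry;
            out[offset--] = (int) product;
            carry = product >>> 32;
        }
        return (int) carry;
    }

    // Propagate a one-word carry up through a[], as add_one_64() does for
    // 64-bit words. Returns the carry out of the top of the array.
    static int addOne(int[] a, int offset, int mlen, int carry) {
        offset = a.length - 1 - mlen - offset;
        long t = (a[offset] & LONG_MASK) + (carry & LONG_MASK);
        a[offset] = (int) t;
        if ((t >>> 32) == 0) return 0;
        while (--mlen >= 0) {
            if (--offset < 0) return 1; // carry out of the whole number
            a[offset]++;
            if (a[offset] != 0) return 0;
        }
        return 1;
    }

    // In-place left shift by n < 32 bits, as lshift_by_1() does for n == 1.
    static void primitiveLeftShift(int[] a, int len, int n) {
        if (len == 0 || n == 0) return;
        int n2 = 32 - n;
        for (int i = 0, c = a[i], m = i + len - 1; i < m; i++) {
            int b = c;
            c = a[i + 1];
            a[i] = (b << n) | (c >>> n2);
        }
        a[len - 1] <<= n;
    }

    // squareToLen contract: z[0..2*len-1] = x[0..len-1]^2, big-endian words.
    static int[] squareToLen(int[] x, int len, int[] z) {
        int zlen = len << 1;

        // First loop: store the squares, right-shifted one bit.
        int lastProductLowWord = 0;
        for (int j = 0, i = 0; j < len; j++) {
            long piece = x[j] & LONG_MASK;
            long product = piece * piece;
            z[i++] = (lastProductLowWord << 31) | (int) (product >>> 33);
            z[i++] = (int) (product >>> 1);
            lastProductLowWord = (int) product;
        }

        // Second through fourth loops: add in the off-diagonal sums.
        for (int i = len, offset = 1; i > 0; i--, offset += 2) {
            int t = x[i - 1];
            t = mulAdd(z, x, offset, i - 1, t);
            addOne(z, offset - 1, i, t);
        }

        // Fifth loop: shift back up and set the low bit.
        primitiveLeftShift(z, zlen, 1);
        z[zlen - 1] |= x[len - 1] & 1;
        return z;
    }

    // Cross-check squareToLen against BigInteger.multiply on a random input.
    public static void main(String[] args) {
        Random rnd = new Random(42);
        int len = 8;
        int[] x = new int[len];
        for (int i = 0; i < len; i++) x[i] = rnd.nextInt();
        int[] z = squareToLen(x, len, new int[2 * len]);

        BigInteger bx = toBig(x);
        if (!toBig(z).equals(bx.multiply(bx))) throw new AssertionError("mismatch");
        System.out.println("squareToLen matches BigInteger.multiply");
    }

    // Interpret big-endian 32-bit words as a non-negative BigInteger.
    static BigInteger toBig(int[] words) {
        byte[] bytes = new byte[words.length * 4 + 1]; // leading zero byte keeps it positive
        for (int i = 0; i < words.length; i++) {
            bytes[4 * i + 1] = (byte) (words[i] >>> 24);
            bytes[4 * i + 2] = (byte) (words[i] >>> 16);
            bytes[4 * i + 3] = (byte) (words[i] >>> 8);
            bytes[4 * i + 4] = (byte) words[i];
        }
        return new BigInteger(bytes);
    }
}

The right-shifted first loop is what makes the final fixup work: x^2 equals twice the off-diagonal products plus the diagonal squares, so the code accumulates (squares >> 1) plus the off-diagonal products, doubles the whole magnitude once at the end, and re-sets bit 0, which is simply the low bit of x[len-1] since a square is odd exactly when its root is odd. This is why the stub code above pairs square_rshift with lshift_by_1 and the final orl of x[len-1] & 1.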