diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp
index 342192775236b64b89c78f06f7d6fc7008dace92..8098e889ba99142364f55aa7cb6e2a726a398e20 100644
--- a/src/cpu/x86/vm/assembler_x86.cpp
+++ b/src/cpu/x86/vm/assembler_x86.cpp
@@ -4937,6 +4937,26 @@ void Assembler::addq(Register dst, Register src) {
   emit_arith(0x03, 0xC0, dst, src);
 }
 
+void Assembler::adcxq(Register dst, Register src) {
+  //assert(VM_Version::supports_adx(), "adx instructions not supported");
+  emit_int8((unsigned char)0x66);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_int8(0x0F);
+  emit_int8(0x38);
+  emit_int8((unsigned char)0xF6);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::adoxq(Register dst, Register src) {
+  //assert(VM_Version::supports_adx(), "adx instructions not supported");
+  emit_int8((unsigned char)0xF3);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_int8(0x0F);
+  emit_int8(0x38);
+  emit_int8((unsigned char)0xF6);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::andq(Address dst, int32_t imm32) {
   InstructionMark im(this);
   prefixq(dst);
@@ -5444,6 +5464,26 @@ void Assembler::movzwq(Register dst, Register src) {
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::mulq(Address src) {
+  InstructionMark im(this);
+  prefixq(src);
+  emit_int8((unsigned char)0xF7);
+  emit_operand(rsp, src);
+}
+
+void Assembler::mulq(Register src) {
+  int encode = prefixq_and_encode(src->encoding());
+  emit_int8((unsigned char)0xF7);
+  emit_int8((unsigned char)(0xE0 | encode));
+}
+
+void Assembler::mulxq(Register dst1, Register dst2, Register src) {
+  assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, true, false);
+  emit_int8((unsigned char)0xF6);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::negq(Register dst) {
   int encode = prefixq_and_encode(dst->encoding());
   emit_int8((unsigned char)0xF7);
@@ -5572,6 +5612,28 @@ void Assembler::rclq(Register dst, int imm8) {
     emit_int8(imm8);
   }
 }
+
+void Assembler::rorq(Register dst, int imm8) {
+  assert(isShiftCount(imm8 >> 1), "illegal shift count");
+  int encode = prefixq_and_encode(dst->encoding());
+  if (imm8 == 1) {
+    emit_int8((unsigned char)0xD1);
+    emit_int8((unsigned char)(0xC8 | encode));
+  } else {
+    emit_int8((unsigned char)0xC1);
+    emit_int8((unsigned char)(0xc8 | encode));
+    emit_int8(imm8);
+  }
+}
+
+void Assembler::rorxq(Register dst, Register src, int imm8) {
+  assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A, true, false);
+  emit_int8((unsigned char)0xF0);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(imm8);
+}
+
 void Assembler::sarq(Register dst, int imm8) {
   assert(isShiftCount(imm8 >> 1), "illegal shift count");
   int encode = prefixq_and_encode(dst->encoding());
diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp
index 8edf31cada80cad7b01f38a952e364ac09232a92..2ac9df8c92758ca3d9de300a376f7aef70141c92 100644
--- a/src/cpu/x86/vm/assembler_x86.hpp
+++ b/src/cpu/x86/vm/assembler_x86.hpp
@@ -888,6 +888,14 @@ private:
   void addq(Register dst, Address src);
   void addq(Register dst, Register src);
 
+#ifdef _LP64
+  //Add Unsigned Integers with Carry Flag
+  void adcxq(Register dst, Register src);
+
+  //Add Unsigned Integers with Overflow Flag
+  void adoxq(Register dst, Register src);
+#endif
+
   void addr_nop_4();
   void addr_nop_5();
   void addr_nop_7();
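Reviewer note, not part of the patch: ADCX and ADOX both perform a 64-bit unsigned add with a one-bit carry in and carry out; they differ only in which flag carries the bit (CF for ADCX, OF for ADOX), so two independent carry chains can be interleaved without saving and restoring flags. A minimal host-side reference model, assuming a GCC/Clang-style unsigned __int128:

    #include <cstdint>

    // Reference model of the operation the adcxq/adoxq encodings above emit:
    // a 64-bit unsigned add with a 1-bit carry in and carry out.
    static inline uint64_t add_with_carry(uint64_t a, uint64_t b,
                                          unsigned& carry /* in/out: 0 or 1 */) {
      unsigned __int128 sum = (unsigned __int128)a + b + carry;
      carry = (unsigned)(sum >> 64);  // carry out
      return (uint64_t)sum;           // low 64 bits of the sum
    }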
@@ -1204,19 +1212,20 @@ private:
   void idivl(Register src);
   void divl(Register src); // Unsigned division
 
+#ifdef _LP64
   void idivq(Register src);
+#endif
 
   void imull(Register dst, Register src);
   void imull(Register dst, Register src, int value);
   void imull(Register dst, Address src);
 
+#ifdef _LP64
   void imulq(Register dst, Register src);
   void imulq(Register dst, Register src, int value);
-#ifdef _LP64
   void imulq(Register dst, Address src);
 #endif
-
   // jcc is the generic conditional branch generator to run-
   // time routines, jcc is used for branches to labels.  jcc
   // takes a branch opcode (cc) and a label (L) and generates
@@ -1408,9 +1417,16 @@ private:
   void movzwq(Register dst, Register src);
 #endif
 
+  // Unsigned multiply with RAX destination register
   void mull(Address src);
   void mull(Register src);
 
+#ifdef _LP64
+  void mulq(Address src);
+  void mulq(Register src);
+  void mulxq(Register dst1, Register dst2, Register src);
+#endif
+
   // Multiply Scalar Double-Precision Floating-Point Values
   void mulsd(XMMRegister dst, Address src);
   void mulsd(XMMRegister dst, XMMRegister src);
@@ -1541,6 +1557,11 @@ private:
   void ret(int imm16);
 
+#ifdef _LP64
+  void rorq(Register dst, int imm8);
+  void rorxq(Register dst, Register src, int imm8);
+#endif
+
   void sahf();
 
   void sarl(Register dst, int imm8);
diff --git a/src/cpu/x86/vm/globals_x86.hpp b/src/cpu/x86/vm/globals_x86.hpp
index 5b34293ea6ab3714b7de8f6e499a6352a4ed614f..1401997b3a1eeb9d51f54f706eb1fba4e6f08929 100644
--- a/src/cpu/x86/vm/globals_x86.hpp
+++ b/src/cpu/x86/vm/globals_x86.hpp
@@ -176,6 +176,8 @@ define_pd_global(uintx, TypeProfileLevel, 111);
           "Use count trailing zeros instruction")                          \
                                                                            \
   product(bool, UseBMI1Instructions, false,                                \
-          "Use BMI instructions")
-
+          "Use BMI1 instructions")                                         \
+                                                                           \
+  product(bool, UseBMI2Instructions, false,                                \
+          "Use BMI2 instructions")
 
 #endif // CPU_X86_VM_GLOBALS_X86_HPP
diff --git a/src/cpu/x86/vm/macroAssembler_x86.cpp b/src/cpu/x86/vm/macroAssembler_x86.cpp
index 7216c1980278b974caf94160d3e4d1a3131f2a65..592b45c9fd0c5f33b4b3193c3266c5dc7fdd067d 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp
@@ -7293,6 +7293,467 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
   bind(L_done);
 }
 
+#ifdef _LP64
+/**
+ * Helper for multiply_to_len().
+ */
+void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
+  addq(dest_lo, src1);
+  adcq(dest_hi, 0);
+  addq(dest_lo, src2);
+  adcq(dest_hi, 0);
+}
+
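Reviewer note, not part of the patch: a reference model of the add2_with_carry() helper above. The 128-bit value dest_hi:dest_lo accumulates two further 64-bit addends, exactly matching the two addq/adcq pairs (sketch assumes a compiler with unsigned __int128):

    #include <cstdint>

    // dest_hi:dest_lo += src1; dest_hi:dest_lo += src2 (128-bit arithmetic).
    static void add2_with_carry_model(uint64_t& dest_hi, uint64_t& dest_lo,
                                      uint64_t src1, uint64_t src2) {
      unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
      acc += src1;
      acc += src2;
      dest_lo = (uint64_t)acc;
      dest_hi = (uint64_t)(acc >> 64);
    }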
+/**
+ * Multiply 64 bit by 64 bit first loop.
+ */
+void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                                           Register y, Register y_idx, Register z,
+                                           Register carry, Register product,
+                                           Register idx, Register kdx) {
+  //
+  //  jlong carry, x[], y[], z[];
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    huge_128 product = y[idx] * x[xstart] + carry;
+  //    z[kdx] = (jlong)product;
+  //    carry  = (jlong)(product >>> 64);
+  //  }
+  //  z[xstart] = carry;
+  //
+
+  Label L_first_loop, L_first_loop_exit;
+  Label L_one_x, L_one_y, L_multiply;
+
+  decrementl(xstart);
+  jcc(Assembler::negative, L_one_x);
+
+  movq(x_xstart, Address(x, xstart, Address::times_4, 0));
+  rorq(x_xstart, 32); // convert big-endian to little-endian
+
+  bind(L_first_loop);
+  decrementl(idx);
+  jcc(Assembler::negative, L_first_loop_exit);
+  decrementl(idx);
+  jcc(Assembler::negative, L_one_y);
+  movq(y_idx, Address(y, idx, Address::times_4, 0));
+  rorq(y_idx, 32); // convert big-endian to little-endian
+  bind(L_multiply);
+  movq(product, x_xstart);
+  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
+  addq(product, carry);
+  adcq(rdx, 0);
+  subl(kdx, 2);
+  movl(Address(z, kdx, Address::times_4, 4), product);
+  shrq(product, 32);
+  movl(Address(z, kdx, Address::times_4, 0), product);
+  movq(carry, rdx);
+  jmp(L_first_loop);
+
+  bind(L_one_y);
+  movl(y_idx, Address(y, 0));
+  jmp(L_multiply);
+
+  bind(L_one_x);
+  movl(x_xstart, Address(x, 0));
+  jmp(L_first_loop);
+
+  bind(L_first_loop_exit);
+}
+
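Reviewer note, not part of the patch: the loop above restated in C over little-endian 64-bit limbs. The stub additionally byte-swaps each big-endian pair of jints via rorq and handles odd-length tails (L_one_x/L_one_y), both omitted here; names are illustrative.

    #include <cstdint>

    // z must have room for xstart + ystart + 2 limbs.
    static void multiply_64_x_64_model(const uint64_t* x, int xstart,
                                       const uint64_t* y, int ystart,
                                       uint64_t* z) {
      uint64_t carry = 0;
      int kdx = ystart + 1 + xstart;
      for (int idx = ystart; idx >= 0; idx--, kdx--) {
        unsigned __int128 product = (unsigned __int128)y[idx] * x[xstart] + carry;
        z[kdx] = (uint64_t)product;          // low 64 bits
        carry  = (uint64_t)(product >> 64);  // high 64 bits
      }
      z[xstart] = carry;
    }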
+/**
+ * Multiply 64 bit by 64 bit and add 128 bit.
+ */
+void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+                                            Register yz_idx, Register idx,
+                                            Register carry, Register product, int offset) {
+  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
+  //     z[kdx] = (jlong)product;
+
+  movq(yz_idx, Address(y, idx, Address::times_4, offset));
+  rorq(yz_idx, 32); // convert big-endian to little-endian
+  movq(product, x_xstart);
+  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
+  movq(yz_idx, Address(z, idx, Address::times_4, offset));
+  rorq(yz_idx, 32); // convert big-endian to little-endian
+
+  add2_with_carry(rdx, product, carry, yz_idx);
+
+  movl(Address(z, idx, Address::times_4, offset+4), product);
+  shrq(product, 32);
+  movl(Address(z, idx, Address::times_4, offset), product);
+}
+
+/**
+ * Multiply 128 bit by 128 bit. Unrolled inner loop.
+ */
+void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
+                                             Register yz_idx, Register idx, Register jdx,
+                                             Register carry, Register product,
+                                             Register carry2) {
+  //   jlong carry, x[], y[], z[];
+  //   int kdx = ystart+1;
+  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
+  //     z[kdx+idx+1] = (jlong)product;
+  //     jlong carry2  = (jlong)(product >>> 64);
+  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
+  //     z[kdx+idx] = (jlong)product;
+  //     carry  = (jlong)(product >>> 64);
+  //   }
+  //   idx += 2;
+  //   if (idx > 0) {
+  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
+  //     z[kdx+idx] = (jlong)product;
+  //     carry  = (jlong)(product >>> 64);
+  //   }
+  //
+
+  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+  movl(jdx, idx);
+  andl(jdx, 0xFFFFFFFC);
+  shrl(jdx, 2);
+
+  bind(L_third_loop);
+  subl(jdx, 1);
+  jcc(Assembler::negative, L_third_loop_exit);
+  subl(idx, 4);
+
+  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
+  movq(carry2, rdx);
+
+  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
+  movq(carry, rdx);
+  jmp(L_third_loop);
+
+  bind(L_third_loop_exit);
+
+  andl(idx, 0x3);
+  jcc(Assembler::zero, L_post_third_loop_done);
+
+  Label L_check_1;
+  subl(idx, 2);
+  jcc(Assembler::negative, L_check_1);
+
+  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
+  movq(carry, rdx);
+
+  bind(L_check_1);
+  addl(idx, 0x2);
+  andl(idx, 0x1);
+  subl(idx, 1);
+  jcc(Assembler::negative, L_post_third_loop_done);
+
+  movl(yz_idx, Address(y, idx, Address::times_4, 0));
+  movq(product, x_xstart);
+  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
+  movl(yz_idx, Address(z, idx, Address::times_4, 0));
+
+  add2_with_carry(rdx, product, yz_idx, carry);
+
+  movl(Address(z, idx, Address::times_4, 0), product);
+  shrq(product, 32);
+
+  shlq(rdx, 32);
+  orq(product, rdx);
+  movq(carry, product);
+
+  bind(L_post_third_loop_done);
+}
+
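Reviewer note, not part of the patch: one unrolled step of the third loop in C, showing how carry2 chains the high half of the first product into the second multiply-accumulate (assumes unsigned __int128; names illustrative):

    #include <cstdint>

    static void third_loop_step_model(uint64_t x_xstart, const uint64_t* y,
                                      uint64_t* z, int kdx, int idx,
                                      uint64_t& carry) {
      unsigned __int128 p = (unsigned __int128)y[idx + 1] * x_xstart
                          + z[kdx + idx + 1] + carry;
      z[kdx + idx + 1] = (uint64_t)p;
      uint64_t carry2 = (uint64_t)(p >> 64);
      p = (unsigned __int128)y[idx] * x_xstart + z[kdx + idx] + carry2;
      z[kdx + idx] = (uint64_t)p;
      carry = (uint64_t)(p >> 64);
    }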
+/**
+ * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
+ */
+void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
+                                                  Register carry, Register carry2,
+                                                  Register idx, Register jdx,
+                                                  Register yz_idx1, Register yz_idx2,
+                                                  Register tmp, Register tmp3, Register tmp4) {
+  assert(UseBMI2Instructions, "should be used only when BMI2 is available");
+
+  //   jlong carry, x[], y[], z[];
+  //   int kdx = ystart+1;
+  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
+  //     jlong carry2  = (jlong)(tmp3 >>> 64);
+  //     huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
+  //     carry  = (jlong)(tmp4 >>> 64);
+  //     z[kdx+idx+1] = (jlong)tmp3;
+  //     z[kdx+idx] = (jlong)tmp4;
+  //   }
+  //   idx += 2;
+  //   if (idx > 0) {
+  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
+  //     z[kdx+idx] = (jlong)yz_idx1;
+  //     carry  = (jlong)(yz_idx1 >>> 64);
+  //   }
+  //
+
+  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+  movl(jdx, idx);
+  andl(jdx, 0xFFFFFFFC);
+  shrl(jdx, 2);
+
+  bind(L_third_loop);
+  subl(jdx, 1);
+  jcc(Assembler::negative, L_third_loop_exit);
+  subl(idx, 4);
+
+  movq(yz_idx1, Address(y, idx, Address::times_4, 8));
+  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+  movq(yz_idx2, Address(y, idx, Address::times_4, 0));
+  rorxq(yz_idx2, yz_idx2, 32);
+
+  mulxq(tmp4, tmp3, yz_idx1);  // yz_idx1 * rdx -> tmp4:tmp3
+  mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
+
+  movq(yz_idx1, Address(z, idx, Address::times_4, 8));
+  rorxq(yz_idx1, yz_idx1, 32);
+  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
+  rorxq(yz_idx2, yz_idx2, 32);
+
+  if (VM_Version::supports_adx()) {
+    adcxq(tmp3, carry);
+    adoxq(tmp3, yz_idx1);
+
+    adcxq(tmp4, tmp);
+    adoxq(tmp4, yz_idx2);
+
+    movl(carry, 0); // does not affect flags
+    adcxq(carry2, carry);
+    adoxq(carry2, carry);
+  } else {
+    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
+    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
+  }
+  movq(carry, carry2);
+
+  movl(Address(z, idx, Address::times_4, 12), tmp3);
+  shrq(tmp3, 32);
+  movl(Address(z, idx, Address::times_4, 8), tmp3);
+
+  movl(Address(z, idx, Address::times_4, 4), tmp4);
+  shrq(tmp4, 32);
+  movl(Address(z, idx, Address::times_4, 0), tmp4);
+
+  jmp(L_third_loop);
+
+  bind(L_third_loop_exit);
+
+  andl(idx, 0x3);
+  jcc(Assembler::zero, L_post_third_loop_done);
+
+  Label L_check_1;
+  subl(idx, 2);
+  jcc(Assembler::negative, L_check_1);
+
+  movq(yz_idx1, Address(y, idx, Address::times_4, 0));
+  rorxq(yz_idx1, yz_idx1, 32);
+  mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
+  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
+  rorxq(yz_idx2, yz_idx2, 32);
+
+  add2_with_carry(tmp4, tmp3, carry, yz_idx2);
+
+  movl(Address(z, idx, Address::times_4, 4), tmp3);
+  shrq(tmp3, 32);
+  movl(Address(z, idx, Address::times_4, 0), tmp3);
+  movq(carry, tmp4);
+
+  bind(L_check_1);
+  addl(idx, 0x2);
+  andl(idx, 0x1);
+  subl(idx, 1);
+  jcc(Assembler::negative, L_post_third_loop_done);
+  movl(tmp4, Address(y, idx, Address::times_4, 0));
+  mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
+  movl(tmp4, Address(z, idx, Address::times_4, 0));
+
+  add2_with_carry(carry2, tmp3, tmp4, carry);
+
+  movl(Address(z, idx, Address::times_4, 0), tmp3);
+  shrq(tmp3, 32);
+
+  shlq(carry2, 32);
+  orq(tmp3, carry2);
+  movq(carry, tmp3);
+
+  bind(L_post_third_loop_done);
+}
+
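Reviewer note, not part of the patch: MULX multiplies the implicit rdx source and writes both product halves without touching any flags, which is what keeps the ADCX (CF) and ADOX (OF) carry chains in the supports_adx() branch alive across the multiplies. Reference model (assumes unsigned __int128):

    #include <cstdint>

    // hi:lo = rdx_src * src, flags untouched.
    static inline void mulx_model(uint64_t rdx_src, uint64_t src,
                                  uint64_t& hi, uint64_t& lo) {
      unsigned __int128 p = (unsigned __int128)rdx_src * src;
      hi = (uint64_t)(p >> 64);
      lo = (uint64_t)p;
    }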
+/**
+ * Code for BigInteger::multiplyToLen() intrinsic.
+ *
+ * rdi: x
+ * rax: xlen
+ * rsi: y
+ * rcx: ylen
+ * r8:  z
+ * r11: zlen
+ * r12: tmp1
+ * r13: tmp2
+ * r14: tmp3
+ * r15: tmp4
+ * rbx: tmp5
+ */
+void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
+                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
+  ShortBranchVerifier sbv(this);
+  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
+
+  push(tmp1);
+  push(tmp2);
+  push(tmp3);
+  push(tmp4);
+  push(tmp5);
+
+  push(xlen);
+  push(zlen);
+
+  const Register idx = tmp1;
+  const Register kdx = tmp2;
+  const Register xstart = tmp3;
+
+  const Register y_idx = tmp4;
+  const Register carry = tmp5;
+  const Register product = xlen;
+  const Register x_xstart = zlen; // reuse register
+
+  // First Loop.
+  //
+  //  final static long LONG_MASK = 0xffffffffL;
+  //  int xstart = xlen - 1;
+  //  int ystart = ylen - 1;
+  //  long carry = 0;
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+  //    z[kdx] = (int)product;
+  //    carry = product >>> 32;
+  //  }
+  //  z[xstart] = (int)carry;
+  //
+
+  movl(idx, ylen);    // idx = ylen;
+  movl(kdx, zlen);    // kdx = xlen+ylen;
+  xorq(carry, carry); // carry = 0;
+
+  Label L_done;
+
+  movl(xstart, xlen);
+  decrementl(xstart);
+  jcc(Assembler::negative, L_done);
+
+  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
+
+  Label L_second_loop;
+  testl(kdx, kdx);
+  jcc(Assembler::zero, L_second_loop);
+
+  Label L_carry;
+  subl(kdx, 1);
+  jcc(Assembler::zero, L_carry);
+
+  movl(Address(z, kdx, Address::times_4, 0), carry);
+  shrq(carry, 32);
+  subl(kdx, 1);
+
+  bind(L_carry);
+  movl(Address(z, kdx, Address::times_4, 0), carry);
+
+  // Second and third (nested) loops.
+  //
+  // for (int i = xstart-1; i >= 0; i--) { // Second loop
+  //   carry = 0;
+  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+  //                    (z[k] & LONG_MASK) + carry;
+  //     z[k] = (int)product;
+  //     carry = product >>> 32;
+  //   }
+  //   z[i] = (int)carry;
+  // }
+  //
+  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
+
+  const Register jdx = tmp1;
+
+  bind(L_second_loop);
+  xorl(carry, carry); // carry = 0;
+  movl(jdx, ylen);    // j = ystart+1
+
+  subl(xstart, 1);    // i = xstart-1;
+  jcc(Assembler::negative, L_done);
+
+  push(z);
+
+  Label L_last_x;
+  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
+  subl(xstart, 1);    // i = xstart-1;
+  jcc(Assembler::negative, L_last_x);
+
+  if (UseBMI2Instructions) {
+    movq(rdx, Address(x, xstart, Address::times_4, 0));
+    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
+  } else {
+    movq(x_xstart, Address(x, xstart, Address::times_4, 0));
+    rorq(x_xstart, 32);  // convert big-endian to little-endian
+  }
+
+  Label L_third_loop_prologue;
+  bind(L_third_loop_prologue);
+
+  push(x);
+  push(xstart);
+  push(ylen);
+
+  if (UseBMI2Instructions) {
+    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
+  } else { // !UseBMI2Instructions
+    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
+  }
+
+  pop(ylen);
+  pop(xlen);
+  pop(x);
+  pop(z);
+
+  movl(tmp3, xlen);
+  addl(tmp3, 1);
+  movl(Address(z, tmp3, Address::times_4, 0), carry);
+  subl(tmp3, 1);
+  jccb(Assembler::negative, L_done);
+
+  shrq(carry, 32);
+  movl(Address(z, tmp3, Address::times_4, 0), carry);
+  jmp(L_second_loop);
+
+  // Next infrequent code is moved outside loops.
+  bind(L_last_x);
+  if (UseBMI2Instructions) {
+    movl(rdx, Address(x, 0));
+  } else {
+    movl(x_xstart, Address(x, 0));
+  }
+  jmp(L_third_loop_prologue);
+
+  bind(L_done);
+
+  pop(zlen);
+  pop(xlen);
+
+  pop(tmp5);
+  pop(tmp4);
+  pop(tmp3);
+  pop(tmp2);
+  pop(tmp1);
+}
+#endif
+
 /**
  * Emits code to update CRC-32 with a byte value according to constants in table
  *
diff --git a/src/cpu/x86/vm/macroAssembler_x86.hpp b/src/cpu/x86/vm/macroAssembler_x86.hpp
index 3b3073e633a3b2c0a197444b32b593d0484fef85..69c9e8aa360b75874aa69360c705f57db13bea5d 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp
@@ -1221,6 +1221,28 @@ public:
                          XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
                          XMMRegister tmp4, Register tmp5, Register result);
 
+#ifdef _LP64
+  void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
+  void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                             Register y, Register y_idx, Register z,
+                             Register carry, Register product,
+                             Register idx, Register kdx);
+  void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+                              Register yz_idx, Register idx,
+                              Register carry, Register product, int offset);
+  void multiply_128_x_128_bmi2_loop(Register y, Register z,
+                                    Register carry, Register carry2,
+                                    Register idx, Register jdx,
+                                    Register yz_idx1, Register yz_idx2,
+                                    Register tmp, Register tmp3, Register tmp4);
+  void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
+                               Register yz_idx, Register idx, Register jdx,
+                               Register carry, Register product,
+                               Register carry2);
+  void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
+                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
+#endif
+
   // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
   void update_byte_crc32(Register crc, Register val, Register table);
   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 0adb0d31e72e9eb451fbadc51f506434209a0a1b..0000146f53563d75940e6f16c06f500aa2552f19 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -3677,6 +3677,70 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+
+  /**
+   * Arguments:
+   *
+   * Input:
+   *   c_rarg0   - x address
+   *   c_rarg1   - x length
+   *   c_rarg2   - y address
+   *   c_rarg3   - y length
+   * not Win64
+   *   c_rarg4   - z address
+   *   c_rarg5   - z length
+   * Win64
+   *   rsp+40    - z address
+   *   rsp+48    - z length
+   */
+  address generate_multiplyToLen() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+
+    address start = __ pc();
+    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+    // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+    const Register x     = rdi;
+    const Register xlen  = rax;
+    const Register y     = rsi;
+    const Register ylen  = rcx;
+    const Register z     = r8;
+    const Register zlen  = r11;
+
+    // Next registers will be saved on stack in multiply_to_len().
+    const Register tmp1  = r12;
+    const Register tmp2  = r13;
+    const Register tmp3  = r14;
+    const Register tmp4  = r15;
+    const Register tmp5  = rbx;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifndef _WIN64
+    __ movptr(zlen, r9); // Save r9 in r11 - zlen
+#endif
+    setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
+                       // ylen => rcx, z => r8, zlen => r11
+                       // r9 and r10 may be used to save non-volatile registers
+#ifdef _WIN64
+    // last 2 arguments (#4, #5) are on stack on Win64
+    __ movptr(z, Address(rsp, 6 * wordSize));
+    __ movptr(zlen, Address(rsp, 7 * wordSize));
+#endif
+
+    __ movptr(xlen, rsi);
+    __ movptr(y, rdx);
+    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
+
+    restore_arg_regs();
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
 #undef __
 #define __ masm->
 
@@ -3917,6 +3981,11 @@ class StubGenerator: public StubCodeGenerator {
     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);
+#ifdef COMPILER2
+    if (UseMultiplyToLenIntrinsic) {
+      StubRoutines::_multiplyToLen = generate_multiplyToLen();
+    }
+#endif
   }
 
  public:
diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp
index 49be2cad609d6f926d5e4a55ba8d9305da771b40..62cb56d3af33e7ec71c7189c13d6e6deaba669d8 100644
--- a/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/src/cpu/x86/vm/vm_version_x86.cpp
@@ -493,7 +493,7 @@ void VM_Version::get_processor_features() {
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -522,7 +522,8 @@ void VM_Version::get_processor_features() {
", tscinvbit": ""), (supports_tscinv() ? ", tscinv": ""), (supports_bmi1() ? ", bmi1" : ""), - (supports_bmi2() ? ", bmi2" : "")); + (supports_bmi2() ? ", bmi2" : ""), + (supports_adx() ? ", adx" : "")); _features_str = strdup(buf); // UseSSE is set to the smaller of what hardware supports and what @@ -574,7 +575,7 @@ void VM_Version::get_processor_features() { } } else if (UseCRC32Intrinsics) { if (!FLAG_IS_DEFAULT(UseCRC32Intrinsics)) - warning("CRC32 Intrinsics requires AVX and CLMUL instructions (not available on this CPU)"); + warning("CRC32 Intrinsics requires CLMUL instructions (not available on this CPU)"); FLAG_SET_DEFAULT(UseCRC32Intrinsics, false); } @@ -697,7 +698,20 @@ void VM_Version::get_processor_features() { } #endif } + +#ifdef _LP64 + if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { + UseMultiplyToLenIntrinsic = true; + } +#else + if (UseMultiplyToLenIntrinsic) { + if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { + warning("multiplyToLen intrinsic is not available in 32-bit VM"); + } + FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false); + } #endif +#endif // COMPILER2 // On new cpus instructions which update whole XMM register should be used // to prevent partial register stall due to dependencies on high half. @@ -840,6 +854,9 @@ void VM_Version::get_processor_features() { } } } + if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) { + AllocatePrefetchInstr = 3; + } } // Use count leading zeros count instruction if available. @@ -852,23 +869,35 @@ void VM_Version::get_processor_features() { FLAG_SET_DEFAULT(UseCountLeadingZerosInstruction, false); } + // Use count trailing zeros instruction if available if (supports_bmi1()) { + // tzcnt does not require VEX prefix + if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) { + UseCountTrailingZerosInstruction = true; + } + } else if (UseCountTrailingZerosInstruction) { + warning("tzcnt instruction is not available on this CPU"); + FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false); + } + + // BMI instructions use an encoding with VEX prefix. + // VEX prefix is generated only when AVX > 0. + if (supports_bmi1() && supports_avx()) { if (FLAG_IS_DEFAULT(UseBMI1Instructions)) { UseBMI1Instructions = true; } } else if (UseBMI1Instructions) { - warning("BMI1 instructions are not available on this CPU"); + warning("BMI1 instructions are not available on this CPU (AVX is also required)"); FLAG_SET_DEFAULT(UseBMI1Instructions, false); } - // Use count trailing zeros instruction if available - if (supports_bmi1()) { - if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) { - UseCountTrailingZerosInstruction = UseBMI1Instructions; + if (supports_bmi2() && supports_avx()) { + if (FLAG_IS_DEFAULT(UseBMI2Instructions)) { + UseBMI2Instructions = true; } - } else if (UseCountTrailingZerosInstruction) { - warning("tzcnt instruction is not available on this CPU"); - FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false); + } else if (UseBMI2Instructions) { + warning("BMI2 instructions are not available on this CPU (AVX is also required)"); + FLAG_SET_DEFAULT(UseBMI2Instructions, false); } // Use population count instruction if available. 
diff --git a/src/cpu/x86/vm/vm_version_x86.hpp b/src/cpu/x86/vm/vm_version_x86.hpp
index cce93bc9dfed96306ef845e1b0dcce35f6802899..1ad94e38bef73cb7d991d5680be5e3c2bcb57121 100644
--- a/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/src/cpu/x86/vm/vm_version_x86.hpp
@@ -209,7 +209,9 @@ public:
                erms : 1,
                     : 1,
                 rtm : 1,
-                    : 20;
+                    : 7,
+                adx : 1,
+                    : 12;
     } bits;
   };
 
@@ -260,7 +262,8 @@ protected:
     CPU_CLMUL = (1 << 21), // carryless multiply for CRC
     CPU_BMI1  = (1 << 22),
     CPU_BMI2  = (1 << 23),
-    CPU_RTM   = (1 << 24)  // Restricted Transactional Memory instructions
+    CPU_RTM   = (1 << 24), // Restricted Transactional Memory instructions
+    CPU_ADX   = (1 << 25)
   } cpuFeatureFlags;
 
   enum {
@@ -465,10 +468,16 @@ protected:
     }
     // Intel features.
     if(is_intel()) {
+      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
+        result |= CPU_ADX;
       if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
         result |= CPU_BMI2;
       if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
         result |= CPU_LZCNT;
+      // for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
+      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
+        result |= CPU_3DNOW_PREFETCH;
+      }
     }
 
     return result;
@@ -621,6 +630,7 @@ public:
   static bool supports_rtm()        { return (_cpuFeatures & CPU_RTM) != 0; }
   static bool supports_bmi1()       { return (_cpuFeatures & CPU_BMI1) != 0; }
   static bool supports_bmi2()       { return (_cpuFeatures & CPU_BMI2) != 0; }
+  static bool supports_adx()        { return (_cpuFeatures & CPU_ADX) != 0; }
   // Intel features
   static bool is_intel_family_core() { return is_intel() && extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
diff --git a/src/share/vm/asm/register.hpp b/src/share/vm/asm/register.hpp
index d9918517dd9219ce684797693549c358375d70a3..7244b9f5dece7e6a4dddb6dd2c56260ab40c4111 100644
--- a/src/share/vm/asm/register.hpp
+++ b/src/share/vm/asm/register.hpp
@@ -275,4 +275,101 @@ inline void assert_different_registers(
   );
 }
 
+inline void assert_different_registers(
+  AbstractRegister a,
+  AbstractRegister b,
+  AbstractRegister c,
+  AbstractRegister d,
+  AbstractRegister e,
+  AbstractRegister f,
+  AbstractRegister g,
+  AbstractRegister h,
+  AbstractRegister i,
+  AbstractRegister j
+) {
+  assert(
+    a != b && a != c && a != d && a != e && a != f && a != g && a != h && a != i && a != j
+    && b != c && b != d && b != e && b != f && b != g && b != h && b != i && b != j
+    && c != d && c != e && c != f && c != g && c != h && c != i && c != j
+    && d != e && d != f && d != g && d != h && d != i && d != j
+    && e != f && e != g && e != h && e != i && e != j
+    && f != g && f != h && f != i && f != j
+    && g != h && g != i && g != j
+    && h != i && h != j
+    && i != j,
+    err_msg_res("registers must be different: a=" INTPTR_FORMAT ", b=" INTPTR_FORMAT
+                ", c=" INTPTR_FORMAT ", d=" INTPTR_FORMAT ", e=" INTPTR_FORMAT
+                ", f=" INTPTR_FORMAT ", g=" INTPTR_FORMAT ", h=" INTPTR_FORMAT
+                ", i=" INTPTR_FORMAT ", j=" INTPTR_FORMAT "",
+                p2i(a), p2i(b), p2i(c), p2i(d), p2i(e), p2i(f), p2i(g), p2i(h), p2i(i), p2i(j))
+  );
+}
+
+inline void assert_different_registers(
+  AbstractRegister a,
+  AbstractRegister b,
+  AbstractRegister c,
+  AbstractRegister d,
+  AbstractRegister e,
+  AbstractRegister f,
+  AbstractRegister g,
+  AbstractRegister h,
+  AbstractRegister i,
+  AbstractRegister j,
+  AbstractRegister k
+) {
+  assert(
+    a != b && a != c && a != d && a != e && a != f && a != g && a != h && a != i && a != j && a != k
+    && b != c && b != d && b != e && b != f && b != g && b != h && b != i && b != j && b != k
+    && c != d && c != e && c != f && c != g && c != h && c != i && c != j && c != k
+    && d != e && d != f && d != g && d != h && d != i && d != j && d != k
+    && e != f && e != g && e != h && e != i && e != j && e != k
+    && f != g && f != h && f != i && f != j && f != k
+    && g != h && g != i && g != j && g != k
+    && h != i && h != j && h != k
+    && i != j && i != k
+    && j != k,
+    err_msg_res("registers must be different: a=" INTPTR_FORMAT ", b=" INTPTR_FORMAT
+                ", c=" INTPTR_FORMAT ", d=" INTPTR_FORMAT ", e=" INTPTR_FORMAT
+                ", f=" INTPTR_FORMAT ", g=" INTPTR_FORMAT ", h=" INTPTR_FORMAT
+                ", i=" INTPTR_FORMAT ", j=" INTPTR_FORMAT ", k=" INTPTR_FORMAT "",
+                p2i(a), p2i(b), p2i(c), p2i(d), p2i(e), p2i(f), p2i(g), p2i(h), p2i(i), p2i(j), p2i(k))
+  );
+}
+
+inline void assert_different_registers(
+  AbstractRegister a,
+  AbstractRegister b,
+  AbstractRegister c,
+  AbstractRegister d,
+  AbstractRegister e,
+  AbstractRegister f,
+  AbstractRegister g,
+  AbstractRegister h,
+  AbstractRegister i,
+  AbstractRegister j,
+  AbstractRegister k,
+  AbstractRegister l
+) {
+  assert(
+    a != b && a != c && a != d && a != e && a != f && a != g && a != h && a != i && a != j && a != k && a != l
+    && b != c && b != d && b != e && b != f && b != g && b != h && b != i && b != j && b != k && b != l
+    && c != d && c != e && c != f && c != g && c != h && c != i && c != j && c != k && c != l
+    && d != e && d != f && d != g && d != h && d != i && d != j && d != k && d != l
+    && e != f && e != g && e != h && e != i && e != j && e != k && e != l
+    && f != g && f != h && f != i && f != j && f != k && f != l
+    && g != h && g != i && g != j && g != k && g != l
+    && h != i && h != j && h != k && h != l
+    && i != j && i != k && i != l
+    && j != k && j != l
+    && k != l,
+    err_msg_res("registers must be different: a=" INTPTR_FORMAT ", b=" INTPTR_FORMAT
+                ", c=" INTPTR_FORMAT ", d=" INTPTR_FORMAT ", e=" INTPTR_FORMAT
+                ", f=" INTPTR_FORMAT ", g=" INTPTR_FORMAT ", h=" INTPTR_FORMAT
+                ", i=" INTPTR_FORMAT ", j=" INTPTR_FORMAT ", k=" INTPTR_FORMAT
+                ", l=" INTPTR_FORMAT "",
+                p2i(a), p2i(b), p2i(c), p2i(d), p2i(e), p2i(f), p2i(g), p2i(h), p2i(i), p2i(j), p2i(k), p2i(l))
+  );
+}
+
 #endif // SHARE_VM_ASM_REGISTER_HPP
diff --git a/src/share/vm/classfile/vmSymbols.hpp b/src/share/vm/classfile/vmSymbols.hpp
index 536acc1f6d64b098bdfca837136ed6e667cd8cdb..168ad778798634d537199aac9a27436d5dbdfe9b 100644
--- a/src/share/vm/classfile/vmSymbols.hpp
+++ b/src/share/vm/classfile/vmSymbols.hpp
@@ -787,6 +787,11 @@
   do_name(     encodeISOArray_name,                             "encodeISOArray")                                       \
   do_signature(encodeISOArray_signature,                        "([CI[BII)I")                                           \
                                                                                                                         \
+  do_class(java_math_BigInteger,                                "java/math/BigInteger")                                 \
+  do_intrinsic(_multiplyToLen,    java_math_BigInteger, multiplyToLen_name, multiplyToLen_signature, F_R)               \
+  do_name(     multiplyToLen_name,                              "multiplyToLen")                                        \
+  do_signature(multiplyToLen_signature,                         "([II[II[I)[I")                                         \
+                                                                                                                        \
   /* java/lang/ref/Reference */                                                                                         \
   do_intrinsic(_Reference_get,  java_lang_ref_Reference, get_name, void_object_signature, F_R)                          \
                                                                                                                         \
diff --git a/src/share/vm/opto/c2_globals.hpp b/src/share/vm/opto/c2_globals.hpp
index 06155aa0b7cf54e5c5f6dc071ee4b195a28f5a05..54dff6912c37184ac50e0d96c276bdb2d9fa815d 100644
--- a/src/share/vm/opto/c2_globals.hpp
+++ b/src/share/vm/opto/c2_globals.hpp
@@ -653,6 +653,9 @@
   product(bool, UseMathExactIntrinsics, true,                               \
           "Enables intrinsification of various java.lang.Math functions")  \
                                                                            \
+  product(bool, UseMultiplyToLenIntrinsic, false,                          \
+          "Enables intrinsification of BigInteger.multiplyToLen()")        \
+                                                                           \
   product(bool, UseTypeSpeculation, true,                                  \
           "Speculatively propagate types from profiles")                   \
                                                                            \
diff --git a/src/share/vm/opto/escape.cpp b/src/share/vm/opto/escape.cpp
index 00d9e34e73480452682dc50e563d44d12a97e37a..867e96953e6e02ff90b68779cbe5f685f75d034d 100644
--- a/src/share/vm/opto/escape.cpp
+++ b/src/share/vm/opto/escape.cpp
@@ -944,7 +944,8 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                   strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 ||
-                  strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0)
+                  strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0)
                  ))) {
             call->dump();
             fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp
index 1f7140065221536475203082914554164d03c175..024edd269aa9b9021183e0733ce0c35ab73590ff 100644
--- a/src/share/vm/opto/library_call.cpp
+++ b/src/share/vm/opto/library_call.cpp
@@ -322,6 +322,7 @@ class LibraryCallKit : public GraphKit {
   bool inline_updateCRC32();
   bool inline_updateBytesCRC32();
   bool inline_updateByteBufferCRC32();
+  bool inline_multiplyToLen();
 };
 
 
@@ -330,8 +331,12 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
   vmIntrinsics::ID id = m->intrinsic_id();
   assert(id != vmIntrinsics::_none, "must be a VM intrinsic");
 
-  if (DisableIntrinsic[0] != '\0'
-      && strstr(DisableIntrinsic, vmIntrinsics::name_at(id)) != NULL) {
+  ccstr disable_intr = NULL;
+
+  if ((DisableIntrinsic[0] != '\0'
+       && strstr(DisableIntrinsic, vmIntrinsics::name_at(id)) != NULL) ||
+      (method_has_option_value("DisableIntrinsic", disable_intr)
+       && strstr(disable_intr, vmIntrinsics::name_at(id)) != NULL)) {
     // disabled by a user request on the command line:
    // example: -XX:DisableIntrinsic=_hashCode,_getClass
     return NULL;
@@ -515,6 +520,10 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
     if (!UseAESIntrinsics) return NULL;
     break;
 
+  case vmIntrinsics::_multiplyToLen:
+    if (!UseMultiplyToLenIntrinsic) return NULL;
+    break;
+
   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     if (!UseAESIntrinsics) return NULL;
@@ -912,6 +921,9 @@ bool LibraryCallKit::try_to_inline(int predicate) {
   case vmIntrinsics::_digestBase_implCompressMB:
     return inline_digestBase_implCompressMB(predicate);
 
+  case vmIntrinsics::_multiplyToLen:
+    return inline_multiplyToLen();
+
   case vmIntrinsics::_encodeISOArray:
     return inline_encodeISOArray();
 
@@ -5735,6 +5747,106 @@ bool LibraryCallKit::inline_encodeISOArray() {
   return true;
 }
 
top_y->klass() == NULL) { + // failed array check + return false; + } + + BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); + BasicType y_elem = y_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); + if (x_elem != T_INT || y_elem != T_INT) { + return false; + } + + // Set the original stack and the reexecute bit for the interpreter to reexecute + // the bytecode that invokes BigInteger.multiplyToLen() if deoptimization happens + // on the return from z array allocation in runtime. + { PreserveReexecuteState preexecs(this); + jvms()->set_should_reexecute(true); + + Node* x_start = array_element_address(x, intcon(0), x_elem); + Node* y_start = array_element_address(y, intcon(0), y_elem); + // 'x_start' points to x array + scaled xlen + // 'y_start' points to y array + scaled ylen + + // Allocate the result array + Node* zlen = _gvn.transform(new(C) AddINode(xlen, ylen)); + Node* klass_node = makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_INT))); + + IdealKit ideal(this); + +#define __ ideal. + Node* one = __ ConI(1); + Node* zero = __ ConI(0); + IdealVariable need_alloc(ideal), z_alloc(ideal); __ declarations_done(); + __ set(need_alloc, zero); + __ set(z_alloc, z); + __ if_then(z, BoolTest::eq, null()); { + __ increment (need_alloc, one); + } __ else_(); { + // Update graphKit memory and control from IdealKit. + sync_kit(ideal); + Node* zlen_arg = load_array_length(z); + // Update IdealKit memory and control from graphKit. + __ sync_kit(this); + __ if_then(zlen_arg, BoolTest::lt, zlen); { + __ increment (need_alloc, one); + } __ end_if(); + } __ end_if(); + + __ if_then(__ value(need_alloc), BoolTest::ne, zero); { + // Update graphKit memory and control from IdealKit. + sync_kit(ideal); + Node * narr = new_array(klass_node, zlen, 1); + // Update IdealKit memory and control from graphKit. + __ sync_kit(this); + __ set(z_alloc, narr); + } __ end_if(); + + sync_kit(ideal); + z = __ value(z_alloc); + _gvn.set_type(z, TypeAryPtr::INTS); + // Final sync IdealKit and GraphKit. + final_sync(ideal); +#undef __ + + Node* z_start = array_element_address(z, intcon(0), T_INT); + + Node* call = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::multiplyToLen_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + x_start, xlen, y_start, ylen, z_start, zlen); + } // original reexecute is set back here + + C->set_has_split_ifs(true); // Has chance for split-if optimization + set_result(z); + return true; +} + + /** * Calculate CRC32 for byte. 
+
 /**
  * Calculate CRC32 for byte.
  * int java.util.zip.CRC32.update(int crc, int b)
diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp
index 4b2da96aa8d25c86681d00fa3f5e62090a0f85cd..89b9997e221ecaa474dd14d70ffa6506b0da2643 100644
--- a/src/share/vm/opto/runtime.cpp
+++ b/src/share/vm/opto/runtime.cpp
@@ -942,6 +942,30 @@ const TypeFunc* OptoRuntime::digestBase_implCompressMB_Type() {
   return TypeFunc::make(domain, range);
 }
 
+const TypeFunc* OptoRuntime::multiplyToLen_Type() {
+  // create input type (domain)
+  int num_args = 6;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // x
+  fields[argp++] = TypeInt::INT;        // xlen
+  fields[argp++] = TypePtr::NOTNULL;    // y
+  fields[argp++] = TypeInt::INT;        // ylen
+  fields[argp++] = TypePtr::NOTNULL;    // z
+  fields[argp++] = TypeInt::INT;        // zlen
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // no result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = NULL;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
+
+
 //------------- Interpreter state access for on stack replacement
 const TypeFunc* OptoRuntime::osr_end_Type() {
   // create input type (domain)
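Reviewer note, not part of the patch: the TypeFunc above describes a six-argument leaf call with an empty range, i.e. effectively this C prototype (jint being HotSpot's 32-bit int; the result is written through z_start rather than returned):

    typedef int jint;  // HotSpot's 32-bit Java int

    extern "C" void multiplyToLen(jint* x_start, jint xlen,
                                  jint* y_start, jint ylen,
                                  jint* z_start, jint zlen);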
diff --git a/src/share/vm/opto/runtime.hpp b/src/share/vm/opto/runtime.hpp
index b8ad0105dfcc47e7b2a4dff48d8b7d9b0494668d..3258c36ea3abf540396fee8c70a613c7236128e9 100644
--- a/src/share/vm/opto/runtime.hpp
+++ b/src/share/vm/opto/runtime.hpp
@@ -303,6 +303,8 @@ private:
   static const TypeFunc* sha_implCompress_Type();
   static const TypeFunc* digestBase_implCompressMB_Type();
 
+  static const TypeFunc* multiplyToLen_Type();
+
   static const TypeFunc* updateBytesCRC32_Type();
 
   // leaf on stack replacement interpreter accessor types
diff --git a/src/share/vm/runtime/stubRoutines.cpp b/src/share/vm/runtime/stubRoutines.cpp
index 6febb5bcee91879592b3eb6dcfdf0eff88c16e55..f35cce9f5e9d52a2bdfb153359f460e9b6bb7615 100644
--- a/src/share/vm/runtime/stubRoutines.cpp
+++ b/src/share/vm/runtime/stubRoutines.cpp
@@ -135,6 +135,8 @@ address StubRoutines::_sha512_implCompressMB = NULL;
 address StubRoutines::_updateBytesCRC32 = NULL;
 address StubRoutines::_crc_table_adr = NULL;
 
+address StubRoutines::_multiplyToLen = NULL;
+
 double (* StubRoutines::_intrinsic_log   )(double) = NULL;
 double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
 double (* StubRoutines::_intrinsic_exp   )(double) = NULL;
diff --git a/src/share/vm/runtime/stubRoutines.hpp b/src/share/vm/runtime/stubRoutines.hpp
index 78381168fc4023ce7a23fe07a91d72305acc0005..1c56a2d9095c41c7677893a68d5612e4b1509d8d 100644
--- a/src/share/vm/runtime/stubRoutines.hpp
+++ b/src/share/vm/runtime/stubRoutines.hpp
@@ -217,6 +217,8 @@ class StubRoutines: AllStatic {
   static address _updateBytesCRC32;
   static address _crc_table_adr;
 
+  static address _multiplyToLen;
+
   // These are versions of the java.lang.Math methods which perform
   // the same operations as the intrinsic version.  They are used for
   // constant folding in the compiler to ensure equivalence.  If the
@@ -373,6 +375,8 @@ class StubRoutines: AllStatic {
   static address updateBytesCRC32()    { return _updateBytesCRC32; }
   static address crc_table_addr()      { return _crc_table_adr; }
 
+  static address multiplyToLen()       { return _multiplyToLen; }
+
   static address select_fill_function(BasicType t, bool aligned, const char* &name);
 
   static address zero_aligned_words()   { return _zero_aligned_words; }
diff --git a/src/share/vm/runtime/vmStructs.cpp b/src/share/vm/runtime/vmStructs.cpp
index c3a039295bac38cfbce73d3540cca022a7dab65f..5948722888e90a465bc53db00053b720ba6f0c60 100644
--- a/src/share/vm/runtime/vmStructs.cpp
+++ b/src/share/vm/runtime/vmStructs.cpp
@@ -814,6 +814,7 @@ typedef BinaryTreeDictionary<Metablock, FreeList<Metablock> > MetablockTreeDicti
      static_field(StubRoutines,                _cipherBlockChaining_decryptAESCrypt,          address)                  \
      static_field(StubRoutines,                _updateBytesCRC32,                             address)                  \
      static_field(StubRoutines,                _crc_table_adr,                                address)                  \
+     static_field(StubRoutines,                _multiplyToLen,                                address)                  \
                                                                                                                         \
   /*****************/                                                                                                   \
   /* SharedRuntime */                                                                                                   \
diff --git a/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java b/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java
new file mode 100644
index 0000000000000000000000000000000000000000..c7425966076c7a80b4b8bcb10c969aef25f8f631
--- /dev/null
+++ b/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8055494
+ * @summary Add C2 x86 intrinsic for BigInteger::multiplyToLen() method
+ *
+ * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
+ *      -XX:CompileCommand=exclude,TestMultiplyToLen::main
+ *      -XX:CompileCommand=option,TestMultiplyToLen::base_multiply,ccstr,DisableIntrinsic,_multiplyToLen
+ *      -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_multiplyToLen
+ *      -XX:CompileCommand=inline,java.math.BigInteger::multiply TestMultiplyToLen
+ */
+
+import java.util.Random;
+import java.math.*;
+
+public class TestMultiplyToLen {
+
+    // Avoid the intrinsic by preventing inlining of multiply() and multiplyToLen().
+    public static BigInteger base_multiply(BigInteger op1, BigInteger op2) {
+      return op1.multiply(op2);
+    }
+
+    // Generate the multiplyToLen() intrinsic by inlining multiply().
+    public static BigInteger new_multiply(BigInteger op1, BigInteger op2) {
+      return op1.multiply(op2);
+    }
+
+    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
+      byte[] data1 = b1.toByteArray();
+      byte[] data2 = b2.toByteArray();
+      if (data1.length != data2.length)
+        return false;
+      for (int i = 0; i < data1.length; i++) {
+        if (data1[i] != data2[i])
+          return false;
+      }
+      return true;
+    }
+
+    public static String stringify(BigInteger b) {
+      String strout = "";
+      byte[] data = b.toByteArray();
+      for (int i = 0; i < data.length; i++) {
+        strout += (String.format("%02x", data[i]) + " ");
+      }
+      return strout;
+    }
+
+    public static void main(String args[]) throws Exception {
+
+      BigInteger oldsum = new BigInteger("0");
+      BigInteger newsum = new BigInteger("0");
+
+      BigInteger b1, b2, oldres, newres;
+
+      Random rand = new Random();
+      long seed = System.nanoTime();
+      Random rand1 = new Random();
+      long seed1 = System.nanoTime();
+      rand.setSeed(seed);
+      rand1.setSeed(seed1);
+
+      for (int j = 0; j < 1000000; j++) {
+        int rand_int = rand1.nextInt(3136) + 32;
+        int rand_int1 = rand1.nextInt(3136) + 32;
+        b1 = new BigInteger(rand_int, rand);
+        b2 = new BigInteger(rand_int1, rand);
+
+        oldres = base_multiply(b1, b2);
+        newres = new_multiply(b1, b2);
+
+        oldsum = oldsum.add(oldres);
+        newsum = newsum.add(newres);
+
+        if (!bytecompare(oldres, newres)) {
+          System.out.print("mismatch for:b1:" + stringify(b1) + " :b2:" + stringify(b2) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
+          System.out.println(b1);
+          System.out.println(b2);
+          throw new Exception("Failed");
+        }
+      }
+      if (!bytecompare(oldsum, newsum)) {
+        System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
+        throw new Exception("Failed");
+      } else {
+        System.out.println("Success");
+      }
+    }
+}