From c5837c061bfc07a971f14524ceaaba72bfe5ab77 Mon Sep 17 00:00:00 2001 From: igerasim Date: Wed, 17 Feb 2016 13:40:12 +0300 Subject: [PATCH] 8081778: Use Intel x64 CPU instructions for RSA acceleration Summary: Add intrinsics for BigInteger squareToLen and mulAdd methods. Reviewed-by: kvn, jrose --- src/cpu/x86/vm/assembler_x86.cpp | 20 + src/cpu/x86/vm/assembler_x86.hpp | 3 + src/cpu/x86/vm/macroAssembler_x86.cpp | 497 ++++++++++++++++++ src/cpu/x86/vm/macroAssembler_x86.hpp | 19 + src/cpu/x86/vm/stubGenerator_x86_64.cpp | 107 ++++ src/cpu/x86/vm/stubRoutines_x86_64.hpp | 2 +- src/cpu/x86/vm/vm_version_x86.cpp | 18 + src/share/vm/classfile/vmSymbols.hpp | 8 + src/share/vm/opto/c2_globals.hpp | 6 + src/share/vm/opto/escape.cpp | 4 +- src/share/vm/opto/library_call.cpp | 110 ++++ src/share/vm/opto/runtime.cpp | 42 ++ src/share/vm/opto/runtime.hpp | 4 + src/share/vm/runtime/stubRoutines.cpp | 2 + src/share/vm/runtime/stubRoutines.hpp | 4 + src/share/vm/runtime/vmStructs.cpp | 2 + .../intrinsics/muladd/TestMulAdd.java | 117 +++++ .../squaretolen/TestSquareToLen.java | 114 ++++ 18 files changed, 1077 insertions(+), 2 deletions(-) create mode 100644 test/compiler/intrinsics/muladd/TestMulAdd.java create mode 100644 test/compiler/intrinsics/squaretolen/TestSquareToLen.java diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp index 8098e889b..7cbc47d60 100644 --- a/src/cpu/x86/vm/assembler_x86.cpp +++ b/src/cpu/x86/vm/assembler_x86.cpp @@ -2318,6 +2318,13 @@ void Assembler::orl(Register dst, Register src) { emit_arith(0x0B, 0xC0, dst, src); } +void Assembler::orl(Address dst, Register src) { + InstructionMark im(this); + prefix(dst, src); + emit_int8(0x09); + emit_operand(src, dst); +} + void Assembler::packuswb(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); @@ -5613,6 +5620,19 @@ void Assembler::rclq(Register dst, int imm8) { } } +void Assembler::rcrq(Register dst, int imm8) { + assert(isShiftCount(imm8 >> 1), "illegal shift count"); + int encode = prefixq_and_encode(dst->encoding()); + if (imm8 == 1) { + emit_int8((unsigned char)0xD1); + emit_int8((unsigned char)(0xD8 | encode)); + } else { + emit_int8((unsigned char)0xC1); + emit_int8((unsigned char)(0xD8 | encode)); + emit_int8(imm8); + } +} + void Assembler::rorq(Register dst, int imm8) { assert(isShiftCount(imm8 >> 1), "illegal shift count"); int encode = prefixq_and_encode(dst->encoding()); diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp index fc270dbbd..341d9e39b 100644 --- a/src/cpu/x86/vm/assembler_x86.hpp +++ b/src/cpu/x86/vm/assembler_x86.hpp @@ -1455,6 +1455,7 @@ private: void orl(Register dst, int32_t imm32); void orl(Register dst, Address src); void orl(Register dst, Register src); + void orl(Address dst, Register src); void orq(Address dst, int32_t imm32); void orq(Register dst, int32_t imm32); @@ -1555,6 +1556,8 @@ private: void rclq(Register dst, int imm8); + void rcrq(Register dst, int imm8); + void rdtsc(); void ret(int imm16); diff --git a/src/cpu/x86/vm/macroAssembler_x86.cpp b/src/cpu/x86/vm/macroAssembler_x86.cpp index d6f179770..764cf1bcf 100644 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp @@ -7767,6 +7767,503 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Regi pop(tmp2); pop(tmp1); } + +//Helper functions for square_to_len() + +/** + * Store the squares of x[], right shifted one bit 
(divided by 2) into z[]
+ * Preserves x and z and modifies rest of the registers.
+ */
+
+void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+  // Perform square and right shift by 1
+  // Handle odd xlen case first, then for even xlen do the following
+  // jlong carry = 0;
+  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
+  //    huge_128 product = x[j:j+1] * x[j:j+1];
+  //    z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
+  //    z[i+2:i+3] = (jlong)(product >>> 1);
+  //    carry = (jlong)product;
+  // }
+
+  xorq(tmp5, tmp5);     // carry
+  xorq(rdxReg, rdxReg);
+  xorl(tmp1, tmp1);     // index for x
+  xorl(tmp4, tmp4);     // index for z
+
+  Label L_first_loop, L_first_loop_exit;
+
+  testl(xlen, 1);
+  jccb(Assembler::zero, L_first_loop); // jump if xlen is even
+
+  // Square and right shift by 1 the odd element using 32 bit multiply
+  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
+  imulq(raxReg, raxReg);
+  shrq(raxReg, 1);
+  adcq(tmp5, 0);
+  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
+  incrementl(tmp1);
+  addl(tmp4, 2);
+
+  // Square and right shift by 1 the rest using 64 bit multiply
+  bind(L_first_loop);
+  cmpptr(tmp1, xlen);
+  jccb(Assembler::equal, L_first_loop_exit);
+
+  // Square
+  movq(raxReg, Address(x, tmp1, Address::times_4, 0));
+  rorq(raxReg, 32);    // convert big-endian to little-endian
+  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
+
+  // Right shift by 1 and save carry
+  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
+  rcrq(rdxReg, 1);
+  rcrq(raxReg, 1);
+  adcq(tmp5, 0);
+
+  // Store result in z
+  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
+  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
+
+  // Update indices for x and z
+  addl(tmp1, 2);
+  addl(tmp4, 4);
+  jmp(L_first_loop);
+
+  bind(L_first_loop_exit);
+}
+
+
+/**
+ * Perform the following multiply add operation using BMI2 instructions
+ * carry:sum = sum + op1*op2 + carry
+ * op2 should be in rdx
+ * op2 is preserved, all other registers are modified
+ */
+void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
+  // assert op2 is rdx
+  mulxq(tmp2, op1, op1);  // op1 * op2 -> tmp2:op1
+  addq(sum, carry);
+  adcq(tmp2, 0);
+  addq(sum, op1);
+  adcq(tmp2, 0);
+  movq(carry, tmp2);
+}
+
+/**
+ * Perform the following multiply add operation:
+ * carry:sum = sum + op1*op2 + carry
+ * Preserves op1, op2 and modifies rest of registers
+ */
+void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
+  // rdx:rax = op1 * op2
+  movq(raxReg, op2);
+  mulq(op1);
+
+  // rdx:rax = sum + carry + rdx:rax
+  addq(sum, carry);
+  adcq(rdxReg, 0);
+  addq(sum, raxReg);
+  adcq(rdxReg, 0);
+
+  // carry:sum = rdx:sum
+  movq(carry, rdxReg);
+}
+
+/**
+ * Add 64 bit long carry into z[] with carry propagation.
+ * Preserves z and carry register values and modifies rest of registers.
+ * + */ +void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { + Label L_fourth_loop, L_fourth_loop_exit; + + movl(tmp1, 1); + subl(zlen, 2); + addq(Address(z, zlen, Address::times_4, 0), carry); + + bind(L_fourth_loop); + jccb(Assembler::carryClear, L_fourth_loop_exit); + subl(zlen, 2); + jccb(Assembler::negative, L_fourth_loop_exit); + addq(Address(z, zlen, Address::times_4, 0), tmp1); + jmp(L_fourth_loop); + bind(L_fourth_loop_exit); +} + +/** + * Shift z[] left by 1 bit. + * Preserves x, len, z and zlen registers and modifies rest of the registers. + * + */ +void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { + + Label L_fifth_loop, L_fifth_loop_exit; + + // Fifth loop + // Perform primitiveLeftShift(z, zlen, 1) + + const Register prev_carry = tmp1; + const Register new_carry = tmp4; + const Register value = tmp2; + const Register zidx = tmp3; + + // int zidx, carry; + // long value; + // carry = 0; + // for (zidx = zlen-2; zidx >=0; zidx -= 2) { + // (carry:value) = (z[i] << 1) | carry ; + // z[i] = value; + // } + + movl(zidx, zlen); + xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register + + bind(L_fifth_loop); + decl(zidx); // Use decl to preserve carry flag + decl(zidx); + jccb(Assembler::negative, L_fifth_loop_exit); + + if (UseBMI2Instructions) { + movq(value, Address(z, zidx, Address::times_4, 0)); + rclq(value, 1); + rorxq(value, value, 32); + movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form + } + else { + // clear new_carry + xorl(new_carry, new_carry); + + // Shift z[i] by 1, or in previous carry and save new carry + movq(value, Address(z, zidx, Address::times_4, 0)); + shlq(value, 1); + adcl(new_carry, 0); + + orq(value, prev_carry); + rorq(value, 0x20); + movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form + + // Set previous carry = new carry + movl(prev_carry, new_carry); + } + jmp(L_fifth_loop); + + bind(L_fifth_loop_exit); +} + + +/** + * Code for BigInteger::squareToLen() intrinsic + * + * rdi: x + * rsi: len + * r8: z + * rcx: zlen + * r12: tmp1 + * r13: tmp2 + * r14: tmp3 + * r15: tmp4 + * rbx: tmp5 + * + */ +void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + + Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply; + push(tmp1); + push(tmp2); + push(tmp3); + push(tmp4); + push(tmp5); + + // First loop + // Store the squares, right shifted one bit (i.e., divided by 2). + square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); + + // Add in off-diagonal sums. + // + // Second, third (nested) and fourth loops. 
// zlen +=2;
+  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
+  //    carry = 0;
+  //    long op2 = x[xidx:xidx+1];
+  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
+  //      k -= 2;
+  //      long op1 = x[j:j+1];
+  //      long sum = z[k:k+1];
+  //      carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
+  //      z[k:k+1] = sum;
+  //    }
+  //    add_one_64(z, k, carry, tmp_regs);
+  // }
+
+  const Register carry = tmp5;
+  const Register sum = tmp3;
+  const Register op1 = tmp4;
+  Register op2 = tmp2;
+
+  push(zlen);
+  push(len);
+  addl(zlen, 2);
+  bind(L_second_loop);
+  xorq(carry, carry);
+  subl(zlen, 4);
+  subl(len, 2);
+  push(zlen);
+  push(len);
+  cmpl(len, 0);
+  jccb(Assembler::lessEqual, L_second_loop_exit);
+
+  // Multiply an array by one 64 bit long.
+  if (UseBMI2Instructions) {
+    op2 = rdxReg;
+    movq(op2, Address(x, len, Address::times_4, 0));
+    rorxq(op2, op2, 32);
+  }
+  else {
+    movq(op2, Address(x, len, Address::times_4, 0));
+    rorq(op2, 32);
+  }
+
+  bind(L_third_loop);
+  decrementl(len);
+  jccb(Assembler::negative, L_third_loop_exit);
+  decrementl(len);
+  jccb(Assembler::negative, L_last_x);
+
+  movq(op1, Address(x, len, Address::times_4, 0));
+  rorq(op1, 32);
+
+  bind(L_multiply);
+  subl(zlen, 2);
+  movq(sum, Address(z, zlen, Address::times_4, 0));
+
+  // Multiply 64 bit by 64 bit; add the lower 64 bits of the product into sum and keep the upper 64 bits as carry.
+  if (UseBMI2Instructions) {
+    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
+  }
+  else {
+    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+  }
+
+  movq(Address(z, zlen, Address::times_4, 0), sum);
+
+  jmp(L_third_loop);
+  bind(L_third_loop_exit);
+
+  // Fourth loop
+  // Add 64 bit long carry into z with carry propagation.
+  // Uses the already decremented zlen.
+  add_one_64(z, zlen, carry, tmp1);
+
+  pop(len);
+  pop(zlen);
+  jmp(L_second_loop);
+
+  // Next infrequent code is moved outside loops.
+  bind(L_last_x);
+  movl(op1, Address(x, 0));
+  jmp(L_multiply);
+
+  bind(L_second_loop_exit);
+  pop(len);
+  pop(zlen);
+  pop(len);
+  pop(zlen);
+
+  // Fifth loop
+  // Shift z left 1 bit.
+  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
+
+  // z[zlen-1] |= x[len-1] & 1;
+  movl(tmp3, Address(x, len, Address::times_4, -4));
+  andl(tmp3, 1);
+  orl(Address(z, zlen, Address::times_4, -4), tmp3);
+
+  pop(tmp5);
+  pop(tmp4);
+  pop(tmp3);
+  pop(tmp2);
+  pop(tmp1);
+}
+
+/**
+ * Helper function for mul_add()
+ * Multiply in[] by int k and add to out[] starting at offset offs using
+ * 128 bit by 32 bit multiply and return the carry in tmp5.
+ * Only the quad-int-aligned part of in[] is processed in this function.
+ * k is in rdxReg when BMI2 instructions are used, otherwise in tmp2.
+ * This function preserves out, in and k registers.
+ * len and offset point to the appropriate index into "in" and "out" respectively.
+ * tmp5 has the carry.
+ * other registers are temporary and are modified.
+ * + */ +void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, + Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, + Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + + Label L_first_loop, L_first_loop_exit; + + movl(tmp1, len); + shrl(tmp1, 2); + + bind(L_first_loop); + subl(tmp1, 1); + jccb(Assembler::negative, L_first_loop_exit); + + subl(len, 4); + subl(offset, 4); + + Register op2 = tmp2; + const Register sum = tmp3; + const Register op1 = tmp4; + const Register carry = tmp5; + + if (UseBMI2Instructions) { + op2 = rdxReg; + } + + movq(op1, Address(in, len, Address::times_4, 8)); + rorq(op1, 32); + movq(sum, Address(out, offset, Address::times_4, 8)); + rorq(sum, 32); + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + // Store back in big endian from little endian + rorq(sum, 0x20); + movq(Address(out, offset, Address::times_4, 8), sum); + + movq(op1, Address(in, len, Address::times_4, 0)); + rorq(op1, 32); + movq(sum, Address(out, offset, Address::times_4, 0)); + rorq(sum, 32); + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + // Store back in big endian from little endian + rorq(sum, 0x20); + movq(Address(out, offset, Address::times_4, 0), sum); + + jmp(L_first_loop); + bind(L_first_loop_exit); +} + +/** + * Code for BigInteger::mulAdd() intrinsic + * + * rdi: out + * rsi: in + * r11: offs (out.length - offset) + * rcx: len + * r8: k + * r12: tmp1 + * r13: tmp2 + * r14: tmp3 + * r15: tmp4 + * rbx: tmp5 + * Multiply the in[] by word k and add to out[], return the carry in rax + */ +void MacroAssembler::mul_add(Register out, Register in, Register offs, + Register len, Register k, Register tmp1, Register tmp2, Register tmp3, + Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { + + Label L_carry, L_last_in, L_done; + +// carry = 0; +// for (int j=len-1; j >= 0; j--) { +// long product = (in[j] & LONG_MASK) * kLong + +// (out[offs] & LONG_MASK) + carry; +// out[offs--] = (int)product; +// carry = product >>> 32; +// } +// + push(tmp1); + push(tmp2); + push(tmp3); + push(tmp4); + push(tmp5); + + Register op2 = tmp2; + const Register sum = tmp3; + const Register op1 = tmp4; + const Register carry = tmp5; + + if (UseBMI2Instructions) { + op2 = rdxReg; + movl(op2, k); + } + else { + movl(op2, k); + } + + xorq(carry, carry); + + //First loop + + //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply + //The carry is in tmp5 + mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); + + //Multiply the trailing in[] entry using 64 bit by 32 bit, if any + decrementl(len); + jccb(Assembler::negative, L_carry); + decrementl(len); + jccb(Assembler::negative, L_last_in); + + movq(op1, Address(in, len, Address::times_4, 0)); + rorq(op1, 32); + + subl(offs, 2); + movq(sum, Address(out, offs, Address::times_4, 0)); + rorq(sum, 32); + + if (UseBMI2Instructions) { + multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); + } + else { + multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); + } + + // Store back in big endian from little endian + rorq(sum, 0x20); + movq(Address(out, offs, Address::times_4, 0), sum); + + testl(len, len); + jccb(Assembler::zero, L_carry); + + //Multiply the last in[] entry, if any + bind(L_last_in); + movl(op1, Address(in, 0)); + 
movl(sum, Address(out, offs, Address::times_4, -4));
+
+  movl(raxReg, k);
+  mull(op1); // tmp4 * eax -> edx:eax
+  addl(sum, carry);
+  adcl(rdxReg, 0);
+  addl(sum, raxReg);
+  adcl(rdxReg, 0);
+  movl(carry, rdxReg);
+
+  movl(Address(out, offs, Address::times_4, -4), sum);
+
+  bind(L_carry);
+  // return tmp5/carry as carry in rax
+  movl(rax, carry);
+
+  bind(L_done);
+  pop(tmp5);
+  pop(tmp4);
+  pop(tmp3);
+  pop(tmp2);
+  pop(tmp1);
+}
 #endif
 
 /**
diff --git a/src/cpu/x86/vm/macroAssembler_x86.hpp b/src/cpu/x86/vm/macroAssembler_x86.hpp
index 69c9e8aa3..cc5e5856a 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp
@@ -1241,6 +1241,25 @@ public:
         Register carry2);
   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
                        Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
+
+  void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
+                     Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
+  void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
+                            Register tmp2);
+  void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
+                       Register rdxReg, Register raxReg);
+  void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
+  void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
+                   Register tmp3, Register tmp4);
+  void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
+                     Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
+
+  void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
+                             Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
+                             Register raxReg);
+  void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
+               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
+               Register raxReg);
 #endif
 
   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 0bb6118d7..a9f4e9448 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -3743,6 +3743,107 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  /**
+   * Arguments:
+   *
+   * Input:
+   *   c_rarg0   - x address
+   *   c_rarg1   - x length
+   *   c_rarg2   - z address
+   *   c_rarg3   - z length
+   *
+   */
+  address generate_squareToLen() {
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "squareToLen");
+
+    address start = __ pc();
+    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+    // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
+ const Register x = rdi; + const Register len = rsi; + const Register z = r8; + const Register zlen = rcx; + + const Register tmp1 = r12; + const Register tmp2 = r13; + const Register tmp3 = r14; + const Register tmp4 = r15; + const Register tmp5 = rbx; + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of RuntimeStub frame + + setup_arg_regs(4); // x => rdi, len => rsi, z => rdx + // zlen => rcx + // r9 and r10 may be used to save non-volatile registers + __ movptr(r8, rdx); + __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); + + restore_arg_regs(); + + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + /** + * Arguments: + * + * Input: + * c_rarg0 - out address + * c_rarg1 - in address + * c_rarg2 - offset + * c_rarg3 - len + * not Win64 + * c_rarg4 - k + * Win64 + * rsp+40 - k + */ + address generate_mulAdd() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "mulAdd"); + + address start = __ pc(); + // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) + // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) + const Register out = rdi; + const Register in = rsi; + const Register offset = r11; + const Register len = rcx; + const Register k = r8; + + // Next registers will be saved on stack in mul_add(). + const Register tmp1 = r12; + const Register tmp2 = r13; + const Register tmp3 = r14; + const Register tmp4 = r15; + const Register tmp5 = rbx; + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of RuntimeStub frame + + setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx + // len => rcx, k => r8 + // r9 and r10 may be used to save non-volatile registers +#ifdef _WIN64 + // last argument is on stack on Win64 + __ movl(k, Address(rsp, 6 * wordSize)); +#endif + __ movptr(r11, rdx); // move offset in rdx to offset(r11) + __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); + + restore_arg_regs(); + + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; + } + + #undef __ #define __ masm-> @@ -3987,6 +4088,12 @@ class StubGenerator: public StubCodeGenerator { if (UseMultiplyToLenIntrinsic) { StubRoutines::_multiplyToLen = generate_multiplyToLen(); } + if (UseSquareToLenIntrinsic) { + StubRoutines::_squareToLen = generate_squareToLen(); + } + if (UseMulAddIntrinsic) { + StubRoutines::_mulAdd = generate_mulAdd(); + } #endif } diff --git a/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/src/cpu/x86/vm/stubRoutines_x86_64.hpp index 205bce4eb..15922b8ae 100644 --- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp +++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp @@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _ enum platform_dependent_constants { code_size1 = 19000, // simply increase if too small (assembler will crash if too small) - code_size2 = 22000 // simply increase if too small (assembler will crash if too small) + code_size2 = 23000 // simply increase if too small (assembler will crash if too small) }; class x86 { diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp index adfdfd67e..807f33efd 100644 --- a/src/cpu/x86/vm/vm_version_x86.cpp +++ b/src/cpu/x86/vm/vm_version_x86.cpp @@ -703,6 +703,12 @@ void VM_Version::get_processor_features() { if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { UseMultiplyToLenIntrinsic = true; } + if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { + 
UseSquareToLenIntrinsic = true; + } + if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { + UseMulAddIntrinsic = true; + } #else if (UseMultiplyToLenIntrinsic) { if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { @@ -710,6 +716,18 @@ void VM_Version::get_processor_features() { } FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false); } + if (UseSquareToLenIntrinsic) { + if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) { + warning("squareToLen intrinsic is not available in 32-bit VM"); + } + FLAG_SET_DEFAULT(UseSquareToLenIntrinsic, false); + } + if (UseMulAddIntrinsic) { + if (!FLAG_IS_DEFAULT(UseMulAddIntrinsic)) { + warning("mulAdd intrinsic is not available in 32-bit VM"); + } + FLAG_SET_DEFAULT(UseMulAddIntrinsic, false); + } #endif #endif // COMPILER2 diff --git a/src/share/vm/classfile/vmSymbols.hpp b/src/share/vm/classfile/vmSymbols.hpp index 0a84cb074..96032fbe9 100644 --- a/src/share/vm/classfile/vmSymbols.hpp +++ b/src/share/vm/classfile/vmSymbols.hpp @@ -793,6 +793,14 @@ do_name( multiplyToLen_name, "multiplyToLen") \ do_signature(multiplyToLen_signature, "([II[II[I)[I") \ \ + do_intrinsic(_squareToLen, java_math_BigInteger, squareToLen_name, squareToLen_signature, F_S) \ + do_name( squareToLen_name, "implSquareToLen") \ + do_signature(squareToLen_signature, "([II[II)[I") \ + \ + do_intrinsic(_mulAdd, java_math_BigInteger, mulAdd_name, mulAdd_signature, F_S) \ + do_name( mulAdd_name, "implMulAdd") \ + do_signature(mulAdd_signature, "([I[IIII)I") \ + \ /* java/lang/ref/Reference */ \ do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ \ diff --git a/src/share/vm/opto/c2_globals.hpp b/src/share/vm/opto/c2_globals.hpp index adeee7a83..5a7cdf51f 100644 --- a/src/share/vm/opto/c2_globals.hpp +++ b/src/share/vm/opto/c2_globals.hpp @@ -659,6 +659,12 @@ product(bool, UseMultiplyToLenIntrinsic, false, \ "Enables intrinsification of BigInteger.multiplyToLen()") \ \ + product(bool, UseSquareToLenIntrinsic, false, \ + "Enables intrinsification of BigInteger.squareToLen()") \ + \ + product(bool, UseMulAddIntrinsic, false, \ + "Enables intrinsification of BigInteger.mulAdd()") \ + \ product(bool, UseTypeSpeculation, true, \ "Speculatively propagate types from profiles") \ \ diff --git a/src/share/vm/opto/escape.cpp b/src/share/vm/opto/escape.cpp index b5e1f6b72..779af14fa 100644 --- a/src/share/vm/opto/escape.cpp +++ b/src/share/vm/opto/escape.cpp @@ -958,7 +958,9 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 || strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 || strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 || - strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0) + strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 || + strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 || + strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0) ))) { call->dump(); fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name)); diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp index b234d8bb6..461bcc966 100644 --- a/src/share/vm/opto/library_call.cpp +++ b/src/share/vm/opto/library_call.cpp @@ -324,6 +324,8 @@ class LibraryCallKit : public GraphKit { bool inline_updateBytesCRC32(); bool inline_updateByteBufferCRC32(); bool inline_multiplyToLen(); + bool inline_squareToLen(); + bool inline_mulAdd(); bool inline_profileBoolean(); }; @@ -527,6 +529,14 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, 
bool is_virtual) {
     if (!UseMultiplyToLenIntrinsic) return NULL;
     break;
+  case vmIntrinsics::_squareToLen:
+    if (!UseSquareToLenIntrinsic) return NULL;
+    break;
+
+  case vmIntrinsics::_mulAdd:
+    if (!UseMulAddIntrinsic) return NULL;
+    break;
+
   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     if (!UseAESIntrinsics) return NULL;
@@ -927,6 +937,12 @@ bool LibraryCallKit::try_to_inline(int predicate) {
   case vmIntrinsics::_multiplyToLen:
     return inline_multiplyToLen();
 
+  case vmIntrinsics::_squareToLen:
+    return inline_squareToLen();
+
+  case vmIntrinsics::_mulAdd:
+    return inline_mulAdd();
+
   case vmIntrinsics::_encodeISOArray:
     return inline_encodeISOArray();
 
@@ -5856,6 +5872,100 @@ bool LibraryCallKit::inline_multiplyToLen() {
   return true;
 }
 
+//-------------inline_squareToLen------------------------------------
+bool LibraryCallKit::inline_squareToLen() {
+  assert(UseSquareToLenIntrinsic, "not implemented on this platform");
+
+  address stubAddr = StubRoutines::squareToLen();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+  const char* stubName = "squareToLen";
+
+  assert(callee()->signature()->size() == 4, "implSquareToLen has 4 parameters");
+
+  Node* x    = argument(0);
+  Node* len  = argument(1);
+  Node* z    = argument(2);
+  Node* zlen = argument(3);
+
+  const Type* x_type = x->Value(&_gvn);
+  const Type* z_type = z->Value(&_gvn);
+  const TypeAryPtr* top_x = x_type->isa_aryptr();
+  const TypeAryPtr* top_z = z_type->isa_aryptr();
+  if (top_x == NULL || top_x->klass() == NULL ||
+      top_z == NULL || top_z->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType z_elem = z_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (x_elem != T_INT || z_elem != T_INT) {
+    return false;
+  }
+
+  Node* x_start = array_element_address(x, intcon(0), x_elem);
+  Node* z_start = array_element_address(z, intcon(0), z_elem);
+
+  Node* call = make_runtime_call(RC_LEAF|RC_NO_FP,
+                                 OptoRuntime::squareToLen_Type(),
+                                 stubAddr, stubName, TypePtr::BOTTOM,
+                                 x_start, len, z_start, zlen);
+
+  set_result(z);
+  return true;
+}
+
+//-------------inline_mulAdd------------------------------------------
+bool LibraryCallKit::inline_mulAdd() {
+  assert(UseMulAddIntrinsic, "not implemented on this platform");
+
+  address stubAddr = StubRoutines::mulAdd();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+  const char* stubName = "mulAdd";
+
+  assert(callee()->signature()->size() == 5, "mulAdd has 5 parameters");
+
+  Node* out    = argument(0);
+  Node* in     = argument(1);
+  Node* offset = argument(2);
+  Node* len    = argument(3);
+  Node* k      = argument(4);
+
+  const Type* out_type = out->Value(&_gvn);
+  const Type* in_type = in->Value(&_gvn);
+  const TypeAryPtr* top_out = out_type->isa_aryptr();
+  const TypeAryPtr* top_in = in_type->isa_aryptr();
+  if (top_out == NULL || top_out->klass() == NULL ||
+      top_in == NULL || top_in->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType out_elem = out_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType in_elem = in_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (out_elem != T_INT || in_elem != T_INT) {
+    return false;
+  }
+
+  Node* outlen = load_array_length(out);
+  Node* new_offset =
_gvn.transform(new (C) SubINode(outlen, offset)); + Node* out_start = array_element_address(out, intcon(0), out_elem); + Node* in_start = array_element_address(in, intcon(0), in_elem); + + Node* call = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::mulAdd_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + out_start,in_start, new_offset, len, k); + Node* result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms)); + set_result(result); + return true; +} + /** * Calculate CRC32 for byte. diff --git a/src/share/vm/opto/runtime.cpp b/src/share/vm/opto/runtime.cpp index 2b62a9961..4f96e3ea0 100644 --- a/src/share/vm/opto/runtime.cpp +++ b/src/share/vm/opto/runtime.cpp @@ -956,6 +956,48 @@ const TypeFunc* OptoRuntime::multiplyToLen_Type() { return TypeFunc::make(domain, range); } +const TypeFunc* OptoRuntime::squareToLen_Type() { + // create input type (domain) + int num_args = 4; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // x + fields[argp++] = TypeInt::INT; // len + fields[argp++] = TypePtr::NOTNULL; // z + fields[argp++] = TypeInt::INT; // zlen + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + +// for mulAdd calls, 2 pointers and 3 ints, returning int +const TypeFunc* OptoRuntime::mulAdd_Type() { + // create input type (domain) + int num_args = 5; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // out + fields[argp++] = TypePtr::NOTNULL; // in + fields[argp++] = TypeInt::INT; // offset + fields[argp++] = TypeInt::INT; // len + fields[argp++] = TypeInt::INT; // k + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // returning carry (int) + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = TypeInt::INT; + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields); + return TypeFunc::make(domain, range); +} + //------------- Interpreter state access for on stack replacement diff --git a/src/share/vm/opto/runtime.hpp b/src/share/vm/opto/runtime.hpp index 3258c36ea..aae3c2b17 100644 --- a/src/share/vm/opto/runtime.hpp +++ b/src/share/vm/opto/runtime.hpp @@ -305,6 +305,10 @@ private: static const TypeFunc* multiplyToLen_Type(); + static const TypeFunc* squareToLen_Type(); + + static const TypeFunc* mulAdd_Type(); + static const TypeFunc* updateBytesCRC32_Type(); // leaf on stack replacement interpreter accessor types diff --git a/src/share/vm/runtime/stubRoutines.cpp b/src/share/vm/runtime/stubRoutines.cpp index f35cce9f5..9fc123a73 100644 --- a/src/share/vm/runtime/stubRoutines.cpp +++ b/src/share/vm/runtime/stubRoutines.cpp @@ -136,6 +136,8 @@ address StubRoutines::_updateBytesCRC32 = NULL; address StubRoutines::_crc_table_adr = NULL; address StubRoutines::_multiplyToLen = NULL; +address StubRoutines::_squareToLen = NULL; +address StubRoutines::_mulAdd = NULL; double (* StubRoutines::_intrinsic_log )(double) = NULL; double (* StubRoutines::_intrinsic_log10 )(double) = NULL; diff --git a/src/share/vm/runtime/stubRoutines.hpp b/src/share/vm/runtime/stubRoutines.hpp index 819b9c42e..56ae9ae45 
100644 --- a/src/share/vm/runtime/stubRoutines.hpp +++ b/src/share/vm/runtime/stubRoutines.hpp @@ -209,6 +209,8 @@ class StubRoutines: AllStatic { static address _crc_table_adr; static address _multiplyToLen; + static address _squareToLen; + static address _mulAdd; // These are versions of the java.lang.Math methods which perform // the same operations as the intrinsic version. They are used for @@ -367,6 +369,8 @@ class StubRoutines: AllStatic { static address crc_table_addr() { return _crc_table_adr; } static address multiplyToLen() {return _multiplyToLen; } + static address squareToLen() {return _squareToLen; } + static address mulAdd() {return _mulAdd; } static address select_fill_function(BasicType t, bool aligned, const char* &name); diff --git a/src/share/vm/runtime/vmStructs.cpp b/src/share/vm/runtime/vmStructs.cpp index f1200e339..80b01150b 100644 --- a/src/share/vm/runtime/vmStructs.cpp +++ b/src/share/vm/runtime/vmStructs.cpp @@ -813,6 +813,8 @@ typedef TwoOopHashtable SymbolTwoOopHashtable; static_field(StubRoutines, _updateBytesCRC32, address) \ static_field(StubRoutines, _crc_table_adr, address) \ static_field(StubRoutines, _multiplyToLen, address) \ + static_field(StubRoutines, _squareToLen, address) \ + static_field(StubRoutines, _mulAdd, address) \ \ /*****************/ \ /* SharedRuntime */ \ diff --git a/test/compiler/intrinsics/muladd/TestMulAdd.java b/test/compiler/intrinsics/muladd/TestMulAdd.java new file mode 100644 index 000000000..4d7b274c2 --- /dev/null +++ b/test/compiler/intrinsics/muladd/TestMulAdd.java @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @test + * @bug 8081778 + * @summary Add C2 x86 intrinsic for BigInteger::mulAdd() method + * + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch + * -XX:+IgnoreUnrecognizedVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic + * -XX:CompileCommand=dontinline,TestMulAdd::main + * -XX:CompileCommand=option,TestMulAdd::base_multiply,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=option,java.math.BigInteger::mulAdd,ccstr,DisableIntrinsic,_mulAdd + * -XX:CompileCommand=inline,java.math.BigInteger::multiply + * -XX:CompileCommand=inline,java.math.BigInteger::square + * -XX:CompileCommand=inline,java.math.BigInteger::squareToLen + * -XX:CompileCommand=inline,java.math.BigInteger::mulAdd TestMulAdd + */ + +import java.util.Random; +import java.math.*; + +public class TestMulAdd { + + // Avoid intrinsic by preventing inlining multiply() and mulAdd(). + public static BigInteger base_multiply(BigInteger op1) { + return op1.multiply(op1); + } + + // Generate mulAdd() intrinsic by inlining multiply(). + public static BigInteger new_multiply(BigInteger op1) { + return op1.multiply(op1); + } + + public static boolean bytecompare(BigInteger b1, BigInteger b2) { + byte[] data1 = b1.toByteArray(); + byte[] data2 = b2.toByteArray(); + if (data1.length != data2.length) + return false; + for (int i = 0; i < data1.length; i++) { + if (data1[i] != data2[i]) + return false; + } + return true; + } + + public static String stringify(BigInteger b) { + String strout= ""; + byte [] data = b.toByteArray(); + for (int i = 0; i < data.length; i++) { + strout += (String.format("%02x",data[i]) + " "); + } + return strout; + } + + public static void main(String args[]) throws Exception { + + BigInteger oldsum = new BigInteger("0"); + BigInteger newsum = new BigInteger("0"); + + BigInteger b1, b2, oldres, newres; + + Random rand = new Random(); + long seed = System.nanoTime(); + Random rand1 = new Random(); + long seed1 = System.nanoTime(); + rand.setSeed(seed); + rand1.setSeed(seed1); + + for (int j = 0; j < 100000; j++) { + int rand_int = rand1.nextInt(3136)+32; + b1 = new BigInteger(rand_int, rand); + + oldres = base_multiply(b1); + newres = new_multiply(b1); + + oldsum = oldsum.add(oldres); + newsum = newsum.add(newres); + + if (!bytecompare(oldres,newres)) { + System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres)); + System.out.println(b1); + throw new Exception("Failed"); + } + } + if (!bytecompare(oldsum,newsum)) { + System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum)); + throw new Exception("Failed"); + } else { + System.out.println("Success"); + } + } +} diff --git a/test/compiler/intrinsics/squaretolen/TestSquareToLen.java b/test/compiler/intrinsics/squaretolen/TestSquareToLen.java new file mode 100644 index 000000000..79ebbe3af --- /dev/null +++ b/test/compiler/intrinsics/squaretolen/TestSquareToLen.java @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/** + * @test + * @bug 8081778 + * @summary Add C2 x86 intrinsic for BigInteger::squareToLen() method + * + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch + * -XX:CompileCommand=exclude,TestSquareToLen::main + * -XX:CompileCommand=option,TestSquareToLen::base_multiply,ccstr,DisableIntrinsic,_squareToLen + * -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_squareToLen + * -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_squareToLen + * -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_squareToLen + * -XX:CompileCommand=inline,java.math.BigInteger::multiply + * -XX:CompileCommand=inline,java.math.BigInteger::square + * -XX:CompileCommand=inline,java.math.BigInteger::squareToLen TestSquareToLen + */ + +import java.util.Random; +import java.math.*; + +public class TestSquareToLen { + + // Avoid intrinsic by preventing inlining multiply() and squareToLen(). + public static BigInteger base_multiply(BigInteger op1) { + return op1.multiply(op1); + } + + // Generate squareToLen() intrinsic by inlining multiply(). 
+ public static BigInteger new_multiply(BigInteger op1) { + return op1.multiply(op1); + } + + public static boolean bytecompare(BigInteger b1, BigInteger b2) { + byte[] data1 = b1.toByteArray(); + byte[] data2 = b2.toByteArray(); + if (data1.length != data2.length) + return false; + for (int i = 0; i < data1.length; i++) { + if (data1[i] != data2[i]) + return false; + } + return true; + } + + public static String stringify(BigInteger b) { + String strout= ""; + byte [] data = b.toByteArray(); + for (int i = 0; i < data.length; i++) { + strout += (String.format("%02x",data[i]) + " "); + } + return strout; + } + + public static void main(String args[]) throws Exception { + + BigInteger oldsum = new BigInteger("0"); + BigInteger newsum = new BigInteger("0"); + + BigInteger b1, b2, oldres, newres; + + Random rand = new Random(); + long seed = System.nanoTime(); + Random rand1 = new Random(); + long seed1 = System.nanoTime(); + rand.setSeed(seed); + rand1.setSeed(seed1); + + for (int j = 0; j < 100000; j++) { + int rand_int = rand1.nextInt(3136)+32; + b1 = new BigInteger(rand_int, rand); + + oldres = base_multiply(b1); + newres = new_multiply(b1); + + oldsum = oldsum.add(oldres); + newsum = newsum.add(newres); + + if (!bytecompare(oldres,newres)) { + System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres)); + System.out.println(b1); + throw new Exception("Failed"); + } + } + if (!bytecompare(oldsum,newsum)) { + System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum)); + throw new Exception("Failed"); + } else { + System.out.println("Success"); + } + } +} -- GitLab
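For reference, the Java-level contract the two new stubs must reproduce can be written out in a few dozen lines. The sketch below restates the implSquareToLen/implMulAdd semantics in plain Java, following the pseudo-code embedded in the macro-assembler comments above: a first loop that stores the squares right-shifted one bit, off-diagonal products added in via mulAdd with carry propagation, then a final left shift with the low bit restored. The class name and the cross-check harness are illustrative, not JDK source; the helper names mirror BigInteger's private helpers.

import java.math.BigInteger;
import java.util.Random;

public class BigIntContractSketch {
    private static final long LONG_MASK = 0xffffffffL;

    // mulAdd contract: add in[0..len-1] * k into out[], where offset counts
    // 32-bit words back from the most significant end of out. The returned
    // carry word is the value the stub must leave in rax.
    static int mulAdd(int[] out, int[] in, int offset, int len, int k) {
        long kLong = k & LONG_MASK;
        long carry = 0;
        offset = out.length - offset - 1; // index of least significant word written
        for (int j = len - 1; j >= 0; j--) {
            long product = (in[j] & LONG_MASK) * kLong
                         + (out[offset] & LONG_MASK) + carry;
            out[offset--] = (int) product;
            carry = product >>> 32;
        }
        return (int) carry;
    }

    // Propagate a one-word carry up through a[], as add_one_64() does for
    // 64-bit words. Returns the carry out of the top of the array.
    static int addOne(int[] a, int offset, int mlen, int carry) {
        offset = a.length - 1 - mlen - offset;
        long t = (a[offset] & LONG_MASK) + (carry & LONG_MASK);
        a[offset] = (int) t;
        if ((t >>> 32) == 0) return 0;
        while (--mlen >= 0) {
            if (--offset < 0) return 1; // carry out of the whole number
            a[offset]++;
            if (a[offset] != 0) return 0;
        }
        return 1;
    }

    // In-place left shift by n < 32 bits, as lshift_by_1() does for n == 1.
    static void primitiveLeftShift(int[] a, int len, int n) {
        if (len == 0 || n == 0) return;
        int n2 = 32 - n;
        for (int i = 0, c = a[i], m = i + len - 1; i < m; i++) {
            int b = c;
            c = a[i + 1];
            a[i] = (b << n) | (c >>> n2);
        }
        a[len - 1] <<= n;
    }

    // squareToLen contract: z[0..2*len-1] = x[0..len-1]^2, big-endian words.
    static int[] squareToLen(int[] x, int len, int[] z) {
        int zlen = len << 1;

        // First loop: store the squares, right-shifted one bit.
        int lastProductLowWord = 0;
        for (int j = 0, i = 0; j < len; j++) {
            long piece = x[j] & LONG_MASK;
            long product = piece * piece;
            z[i++] = (lastProductLowWord << 31) | (int) (product >>> 33);
            z[i++] = (int) (product >>> 1);
            lastProductLowWord = (int) product;
        }

        // Second through fourth loops: add in the off-diagonal sums.
        for (int i = len, offset = 1; i > 0; i--, offset += 2) {
            int t = x[i - 1];
            t = mulAdd(z, x, offset, i - 1, t);
            addOne(z, offset - 1, i, t);
        }

        // Fifth loop: shift back up and set the low bit.
        primitiveLeftShift(z, zlen, 1);
        z[zlen - 1] |= x[len - 1] & 1;
        return z;
    }

    // Cross-check squareToLen against BigInteger.multiply on a random input.
    public static void main(String[] args) {
        Random rnd = new Random(42);
        int len = 8;
        int[] x = new int[len];
        for (int i = 0; i < len; i++) x[i] = rnd.nextInt();
        int[] z = squareToLen(x, len, new int[2 * len]);

        BigInteger bx = toBig(x);
        if (!toBig(z).equals(bx.multiply(bx))) throw new AssertionError("mismatch");
        System.out.println("squareToLen matches BigInteger.multiply");
    }

    // Interpret big-endian 32-bit words as a non-negative BigInteger.
    static BigInteger toBig(int[] words) {
        byte[] bytes = new byte[words.length * 4 + 1]; // leading zero byte keeps it positive
        for (int i = 0; i < words.length; i++) {
            bytes[4 * i + 1] = (byte) (words[i] >>> 24);
            bytes[4 * i + 2] = (byte) (words[i] >>> 16);
            bytes[4 * i + 3] = (byte) (words[i] >>> 8);
            bytes[4 * i + 4] = (byte) words[i];
        }
        return new BigInteger(bytes);
    }
}

The right-shifted first loop is what makes the final fixup work: x^2 equals twice the off-diagonal products plus the diagonal squares, so the code accumulates (squares >> 1) plus the off-diagonal products, doubles the whole magnitude once at the end, and re-sets bit 0, which is simply the low bit of x[len-1] since a square is odd exactly when its root is odd. This is why the stub code above pairs square_rshift with lshift_by_1 and the final orl of x[len-1] & 1.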