Commit 6fb85be0 authored by K kvn

8011102: Clear AVX registers after return from JNI call

Summary: Execute the vzeroupper instruction after JNI calls and on exits from JIT-compiled code that uses 256-bit vectors.
Reviewed-by: roland
Parent 67e687a4
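
Background for the diff below: as the summary and the patch's own comments note, running legacy-encoded SSE instructions while the upper 128 bits of the YMM registers are dirty incurs an AVX <-> SSE state-transition penalty; executing vzeroupper first clears those upper bits and avoids it. The following minimal user-level C++ sketch (not part of the patch) illustrates the same idea with the standard Intel intrinsics; the callee name is hypothetical, standing in for any legacy-SSE-encoded code such as a JNI native method.

#include <immintrin.h>

extern "C" void legacy_sse_callee();  // hypothetical: any legacy-SSE-encoded code, e.g. a JNI native

void add_with_avx(float* dst, const float* a, const float* b, int n) {
    int i = 0;
    // 256-bit AVX work dirties the upper 128 bits of the YMM registers.
    for (; i + 8 <= n; i += 8) {
        __m256 va = _mm256_loadu_ps(a + i);
        __m256 vb = _mm256_loadu_ps(b + i);
        _mm256_storeu_ps(dst + i, _mm256_add_ps(va, vb));
    }
    for (; i < n; ++i) dst[i] = a[i] + b[i];  // scalar tail
    // The analogue of what this patch emits at JNI-call sites and on
    // compiled-code exits: clear the upper YMM bits so the callee's SSE
    // instructions do not pay the AVX <-> SSE transition penalty.
    _mm256_zeroupper();  // emits the vzeroupper instruction
    legacy_sse_callee();
}
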
@@ -1299,25 +1299,8 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
__ push(rdx);
#endif // _LP64
// Either restore the MXCSR register after returning from the JNI Call
// or verify that it wasn't changed.
if (VM_Version::supports_sse()) {
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
}
else if (CheckJNICalls ) {
__ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
}
}
#ifndef _LP64
// Either restore the x87 floating pointer control word after returning
// from the JNI call or verify that it wasn't changed.
if (CheckJNICalls) {
__ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
}
#endif // _LP64
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// change thread state
__ movl(Address(thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
@@ -4765,6 +4765,31 @@ void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
pop_CPU_state();
}
void MacroAssembler::restore_cpu_control_state_after_jni() {
// Either restore the MXCSR register after returning from the JNI Call
// or verify that it wasn't changed (with -Xcheck:jni flag).
if (VM_Version::supports_sse()) {
if (RestoreMXCSROnJNICalls) {
ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
} else if (CheckJNICalls) {
call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
}
}
if (VM_Version::supports_avx()) {
// Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
vzeroupper();
}
#ifndef _LP64
// Either restore the x87 floating pointer control word after returning
// from the JNI call or verify that it wasn't changed.
if (CheckJNICalls) {
call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
}
#endif // _LP64
}
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
if (UseCompressedKlassPointers) {
@@ -5759,6 +5784,8 @@ void MacroAssembler::string_compare(Register str1, Register str2,
addptr(result, stride2);
subl(cnt2, stride2);
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
// clean upper bits of YMM registers
vzeroupper();
// compare wide vectors tail
bind(COMPARE_WIDE_TAIL);
@@ -5772,6 +5799,8 @@ void MacroAssembler::string_compare(Register str1, Register str2,
// Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
bind(VECTOR_NOT_EQUAL);
// clean upper bits of YMM registers
vzeroupper();
lea(str1, Address(str1, result, scale));
lea(str2, Address(str2, result, scale));
jmp(COMPARE_16_CHARS);
@@ -6028,6 +6057,10 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist
// That's it
bind(DONE);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
vzeroupper();
}
}
void MacroAssembler::generate_fill(BasicType t, bool aligned,
@@ -6157,6 +6190,10 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
vmovdqu(Address(to, 0), xtmp);
addptr(to, 32);
subl(count, 8 << shift);
BIND(L_check_fill_8_bytes);
// clean upper bits of YMM registers
vzeroupper();
} else {
// Fill 32-byte chunks
pshufd(xtmp, xtmp, 0);
@@ -6180,8 +6217,9 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
addptr(to, 32);
subl(count, 8 << shift);
jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
BIND(L_check_fill_8_bytes);
}
BIND(L_check_fill_8_bytes);
addl(count, 8 << shift);
jccb(Assembler::zero, L_exit);
jmpb(L_fill_8_bytes);
@@ -6316,6 +6354,10 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
jccb(Assembler::lessEqual, L_copy_16_chars);
bind(L_copy_16_chars_exit);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
vzeroupper();
}
subptr(len, 8);
jccb(Assembler::greater, L_copy_8_chars_exit);
@@ -582,6 +582,9 @@ class MacroAssembler: public Assembler {
// only if +VerifyFPU
void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
// Verify or restore cpu control state after JNI call
void restore_cpu_control_state_after_jni();
// prints msg, dumps registers and stops execution
void stop(const char* msg);
@@ -2065,6 +2065,9 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
__ call(RuntimeAddress(native_func));
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// WARNING - on Windows Java Natives use pascal calling convention and pop the
// arguments off of the stack. We could just re-adjust the stack pointer here
// and continue to do SP relative addressing but we instead switch to FP
@@ -2315,16 +2315,8 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
__ call(RuntimeAddress(native_func));
// Either restore the MXCSR register after returning from the JNI Call
// or verify that it wasn't changed.
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
}
else if (CheckJNICalls ) {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry())));
}
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// Unpack native results.
switch (ret_type) {
@@ -835,6 +835,11 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_64_bytes);
__ subl(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
if (UseUnalignedLoadStores && (UseAVX >= 2)) {
// clean upper bits of YMM registers
__ vzeroupper();
}
__ addl(qword_count, 8);
__ jccb(Assembler::zero, L_exit);
//
@@ -1331,6 +1331,10 @@ class StubGenerator: public StubCodeGenerator {
}
__ addptr(qword_count, 4);
__ BIND(L_end);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
__ vzeroupper();
}
} else {
// Copy 32-bytes per iteration
__ BIND(L_loop);
@@ -1404,6 +1408,10 @@ class StubGenerator: public StubCodeGenerator {
}
__ subptr(qword_count, 4);
__ BIND(L_end);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
__ vzeroupper();
}
} else {
// Copy 32-bytes per iteration
__ BIND(L_loop);
@@ -1080,22 +1080,8 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
// result potentially in rdx:rax or ST0
// Either restore the MXCSR register after returning from the JNI Call
// or verify that it wasn't changed.
if (VM_Version::supports_sse()) {
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
}
else if (CheckJNICalls ) {
__ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
}
}
// Either restore the x87 floating pointer control word after returning
// from the JNI call or verify that it wasn't changed.
if (CheckJNICalls) {
__ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
}
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// save potential result in ST(0) & rdx:rax
// (if result handler is the T_FLOAT or T_DOUBLE handler, result must be in ST0 -
@@ -1079,15 +1079,8 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
__ call(rax);
// result potentially in rax or xmm0
// Depending on runtime options, either restore the MXCSR
// register after returning from the JNI Call or verify that
// it wasn't changed during -Xcheck:jni.
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
}
else if (CheckJNICalls) {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry())));
}
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// NOTE: The order of these pushes is known to frame::interpreter_frame_result
// in order to extract the result of a method call. If the order of these
@@ -228,10 +228,16 @@ static jlong *float_signflip_pool = double_quadword(&fp_signmask_pool[3*2], CON
static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000));
// Offset hacking within calls.
static int pre_call_FPU_size() {
if (Compile::current()->in_24_bit_fp_mode())
return 6; // fldcw
return 0;
static int pre_call_resets_size() {
int size = 0;
Compile* C = Compile::current();
if (C->in_24_bit_fp_mode()) {
size += 6; // fldcw
}
if (C->max_vector_size() > 16) {
size += 3; // vzeroupper
}
return size;
}
static int preserve_SP_size() {
@@ -242,21 +248,21 @@ static int preserve_SP_size() {
// from the start of the call to the point where the return address
// will point.
int MachCallStaticJavaNode::ret_addr_offset() {
int offset = 5 + pre_call_FPU_size(); // 5 bytes from start of call to where return address points
int offset = 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points
if (_method_handle_invoke)
offset += preserve_SP_size();
return offset;
}
int MachCallDynamicJavaNode::ret_addr_offset() {
return 10 + pre_call_FPU_size(); // 10 bytes from start of call to where return address points
return 10 + pre_call_resets_size(); // 10 bytes from start of call to where return address points
}
static int sizeof_FFree_Float_Stack_All = -1;
int MachCallRuntimeNode::ret_addr_offset() {
assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already");
return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size();
return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size();
}
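
Worked example of the bookkeeping above: with both resets active (24-bit FP mode and wide vectors), pre_call_resets_size() returns 6 + 3 = 9 bytes, so MachCallStaticJavaNode::ret_addr_offset() reports 5 + 9 = 14 (plus preserve_SP_size() for a method-handle invoke) and MachCallDynamicJavaNode::ret_addr_offset() reports 10 + 9 = 19.
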
// Indicate if the safepoint node needs the polling page as an input.
@@ -272,7 +278,7 @@ bool SafePointNode::needs_polling_address_input() {
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
current_offset += pre_call_FPU_size(); // skip fldcw, if any
current_offset += pre_call_resets_size(); // skip fldcw, if any
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -280,7 +286,7 @@ int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
current_offset += pre_call_FPU_size(); // skip fldcw, if any
current_offset += pre_call_resets_size(); // skip fldcw, if any
current_offset += preserve_SP_size(); // skip mov rbp, rsp
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
@@ -289,7 +295,7 @@ int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
current_offset += pre_call_FPU_size(); // skip fldcw, if any
current_offset += pre_call_resets_size(); // skip fldcw, if any
current_offset += 5; // skip MOV instruction
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
@@ -583,16 +589,20 @@ void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
// Remove two words for return addr and rbp,
framesize -= 2*wordSize;
if( C->in_24_bit_fp_mode() ) {
if (C->max_vector_size() > 16) {
st->print("VZEROUPPER");
st->cr(); st->print("\t");
}
if (C->in_24_bit_fp_mode()) {
st->print("FLDCW standard control word");
st->cr(); st->print("\t");
}
if( framesize ) {
if (framesize) {
st->print("ADD ESP,%d\t# Destroy frame",framesize);
st->cr(); st->print("\t");
}
st->print_cr("POPL EBP"); st->print("\t");
if( do_polling() && C->is_method_compilation() ) {
if (do_polling() && C->is_method_compilation()) {
st->print("TEST PollPage,EAX\t! Poll Safepoint");
st->cr(); st->print("\t");
}
@@ -602,8 +612,14 @@ void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
Compile *C = ra_->C;
if (C->max_vector_size() > 16) {
// Clear upper bits of YMM registers when current compiled code uses
// wide vectors to avoid AVX <-> SSE transition penalty during call.
MacroAssembler masm(&cbuf);
masm.vzeroupper();
}
// If method set FPU control word, restore to standard control word
if( C->in_24_bit_fp_mode() ) {
if (C->in_24_bit_fp_mode()) {
MacroAssembler masm(&cbuf);
masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
}
@@ -615,12 +631,11 @@ void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
// Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
if( framesize >= 128 ) {
if (framesize >= 128) {
emit_opcode(cbuf, 0x81); // add SP, #framesize
emit_rm(cbuf, 0x3, 0x00, ESP_enc);
emit_d32(cbuf, framesize);
}
else if( framesize ) {
} else if (framesize) {
emit_opcode(cbuf, 0x83); // add SP, #framesize
emit_rm(cbuf, 0x3, 0x00, ESP_enc);
emit_d8(cbuf, framesize);
@@ -628,7 +643,7 @@ void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
emit_opcode(cbuf, 0x58 | EBP_enc);
if( do_polling() && C->is_method_compilation() ) {
if (do_polling() && C->is_method_compilation()) {
cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0);
emit_opcode(cbuf,0x85);
emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
@@ -640,7 +655,8 @@ uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
Compile *C = ra_->C;
// If method set FPU control word, restore to standard control word
int size = C->in_24_bit_fp_mode() ? 6 : 0;
if( do_polling() && C->is_method_compilation() ) size += 6;
if (C->max_vector_size() > 16) size += 3; // vzeroupper
if (do_polling() && C->is_method_compilation()) size += 6;
int framesize = C->frame_slots() << LogBytesPerInt;
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
@@ -649,7 +665,7 @@ uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
size++; // popl rbp,
if( framesize >= 128 ) {
if (framesize >= 128) {
size += 6;
} else {
size += framesize ? 3 : 0;
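
Worked example of the epilog size computation above: a method that uses wide vectors, polls on return, and pops a 128-byte frame needs 3 (vzeroupper) + 6 (poll test) + 1 (popl rbp) + 6 (add esp, imm32) = 16 bytes, plus 6 more if it also ran in 24-bit FP mode (the fldcw restore).
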
@@ -1853,20 +1869,26 @@ encode %{
%}
enc_class pre_call_FPU %{
enc_class pre_call_resets %{
// If method sets FPU control word restore it here
debug_only(int off0 = cbuf.insts_size());
if( Compile::current()->in_24_bit_fp_mode() ) {
MacroAssembler masm(&cbuf);
masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
if (ra_->C->in_24_bit_fp_mode()) {
MacroAssembler _masm(&cbuf);
__ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
}
if (ra_->C->max_vector_size() > 16) {
// Clear upper bits of YMM registers when current compiled code uses
// wide vectors to avoid AVX <-> SSE transition penalty during call.
MacroAssembler _masm(&cbuf);
__ vzeroupper();
}
debug_only(int off1 = cbuf.insts_size());
assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction");
assert(off1 - off0 == pre_call_resets_size(), "correct size prediction");
%}
enc_class post_call_FPU %{
// If method sets FPU control word do it here also
if( Compile::current()->in_24_bit_fp_mode() ) {
if (Compile::current()->in_24_bit_fp_mode()) {
MacroAssembler masm(&cbuf);
masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
}
@@ -1877,17 +1899,17 @@ encode %{
// who we intended to call.
cbuf.set_insts_mark();
$$$emit8$primary;
if ( !_method ) {
if (!_method) {
emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
runtime_call_Relocation::spec(), RELOC_IMM32 );
} else if(_optimized_virtual) {
} else if (_optimized_virtual) {
emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
opt_virtual_call_Relocation::spec(), RELOC_IMM32 );
} else {
emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
static_call_Relocation::spec(), RELOC_IMM32 );
}
if( _method ) { // Emit stub for static call
if (_method) { // Emit stub for static call
emit_java_to_interp(cbuf);
}
%}
@@ -12828,7 +12850,7 @@ instruct CallStaticJavaDirect(method meth) %{
ins_cost(300);
format %{ "CALL,static " %}
opcode(0xE8); /* E8 cd */
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
Java_Static_Call( meth ),
call_epilog,
post_call_FPU );
@@ -12849,7 +12871,7 @@ instruct CallStaticJavaHandle(method meth, eBPRegP ebp_mh_SP_save) %{
ins_cost(300);
format %{ "CALL,static/MethodHandle " %}
opcode(0xE8); /* E8 cd */
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
preserve_SP,
Java_Static_Call( meth ),
restore_SP,
@@ -12870,7 +12892,7 @@ instruct CallDynamicJavaDirect(method meth) %{
format %{ "MOV EAX,(oop)-1\n\t"
"CALL,dynamic" %}
opcode(0xE8); /* E8 cd */
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
Java_Dynamic_Call( meth ),
call_epilog,
post_call_FPU );
@@ -12887,7 +12909,7 @@ instruct CallRuntimeDirect(method meth) %{
format %{ "CALL,runtime " %}
opcode(0xE8); /* E8 cd */
// Use FFREEs to clear entries in float stack
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
FFree_Float_Stack_All,
Java_To_Runtime( meth ),
post_call_FPU );
@@ -12902,7 +12924,7 @@ instruct CallLeafDirect(method meth) %{
ins_cost(300);
format %{ "CALL_LEAF,runtime " %}
opcode(0xE8); /* E8 cd */
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
FFree_Float_Stack_All,
Java_To_Runtime( meth ),
Verify_FPU_For_Leaf, post_call_FPU );
@@ -399,6 +399,9 @@ source %{
static int preserve_SP_size() {
return 3; // rex.w, op, rm(reg/reg)
}
static int clear_avx_size() {
return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
}
// !!!!! Special hack to get all types of calls to specify the byte offset
// from the start of the call to the point where the return address
@@ -406,6 +409,7 @@ static int preserve_SP_size() {
int MachCallStaticJavaNode::ret_addr_offset()
{
int offset = 5; // 5 bytes from start of call to where return address points
offset += clear_avx_size();
if (_method_handle_invoke)
offset += preserve_SP_size();
return offset;
@@ -413,11 +417,16 @@ int MachCallStaticJavaNode::ret_addr_offset()
int MachCallDynamicJavaNode::ret_addr_offset()
{
return 15; // 15 bytes from start of call to where return address points
int offset = 15; // 15 bytes from start of call to where return address points
offset += clear_avx_size();
return offset;
}
// In os_cpu .ad file
// int MachCallRuntimeNode::ret_addr_offset()
int MachCallRuntimeNode::ret_addr_offset() {
int offset = 13; // movq r10,#addr; callq (r10)
offset += clear_avx_size();
return offset;
}
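
Worked example: the 13 bytes break down as 10 for movq r10, #addr (REX.WB prefix, opcode 0xB8|r10, 8-byte immediate) and 3 for call (r10) (REX.B, 0xFF, ModRM), matching the byte-by-byte emitter this change deletes from the os_cpu .ad files below; with wide vectors, clear_avx_size() adds 3 for vzeroupper, giving 16.
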
// Indicate if the safepoint node needs the polling page as an input,
// it does if the polling page is more than disp32 away.
@@ -434,6 +443,7 @@ bool SafePointNode::needs_polling_address_input()
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaDirectNode::compute_padding(int current_offset) const
{
current_offset += clear_avx_size(); // skip vzeroupper
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -443,6 +453,7 @@ int CallStaticJavaDirectNode::compute_padding(int current_offset) const
int CallStaticJavaHandleNode::compute_padding(int current_offset) const
{
current_offset += preserve_SP_size(); // skip mov rbp, rsp
current_offset += clear_avx_size(); // skip vzeroupper
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -451,6 +462,7 @@ int CallStaticJavaHandleNode::compute_padding(int current_offset) const
// ensure that it does not span a cache line so that it can be patched.
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
{
current_offset += clear_avx_size(); // skip vzeroupper
current_offset += 11; // skip movq instruction + call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -764,6 +776,11 @@ int MachPrologNode::reloc() const
void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
Compile* C = ra_->C;
if (C->max_vector_size() > 16) {
st->print("vzeroupper");
st->cr(); st->print("\t");
}
int framesize = C->frame_slots() << LogBytesPerInt;
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
// Remove word for return adr already pushed
@@ -793,6 +810,13 @@ void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
Compile* C = ra_->C;
if (C->max_vector_size() > 16) {
// Clear upper bits of YMM registers when current compiled code uses
// wide vectors to avoid AVX <-> SSE transition penalty during call.
MacroAssembler _masm(&cbuf);
__ vzeroupper();
}
int framesize = C->frame_slots() << LogBytesPerInt;
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
// Remove word for return adr already pushed
@@ -2008,6 +2032,25 @@ encode %{
__ bind(miss);
%}
enc_class clear_avx %{
debug_only(int off0 = cbuf.insts_size());
if (ra_->C->max_vector_size() > 16) {
// Clear upper bits of YMM registers when current compiled code uses
// wide vectors to avoid AVX <-> SSE transition penalty during call.
MacroAssembler _masm(&cbuf);
__ vzeroupper();
}
debug_only(int off1 = cbuf.insts_size());
assert(off1 - off0 == clear_avx_size(), "correct size prediction");
%}
enc_class Java_To_Runtime(method meth) %{
// No relocation needed
MacroAssembler _masm(&cbuf);
__ mov64(r10, (int64_t) $meth$$method);
__ call(r10);
%}
enc_class Java_To_Interpreter(method meth)
%{
// CALL Java_To_Interpreter
@@ -11366,7 +11409,7 @@ instruct CallStaticJavaDirect(method meth) %{
ins_cost(300);
format %{ "call,static " %}
opcode(0xE8); /* E8 cd */
ins_encode(Java_Static_Call(meth), call_epilog);
ins_encode(clear_avx, Java_Static_Call(meth), call_epilog);
ins_pipe(pipe_slow);
ins_alignment(4);
%}
@@ -11384,7 +11427,7 @@ instruct CallStaticJavaHandle(method meth, rbp_RegP rbp_mh_SP_save) %{
ins_cost(300);
format %{ "call,static/MethodHandle " %}
opcode(0xE8); /* E8 cd */
ins_encode(preserve_SP,
ins_encode(clear_avx, preserve_SP,
Java_Static_Call(meth),
restore_SP,
call_epilog);
@@ -11403,7 +11446,7 @@ instruct CallDynamicJavaDirect(method meth)
ins_cost(300);
format %{ "movq rax, #Universe::non_oop_word()\n\t"
"call,dynamic " %}
ins_encode(Java_Dynamic_Call(meth), call_epilog);
ins_encode(clear_avx, Java_Dynamic_Call(meth), call_epilog);
ins_pipe(pipe_slow);
ins_alignment(4);
%}
@@ -11416,8 +11459,7 @@ instruct CallRuntimeDirect(method meth)
ins_cost(300);
format %{ "call,runtime " %}
opcode(0xE8); /* E8 cd */
ins_encode(Java_To_Runtime(meth));
ins_encode(clear_avx, Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
@@ -11429,8 +11471,7 @@ instruct CallLeafDirect(method meth)
ins_cost(300);
format %{ "call_leaf,runtime " %}
opcode(0xE8); /* E8 cd */
ins_encode(Java_To_Runtime(meth));
ins_encode(clear_avx, Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
@@ -11442,7 +11483,6 @@ instruct CallLeafNoFPDirect(method meth)
ins_cost(300);
format %{ "call_leaf_nofp,runtime " %}
opcode(0xE8); /* E8 cd */
ins_encode(Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
@@ -55,20 +55,6 @@ encode %{
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
enc_class Java_To_Runtime(method meth) %{
// No relocation needed
// movq r10, <meth>
emit_opcode(cbuf, Assembler::REX_WB);
emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
emit_d64(cbuf, (int64_t) $meth$$method);
// call (r10)
emit_opcode(cbuf, Assembler::REX_B);
emit_opcode(cbuf, 0xFF);
emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
%}
%}
@@ -76,8 +62,4 @@ encode %{
source %{
int MachCallRuntimeNode::ret_addr_offset() {
return 13; // movq r10,#addr; callq (r10)
}
%}
@@ -55,20 +55,6 @@ encode %{
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
enc_class Java_To_Runtime(method meth) %{
// No relocation needed
// movq r10, <meth>
emit_opcode(cbuf, Assembler::REX_WB);
emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
emit_d64(cbuf, (int64_t) $meth$$method);
// call (r10)
emit_opcode(cbuf, Assembler::REX_B);
emit_opcode(cbuf, 0xFF);
emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
%}
%}
@@ -76,8 +62,4 @@ encode %{
source %{
int MachCallRuntimeNode::ret_addr_offset() {
return 13; // movq r10,#addr; callq (r10)
}
%}
@@ -54,39 +54,10 @@ encode %{
// main source block for now. In future, we can generalize this by
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
enc_class Java_To_Runtime(method meth) %{
// No relocation needed
// movq r10, <meth>
emit_opcode(cbuf, Assembler::REX_WB);
emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
emit_d64(cbuf, (int64_t) $meth$$method);
// call (r10)
emit_opcode(cbuf, Assembler::REX_B);
emit_opcode(cbuf, 0xFF);
emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
%}
enc_class post_call_verify_mxcsr %{
MacroAssembler _masm(&cbuf);
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::amd64::mxcsr_std()));
}
else if (CheckJNICalls) {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::amd64::verify_mxcsr_entry())));
}
%}
%}
// Platform dependent source
source %{
int MachCallRuntimeNode::ret_addr_offset() {
return 13; // movq r10,#addr; callq (r10)
}
%}
@@ -53,30 +53,11 @@ encode %{
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
enc_class Java_To_Runtime (method meth) %{ // CALL Java_To_Runtime
// No relocation needed
// movq r10, <meth>
emit_opcode(cbuf, Assembler::REX_WB);
emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
emit_d64(cbuf, (int64_t) $meth$$method);
// call (r10)
emit_opcode(cbuf, Assembler::REX_B);
emit_opcode(cbuf, 0xFF);
emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
%}
%}
//
// Platform dependent source
//
source %{
int MachCallRuntimeNode::ret_addr_offset()
{
return 13; // movq r10,#addr; callq (r10)
}
source %{
%}