diff --git a/src/cpu/x86/vm/cppInterpreter_x86.cpp b/src/cpu/x86/vm/cppInterpreter_x86.cpp index c568c6f4d3afe1e80b46f90045ffb8a256dcd9ba..24e6694082efc102f3f744671eb34ecde4222290 100644 --- a/src/cpu/x86/vm/cppInterpreter_x86.cpp +++ b/src/cpu/x86/vm/cppInterpreter_x86.cpp @@ -1299,25 +1299,8 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) { __ push(rdx); #endif // _LP64 - // Either restore the MXCSR register after returning from the JNI Call - // or verify that it wasn't changed. - if (VM_Version::supports_sse()) { - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std())); - } - else if (CheckJNICalls ) { - __ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); - } - } - -#ifndef _LP64 - // Either restore the x87 floating pointer control word after returning - // from the JNI call or verify that it wasn't changed. - if (CheckJNICalls) { - __ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); - } -#endif // _LP64 - + // Verify or restore cpu control state after JNI call + __ restore_cpu_control_state_after_jni(); // change thread state __ movl(Address(thread, JavaThread::thread_state_offset()), _thread_in_native_trans); diff --git a/src/cpu/x86/vm/macroAssembler_x86.cpp b/src/cpu/x86/vm/macroAssembler_x86.cpp index b64518a7bc1fbda5997f44eb2d6fbc6443818957..98c93f99a0f9caadeadfdf14355a1c9331a47a86 100644 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp @@ -4765,6 +4765,31 @@ void MacroAssembler::verify_FPU(int stack_depth, const char* s) { pop_CPU_state(); } +void MacroAssembler::restore_cpu_control_state_after_jni() { + // Either restore the MXCSR register after returning from the JNI Call + // or verify that it wasn't changed (with -Xcheck:jni flag). + if (VM_Version::supports_sse()) { + if (RestoreMXCSROnJNICalls) { + ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std())); + } else if (CheckJNICalls) { + call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); + } + } + if (VM_Version::supports_avx()) { + // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty. + vzeroupper(); + } + +#ifndef _LP64 + // Either restore the x87 floating pointer control word after returning + // from the JNI call or verify that it wasn't changed. + if (CheckJNICalls) { + call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); + } +#endif // _LP64 +} + + void MacroAssembler::load_klass(Register dst, Register src) { #ifdef _LP64 if (UseCompressedKlassPointers) { @@ -5759,6 +5784,8 @@ void MacroAssembler::string_compare(Register str1, Register str2, addptr(result, stride2); subl(cnt2, stride2); jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); + // clean upper bits of YMM registers + vzeroupper(); // compare wide vectors tail bind(COMPARE_WIDE_TAIL); @@ -5772,6 +5799,8 @@ void MacroAssembler::string_compare(Register str1, Register str2, // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. bind(VECTOR_NOT_EQUAL); + // clean upper bits of YMM registers + vzeroupper(); lea(str1, Address(str1, result, scale)); lea(str2, Address(str2, result, scale)); jmp(COMPARE_16_CHARS); @@ -6028,6 +6057,10 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist // That's it bind(DONE); + if (UseAVX >= 2) { + // clean upper bits of YMM registers + vzeroupper(); + } } void MacroAssembler::generate_fill(BasicType t, bool aligned, @@ -6157,6 +6190,10 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned, vmovdqu(Address(to, 0), xtmp); addptr(to, 32); subl(count, 8 << shift); + + BIND(L_check_fill_8_bytes); + // clean upper bits of YMM registers + vzeroupper(); } else { // Fill 32-byte chunks pshufd(xtmp, xtmp, 0); @@ -6180,8 +6217,9 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned, addptr(to, 32); subl(count, 8 << shift); jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); + + BIND(L_check_fill_8_bytes); } - BIND(L_check_fill_8_bytes); addl(count, 8 << shift); jccb(Assembler::zero, L_exit); jmpb(L_fill_8_bytes); @@ -6316,6 +6354,10 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, jccb(Assembler::lessEqual, L_copy_16_chars); bind(L_copy_16_chars_exit); + if (UseAVX >= 2) { + // clean upper bits of YMM registers + vzeroupper(); + } subptr(len, 8); jccb(Assembler::greater, L_copy_8_chars_exit); diff --git a/src/cpu/x86/vm/macroAssembler_x86.hpp b/src/cpu/x86/vm/macroAssembler_x86.hpp index 9500f3164fa150f8c9cb734cd243af9386dfa6ed..e9f409dc50042e5a2c7eede72d23b7066936f54f 100644 --- a/src/cpu/x86/vm/macroAssembler_x86.hpp +++ b/src/cpu/x86/vm/macroAssembler_x86.hpp @@ -582,6 +582,9 @@ class MacroAssembler: public Assembler { // only if +VerifyFPU void verify_FPU(int stack_depth, const char* s = "illegal FPU state"); + // Verify or restore cpu control state after JNI call + void restore_cpu_control_state_after_jni(); + // prints msg, dumps registers and stops execution void stop(const char* msg); diff --git a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp index dc705421ca9605ab85a9d528379c228e02004e24..0fce7952a70dac29bec9808b2047f4b97951e811 100644 --- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp +++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp @@ -2065,6 +2065,9 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, __ call(RuntimeAddress(native_func)); + // Verify or restore cpu control state after JNI call + __ restore_cpu_control_state_after_jni(); + // WARNING - on Windows Java Natives use pascal calling convention and pop the // arguments off of the stack. We could just re-adjust the stack pointer here // and continue to do SP relative addressing but we instead switch to FP diff --git a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp index 50255eeef0ea661354598458f6175cc156d65570..db20c1f23888aefa77c37252a658ad0aae345620 100644 --- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp +++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp @@ -2315,16 +2315,8 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, __ call(RuntimeAddress(native_func)); - // Either restore the MXCSR register after returning from the JNI Call - // or verify that it wasn't changed. - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std())); - - } - else if (CheckJNICalls ) { - __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry()))); - } - + // Verify or restore cpu control state after JNI call + __ restore_cpu_control_state_after_jni(); // Unpack native results. switch (ret_type) { diff --git a/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/src/cpu/x86/vm/stubGenerator_x86_32.cpp index e56bb2266d00ee12331fc34dbce5276e55439bd0..f3a91d03c9991bbf34187457f8f00fa623cae9b2 100644 --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp @@ -835,6 +835,11 @@ class StubGenerator: public StubCodeGenerator { __ BIND(L_copy_64_bytes); __ subl(qword_count, 8); __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop); + + if (UseUnalignedLoadStores && (UseAVX >= 2)) { + // clean upper bits of YMM registers + __ vzeroupper(); + } __ addl(qword_count, 8); __ jccb(Assembler::zero, L_exit); // diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp index c6b94e243bc9522459382ddfd45598ead511f090..ace545383d7030682b2d4c398b92273124886c1a 100644 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -1331,6 +1331,10 @@ class StubGenerator: public StubCodeGenerator { } __ addptr(qword_count, 4); __ BIND(L_end); + if (UseAVX >= 2) { + // clean upper bits of YMM registers + __ vzeroupper(); + } } else { // Copy 32-bytes per iteration __ BIND(L_loop); @@ -1404,6 +1408,10 @@ class StubGenerator: public StubCodeGenerator { } __ subptr(qword_count, 4); __ BIND(L_end); + if (UseAVX >= 2) { + // clean upper bits of YMM registers + __ vzeroupper(); + } } else { // Copy 32-bytes per iteration __ BIND(L_loop); diff --git a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp index 5df98394cf528d55ad6d399c8f1ca1ca96d9cb7c..fb13a44045a00487792bc9b9ad8c9ac8e312c2cd 100644 --- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp +++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp @@ -1080,22 +1080,8 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) { // result potentially in rdx:rax or ST0 - // Either restore the MXCSR register after returning from the JNI Call - // or verify that it wasn't changed. - if (VM_Version::supports_sse()) { - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std())); - } - else if (CheckJNICalls ) { - __ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); - } - } - - // Either restore the x87 floating pointer control word after returning - // from the JNI call or verify that it wasn't changed. - if (CheckJNICalls) { - __ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); - } + // Verify or restore cpu control state after JNI call + __ restore_cpu_control_state_after_jni(); // save potential result in ST(0) & rdx:rax // (if result handler is the T_FLOAT or T_DOUBLE handler, result must be in ST0 - diff --git a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp index c446cb3c9b17843ea97a5316aad8a8395cdfea90..6b3f7b6ba26bc0ed52679829d370b4231b36657d 100644 --- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp +++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp @@ -1079,15 +1079,8 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) { __ call(rax); // result potentially in rax or xmm0 - // Depending on runtime options, either restore the MXCSR - // register after returning from the JNI Call or verify that - // it wasn't changed during -Xcheck:jni. - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std())); - } - else if (CheckJNICalls) { - __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry()))); - } + // Verify or restore cpu control state after JNI call + __ restore_cpu_control_state_after_jni(); // NOTE: The order of these pushes is known to frame::interpreter_frame_result // in order to extract the result of a method call. If the order of these diff --git a/src/cpu/x86/vm/x86_32.ad b/src/cpu/x86/vm/x86_32.ad index 6ceb50cee845e4361602e88aa4d77384a0c2419f..67f33d3ba27be9dc64cc0df92f4a7709ab0bce00 100644 --- a/src/cpu/x86/vm/x86_32.ad +++ b/src/cpu/x86/vm/x86_32.ad @@ -228,10 +228,16 @@ static jlong *float_signflip_pool = double_quadword(&fp_signmask_pool[3*2], CON static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000)); // Offset hacking within calls. -static int pre_call_FPU_size() { - if (Compile::current()->in_24_bit_fp_mode()) - return 6; // fldcw - return 0; +static int pre_call_resets_size() { + int size = 0; + Compile* C = Compile::current(); + if (C->in_24_bit_fp_mode()) { + size += 6; // fldcw + } + if (C->max_vector_size() > 16) { + size += 3; // vzeroupper + } + return size; } static int preserve_SP_size() { @@ -242,21 +248,21 @@ static int preserve_SP_size() { // from the start of the call to the point where the return address // will point. int MachCallStaticJavaNode::ret_addr_offset() { - int offset = 5 + pre_call_FPU_size(); // 5 bytes from start of call to where return address points + int offset = 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points if (_method_handle_invoke) offset += preserve_SP_size(); return offset; } int MachCallDynamicJavaNode::ret_addr_offset() { - return 10 + pre_call_FPU_size(); // 10 bytes from start of call to where return address points + return 10 + pre_call_resets_size(); // 10 bytes from start of call to where return address points } static int sizeof_FFree_Float_Stack_All = -1; int MachCallRuntimeNode::ret_addr_offset() { assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already"); - return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size(); + return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size(); } // Indicate if the safepoint node needs the polling page as an input. @@ -272,7 +278,7 @@ bool SafePointNode::needs_polling_address_input() { // The address of the call instruction needs to be 4-byte aligned to // ensure that it does not span a cache line so that it can be patched. int CallStaticJavaDirectNode::compute_padding(int current_offset) const { - current_offset += pre_call_FPU_size(); // skip fldcw, if any + current_offset += pre_call_resets_size(); // skip fldcw, if any current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -280,7 +286,7 @@ int CallStaticJavaDirectNode::compute_padding(int current_offset) const { // The address of the call instruction needs to be 4-byte aligned to // ensure that it does not span a cache line so that it can be patched. int CallStaticJavaHandleNode::compute_padding(int current_offset) const { - current_offset += pre_call_FPU_size(); // skip fldcw, if any + current_offset += pre_call_resets_size(); // skip fldcw, if any current_offset += preserve_SP_size(); // skip mov rbp, rsp current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; @@ -289,7 +295,7 @@ int CallStaticJavaHandleNode::compute_padding(int current_offset) const { // The address of the call instruction needs to be 4-byte aligned to // ensure that it does not span a cache line so that it can be patched. int CallDynamicJavaDirectNode::compute_padding(int current_offset) const { - current_offset += pre_call_FPU_size(); // skip fldcw, if any + current_offset += pre_call_resets_size(); // skip fldcw, if any current_offset += 5; // skip MOV instruction current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; @@ -583,16 +589,20 @@ void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const { // Remove two words for return addr and rbp, framesize -= 2*wordSize; - if( C->in_24_bit_fp_mode() ) { + if (C->max_vector_size() > 16) { + st->print("VZEROUPPER"); + st->cr(); st->print("\t"); + } + if (C->in_24_bit_fp_mode()) { st->print("FLDCW standard control word"); st->cr(); st->print("\t"); } - if( framesize ) { + if (framesize) { st->print("ADD ESP,%d\t# Destroy frame",framesize); st->cr(); st->print("\t"); } st->print_cr("POPL EBP"); st->print("\t"); - if( do_polling() && C->is_method_compilation() ) { + if (do_polling() && C->is_method_compilation()) { st->print("TEST PollPage,EAX\t! Poll Safepoint"); st->cr(); st->print("\t"); } @@ -602,8 +612,14 @@ void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const { void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { Compile *C = ra_->C; + if (C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler masm(&cbuf); + masm.vzeroupper(); + } // If method set FPU control word, restore to standard control word - if( C->in_24_bit_fp_mode() ) { + if (C->in_24_bit_fp_mode()) { MacroAssembler masm(&cbuf); masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); } @@ -615,12 +631,11 @@ void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here - if( framesize >= 128 ) { + if (framesize >= 128) { emit_opcode(cbuf, 0x81); // add SP, #framesize emit_rm(cbuf, 0x3, 0x00, ESP_enc); emit_d32(cbuf, framesize); - } - else if( framesize ) { + } else if (framesize) { emit_opcode(cbuf, 0x83); // add SP, #framesize emit_rm(cbuf, 0x3, 0x00, ESP_enc); emit_d8(cbuf, framesize); @@ -628,7 +643,7 @@ void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { emit_opcode(cbuf, 0x58 | EBP_enc); - if( do_polling() && C->is_method_compilation() ) { + if (do_polling() && C->is_method_compilation()) { cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0); emit_opcode(cbuf,0x85); emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX @@ -640,7 +655,8 @@ uint MachEpilogNode::size(PhaseRegAlloc *ra_) const { Compile *C = ra_->C; // If method set FPU control word, restore to standard control word int size = C->in_24_bit_fp_mode() ? 6 : 0; - if( do_polling() && C->is_method_compilation() ) size += 6; + if (C->max_vector_size() > 16) size += 3; // vzeroupper + if (do_polling() && C->is_method_compilation()) size += 6; int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); @@ -649,7 +665,7 @@ uint MachEpilogNode::size(PhaseRegAlloc *ra_) const { size++; // popl rbp, - if( framesize >= 128 ) { + if (framesize >= 128) { size += 6; } else { size += framesize ? 3 : 0; @@ -1853,20 +1869,26 @@ encode %{ %} - enc_class pre_call_FPU %{ + enc_class pre_call_resets %{ // If method sets FPU control word restore it here debug_only(int off0 = cbuf.insts_size()); - if( Compile::current()->in_24_bit_fp_mode() ) { - MacroAssembler masm(&cbuf); - masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); + if (ra_->C->in_24_bit_fp_mode()) { + MacroAssembler _masm(&cbuf); + __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); + } + if (ra_->C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler _masm(&cbuf); + __ vzeroupper(); } debug_only(int off1 = cbuf.insts_size()); - assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction"); + assert(off1 - off0 == pre_call_resets_size(), "correct size prediction"); %} enc_class post_call_FPU %{ // If method sets FPU control word do it here also - if( Compile::current()->in_24_bit_fp_mode() ) { + if (Compile::current()->in_24_bit_fp_mode()) { MacroAssembler masm(&cbuf); masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24())); } @@ -1877,17 +1899,17 @@ encode %{ // who we intended to call. cbuf.set_insts_mark(); $$$emit8$primary; - if ( !_method ) { + if (!_method) { emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), runtime_call_Relocation::spec(), RELOC_IMM32 ); - } else if(_optimized_virtual) { + } else if (_optimized_virtual) { emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), opt_virtual_call_Relocation::spec(), RELOC_IMM32 ); } else { emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4), static_call_Relocation::spec(), RELOC_IMM32 ); } - if( _method ) { // Emit stub for static call + if (_method) { // Emit stub for static call emit_java_to_interp(cbuf); } %} @@ -12828,7 +12850,7 @@ instruct CallStaticJavaDirect(method meth) %{ ins_cost(300); format %{ "CALL,static " %} opcode(0xE8); /* E8 cd */ - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, Java_Static_Call( meth ), call_epilog, post_call_FPU ); @@ -12849,7 +12871,7 @@ instruct CallStaticJavaHandle(method meth, eBPRegP ebp_mh_SP_save) %{ ins_cost(300); format %{ "CALL,static/MethodHandle " %} opcode(0xE8); /* E8 cd */ - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, preserve_SP, Java_Static_Call( meth ), restore_SP, @@ -12870,7 +12892,7 @@ instruct CallDynamicJavaDirect(method meth) %{ format %{ "MOV EAX,(oop)-1\n\t" "CALL,dynamic" %} opcode(0xE8); /* E8 cd */ - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, Java_Dynamic_Call( meth ), call_epilog, post_call_FPU ); @@ -12887,7 +12909,7 @@ instruct CallRuntimeDirect(method meth) %{ format %{ "CALL,runtime " %} opcode(0xE8); /* E8 cd */ // Use FFREEs to clear entries in float stack - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, FFree_Float_Stack_All, Java_To_Runtime( meth ), post_call_FPU ); @@ -12902,7 +12924,7 @@ instruct CallLeafDirect(method meth) %{ ins_cost(300); format %{ "CALL_LEAF,runtime " %} opcode(0xE8); /* E8 cd */ - ins_encode( pre_call_FPU, + ins_encode( pre_call_resets, FFree_Float_Stack_All, Java_To_Runtime( meth ), Verify_FPU_For_Leaf, post_call_FPU ); diff --git a/src/cpu/x86/vm/x86_64.ad b/src/cpu/x86/vm/x86_64.ad index 7c902c4e31c646f98cc216cc10ddc924c41991d1..77dc5b01131e6aca80465533f8b759850ee740da 100644 --- a/src/cpu/x86/vm/x86_64.ad +++ b/src/cpu/x86/vm/x86_64.ad @@ -399,6 +399,9 @@ source %{ static int preserve_SP_size() { return 3; // rex.w, op, rm(reg/reg) } +static int clear_avx_size() { + return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper +} // !!!!! Special hack to get all types of calls to specify the byte offset // from the start of the call to the point where the return address @@ -406,6 +409,7 @@ static int preserve_SP_size() { int MachCallStaticJavaNode::ret_addr_offset() { int offset = 5; // 5 bytes from start of call to where return address points + offset += clear_avx_size(); if (_method_handle_invoke) offset += preserve_SP_size(); return offset; @@ -413,11 +417,16 @@ int MachCallStaticJavaNode::ret_addr_offset() int MachCallDynamicJavaNode::ret_addr_offset() { - return 15; // 15 bytes from start of call to where return address points + int offset = 15; // 15 bytes from start of call to where return address points + offset += clear_avx_size(); + return offset; } -// In os_cpu .ad file -// int MachCallRuntimeNode::ret_addr_offset() +int MachCallRuntimeNode::ret_addr_offset() { + int offset = 13; // movq r10,#addr; callq (r10) + offset += clear_avx_size(); + return offset; +} // Indicate if the safepoint node needs the polling page as an input, // it does if the polling page is more than disp32 away. @@ -434,6 +443,7 @@ bool SafePointNode::needs_polling_address_input() // ensure that it does not span a cache line so that it can be patched. int CallStaticJavaDirectNode::compute_padding(int current_offset) const { + current_offset += clear_avx_size(); // skip vzeroupper current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -443,6 +453,7 @@ int CallStaticJavaDirectNode::compute_padding(int current_offset) const int CallStaticJavaHandleNode::compute_padding(int current_offset) const { current_offset += preserve_SP_size(); // skip mov rbp, rsp + current_offset += clear_avx_size(); // skip vzeroupper current_offset += 1; // skip call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -451,6 +462,7 @@ int CallStaticJavaHandleNode::compute_padding(int current_offset) const // ensure that it does not span a cache line so that it can be patched. int CallDynamicJavaDirectNode::compute_padding(int current_offset) const { + current_offset += clear_avx_size(); // skip vzeroupper current_offset += 11; // skip movq instruction + call opcode byte return round_to(current_offset, alignment_required()) - current_offset; } @@ -764,6 +776,11 @@ int MachPrologNode::reloc() const void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const { Compile* C = ra_->C; + if (C->max_vector_size() > 16) { + st->print("vzeroupper"); + st->cr(); st->print("\t"); + } + int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); // Remove word for return adr already pushed @@ -793,6 +810,13 @@ void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const { Compile* C = ra_->C; + if (C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler _masm(&cbuf); + __ vzeroupper(); + } + int framesize = C->frame_slots() << LogBytesPerInt; assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); // Remove word for return adr already pushed @@ -2008,6 +2032,25 @@ encode %{ __ bind(miss); %} + enc_class clear_avx %{ + debug_only(int off0 = cbuf.insts_size()); + if (ra_->C->max_vector_size() > 16) { + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler _masm(&cbuf); + __ vzeroupper(); + } + debug_only(int off1 = cbuf.insts_size()); + assert(off1 - off0 == clear_avx_size(), "correct size prediction"); + %} + + enc_class Java_To_Runtime(method meth) %{ + // No relocation needed + MacroAssembler _masm(&cbuf); + __ mov64(r10, (int64_t) $meth$$method); + __ call(r10); + %} + enc_class Java_To_Interpreter(method meth) %{ // CALL Java_To_Interpreter @@ -11366,7 +11409,7 @@ instruct CallStaticJavaDirect(method meth) %{ ins_cost(300); format %{ "call,static " %} opcode(0xE8); /* E8 cd */ - ins_encode(Java_Static_Call(meth), call_epilog); + ins_encode(clear_avx, Java_Static_Call(meth), call_epilog); ins_pipe(pipe_slow); ins_alignment(4); %} @@ -11384,7 +11427,7 @@ instruct CallStaticJavaHandle(method meth, rbp_RegP rbp_mh_SP_save) %{ ins_cost(300); format %{ "call,static/MethodHandle " %} opcode(0xE8); /* E8 cd */ - ins_encode(preserve_SP, + ins_encode(clear_avx, preserve_SP, Java_Static_Call(meth), restore_SP, call_epilog); @@ -11403,7 +11446,7 @@ instruct CallDynamicJavaDirect(method meth) ins_cost(300); format %{ "movq rax, #Universe::non_oop_word()\n\t" "call,dynamic " %} - ins_encode(Java_Dynamic_Call(meth), call_epilog); + ins_encode(clear_avx, Java_Dynamic_Call(meth), call_epilog); ins_pipe(pipe_slow); ins_alignment(4); %} @@ -11416,8 +11459,7 @@ instruct CallRuntimeDirect(method meth) ins_cost(300); format %{ "call,runtime " %} - opcode(0xE8); /* E8 cd */ - ins_encode(Java_To_Runtime(meth)); + ins_encode(clear_avx, Java_To_Runtime(meth)); ins_pipe(pipe_slow); %} @@ -11429,8 +11471,7 @@ instruct CallLeafDirect(method meth) ins_cost(300); format %{ "call_leaf,runtime " %} - opcode(0xE8); /* E8 cd */ - ins_encode(Java_To_Runtime(meth)); + ins_encode(clear_avx, Java_To_Runtime(meth)); ins_pipe(pipe_slow); %} @@ -11442,7 +11483,6 @@ instruct CallLeafNoFPDirect(method meth) ins_cost(300); format %{ "call_leaf_nofp,runtime " %} - opcode(0xE8); /* E8 cd */ ins_encode(Java_To_Runtime(meth)); ins_pipe(pipe_slow); %} diff --git a/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad b/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad index f4dc25d34fde24c33858edace46d1cf9b3e06dd7..254328e09714cba0a29fc3a1bd46553218923193 100644 --- a/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad +++ b/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad @@ -55,20 +55,6 @@ encode %{ // adding a syntax that specifies the sizes of fields in an order, // so that the adlc can build the emit functions automagically - enc_class Java_To_Runtime(method meth) %{ - // No relocation needed - - // movq r10, - emit_opcode(cbuf, Assembler::REX_WB); - emit_opcode(cbuf, 0xB8 | (R10_enc - 8)); - emit_d64(cbuf, (int64_t) $meth$$method); - - // call (r10) - emit_opcode(cbuf, Assembler::REX_B); - emit_opcode(cbuf, 0xFF); - emit_opcode(cbuf, 0xD0 | (R10_enc - 8)); - %} - %} @@ -76,8 +62,4 @@ encode %{ source %{ -int MachCallRuntimeNode::ret_addr_offset() { - return 13; // movq r10,#addr; callq (r10) -} - %} diff --git a/src/os_cpu/linux_x86/vm/linux_x86_64.ad b/src/os_cpu/linux_x86/vm/linux_x86_64.ad index cf9adf40e90c46fce142694629cfa18c54a4677c..3b3ac007cd1c333a1254856b6edaea44baca29d9 100644 --- a/src/os_cpu/linux_x86/vm/linux_x86_64.ad +++ b/src/os_cpu/linux_x86/vm/linux_x86_64.ad @@ -55,20 +55,6 @@ encode %{ // adding a syntax that specifies the sizes of fields in an order, // so that the adlc can build the emit functions automagically - enc_class Java_To_Runtime(method meth) %{ - // No relocation needed - - // movq r10, - emit_opcode(cbuf, Assembler::REX_WB); - emit_opcode(cbuf, 0xB8 | (R10_enc - 8)); - emit_d64(cbuf, (int64_t) $meth$$method); - - // call (r10) - emit_opcode(cbuf, Assembler::REX_B); - emit_opcode(cbuf, 0xFF); - emit_opcode(cbuf, 0xD0 | (R10_enc - 8)); - %} - %} @@ -76,8 +62,4 @@ encode %{ source %{ -int MachCallRuntimeNode::ret_addr_offset() { - return 13; // movq r10,#addr; callq (r10) -} - %} diff --git a/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad b/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad index fdce355abe9dac2fbc6be357a59db109e91190dd..f3334952f62a6f99347e8cbfee39a688f787c304 100644 --- a/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad +++ b/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad @@ -54,39 +54,10 @@ encode %{ // main source block for now. In future, we can generalize this by // adding a syntax that specifies the sizes of fields in an order, // so that the adlc can build the emit functions automagically - - enc_class Java_To_Runtime(method meth) %{ - // No relocation needed - - // movq r10, - emit_opcode(cbuf, Assembler::REX_WB); - emit_opcode(cbuf, 0xB8 | (R10_enc - 8)); - emit_d64(cbuf, (int64_t) $meth$$method); - - // call (r10) - emit_opcode(cbuf, Assembler::REX_B); - emit_opcode(cbuf, 0xFF); - emit_opcode(cbuf, 0xD0 | (R10_enc - 8)); - %} - - enc_class post_call_verify_mxcsr %{ - MacroAssembler _masm(&cbuf); - if (RestoreMXCSROnJNICalls) { - __ ldmxcsr(ExternalAddress(StubRoutines::amd64::mxcsr_std())); - } - else if (CheckJNICalls) { - __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::amd64::verify_mxcsr_entry()))); - } - %} %} // Platform dependent source source %{ - -int MachCallRuntimeNode::ret_addr_offset() { - return 13; // movq r10,#addr; callq (r10) -} - %} diff --git a/src/os_cpu/windows_x86/vm/windows_x86_64.ad b/src/os_cpu/windows_x86/vm/windows_x86_64.ad index e251b2b0c377272fd8321202c9e5fc9c8b54a25b..54e183a0bc560bdd7a777a20255324fb352834d0 100644 --- a/src/os_cpu/windows_x86/vm/windows_x86_64.ad +++ b/src/os_cpu/windows_x86/vm/windows_x86_64.ad @@ -53,30 +53,11 @@ encode %{ // adding a syntax that specifies the sizes of fields in an order, // so that the adlc can build the emit functions automagically - enc_class Java_To_Runtime (method meth) %{ // CALL Java_To_Runtime - // No relocation needed - - // movq r10, - emit_opcode(cbuf, Assembler::REX_WB); - emit_opcode(cbuf, 0xB8 | (R10_enc - 8)); - emit_d64(cbuf, (int64_t) $meth$$method); - - // call (r10) - emit_opcode(cbuf, Assembler::REX_B); - emit_opcode(cbuf, 0xFF); - emit_opcode(cbuf, 0xD0 | (R10_enc - 8)); - %} - %} -// + // Platform dependent source -// -source %{ -int MachCallRuntimeNode::ret_addr_offset() -{ - return 13; // movq r10,#addr; callq (r10) -} +source %{ %}