Commit 6fb85be0 authored by K kvn

8011102: Clear AVX registers after return from JNI call

Summary: Execute the vzeroupper instruction after JNI calls and on exits from JIT-compiled code that uses 256-bit vectors.
Reviewed-by: roland
Parent 67e687a4
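
Background for the diff below: as the summary and the patch's own comments note, running legacy-encoded SSE instructions while the upper 128 bits of the YMM registers are dirty incurs an AVX <-> SSE state-transition penalty; executing vzeroupper first clears those upper bits and avoids it. The following minimal user-level C++ sketch (not part of the patch) illustrates the same idea with the standard Intel intrinsics; the callee name is hypothetical, standing in for any legacy-SSE-encoded code such as a JNI native method.

#include <immintrin.h>

extern "C" void legacy_sse_callee();  // hypothetical: any legacy-SSE-encoded code, e.g. a JNI native

void add_with_avx(float* dst, const float* a, const float* b, int n) {
    int i = 0;
    // 256-bit AVX work dirties the upper 128 bits of the YMM registers.
    for (; i + 8 <= n; i += 8) {
        __m256 va = _mm256_loadu_ps(a + i);
        __m256 vb = _mm256_loadu_ps(b + i);
        _mm256_storeu_ps(dst + i, _mm256_add_ps(va, vb));
    }
    for (; i < n; ++i) dst[i] = a[i] + b[i];  // scalar tail
    // The analogue of what this patch emits at JNI-call sites and on
    // compiled-code exits: clear the upper YMM bits so the callee's SSE
    // instructions do not pay the AVX <-> SSE transition penalty.
    _mm256_zeroupper();  // emits the vzeroupper instruction
    legacy_sse_callee();
}
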
@@ -1299,25 +1299,8 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
__ push(rdx);
#endif // _LP64
// Either restore the MXCSR register after returning from the JNI Call
// or verify that it wasn't changed.
if (VM_Version::supports_sse()) {
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
}
else if (CheckJNICalls ) {
__ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
}
}
#ifndef _LP64
// Either restore the x87 floating pointer control word after returning
// from the JNI call or verify that it wasn't changed.
if (CheckJNICalls) {
__ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
}
#endif // _LP64
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// change thread state
__ movl(Address(thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
@@ -4765,6 +4765,31 @@ void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
pop_CPU_state();
}
void MacroAssembler::restore_cpu_control_state_after_jni() {
// Either restore the MXCSR register after returning from the JNI Call
// or verify that it wasn't changed (with -Xcheck:jni flag).
if (VM_Version::supports_sse()) {
if (RestoreMXCSROnJNICalls) {
ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
} else if (CheckJNICalls) {
call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
}
}
if (VM_Version::supports_avx()) {
// Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
vzeroupper();
}
#ifndef _LP64
// Either restore the x87 floating pointer control word after returning
// from the JNI call or verify that it wasn't changed.
if (CheckJNICalls) {
call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
}
#endif // _LP64
}
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
if (UseCompressedKlassPointers) {
@@ -5759,6 +5784,8 @@ void MacroAssembler::string_compare(Register str1, Register str2,
addptr(result, stride2);
subl(cnt2, stride2);
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
// clean upper bits of YMM registers
vzeroupper();
// compare wide vectors tail
bind(COMPARE_WIDE_TAIL);
@@ -5772,6 +5799,8 @@ void MacroAssembler::string_compare(Register str1, Register str2,
// Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
bind(VECTOR_NOT_EQUAL);
// clean upper bits of YMM registers
vzeroupper();
lea(str1, Address(str1, result, scale));
lea(str2, Address(str2, result, scale));
jmp(COMPARE_16_CHARS);
@@ -6028,6 +6057,10 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist
// That's it
bind(DONE);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
vzeroupper();
}
}
void MacroAssembler::generate_fill(BasicType t, bool aligned,
@@ -6157,6 +6190,10 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
vmovdqu(Address(to, 0), xtmp);
addptr(to, 32);
subl(count, 8 << shift);
BIND(L_check_fill_8_bytes);
// clean upper bits of YMM registers
vzeroupper();
} else {
// Fill 32-byte chunks
pshufd(xtmp, xtmp, 0);
@@ -6180,8 +6217,9 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
addptr(to, 32);
subl(count, 8 << shift);
jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
BIND(L_check_fill_8_bytes);
}
BIND(L_check_fill_8_bytes);
addl(count, 8 << shift);
jccb(Assembler::zero, L_exit);
jmpb(L_fill_8_bytes);
@@ -6316,6 +6354,10 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
jccb(Assembler::lessEqual, L_copy_16_chars);
bind(L_copy_16_chars_exit);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
vzeroupper();
}
subptr(len, 8);
jccb(Assembler::greater, L_copy_8_chars_exit);
@@ -582,6 +582,9 @@ class MacroAssembler: public Assembler {
// only if +VerifyFPU
void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
// Verify or restore cpu control state after JNI call
void restore_cpu_control_state_after_jni();
// prints msg, dumps registers and stops execution
void stop(const char* msg);
@@ -2065,6 +2065,9 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
__ call(RuntimeAddress(native_func));
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// WARNING - on Windows Java Natives use pascal calling convention and pop the
// arguments off of the stack. We could just re-adjust the stack pointer here
// and continue to do SP relative addressing but we instead switch to FP
@@ -2315,16 +2315,8 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
__ call(RuntimeAddress(native_func));
// Either restore the MXCSR register after returning from the JNI Call
// or verify that it wasn't changed.
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
}
else if (CheckJNICalls ) {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry())));
}
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// Unpack native results.
switch (ret_type) {
@@ -835,6 +835,11 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_64_bytes);
__ subl(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
if (UseUnalignedLoadStores && (UseAVX >= 2)) {
// clean upper bits of YMM registers
__ vzeroupper();
}
__ addl(qword_count, 8);
__ jccb(Assembler::zero, L_exit);
//
@@ -1331,6 +1331,10 @@ class StubGenerator: public StubCodeGenerator {
}
__ addptr(qword_count, 4);
__ BIND(L_end);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
__ vzeroupper();
}
} else {
// Copy 32-bytes per iteration
__ BIND(L_loop);
@@ -1404,6 +1408,10 @@ class StubGenerator: public StubCodeGenerator {
}
__ subptr(qword_count, 4);
__ BIND(L_end);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
__ vzeroupper();
}
} else {
// Copy 32-bytes per iteration
__ BIND(L_loop);
@@ -1080,22 +1080,8 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
// result potentially in rdx:rax or ST0
// Either restore the MXCSR register after returning from the JNI Call
// or verify that it wasn't changed.
if (VM_Version::supports_sse()) {
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
}
else if (CheckJNICalls ) {
__ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
}
}
// Either restore the x87 floating pointer control word after returning
// from the JNI call or verify that it wasn't changed.
if (CheckJNICalls) {
__ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
}
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// save potential result in ST(0) & rdx:rax
// (if result handler is the T_FLOAT or T_DOUBLE handler, result must be in ST0 -
@@ -1079,15 +1079,8 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
__ call(rax);
// result potentially in rax or xmm0
// Depending on runtime options, either restore the MXCSR
// register after returning from the JNI Call or verify that
// it wasn't changed during -Xcheck:jni.
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
}
else if (CheckJNICalls) {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry())));
}
// Verify or restore cpu control state after JNI call
__ restore_cpu_control_state_after_jni();
// NOTE: The order of these pushes is known to frame::interpreter_frame_result
// in order to extract the result of a method call. If the order of these
@@ -228,10 +228,16 @@ static jlong *float_signflip_pool = double_quadword(&fp_signmask_pool[3*2], CON
static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000));
// Offset hacking within calls.
static int pre_call_FPU_size() {
if (Compile::current()->in_24_bit_fp_mode())
return 6; // fldcw
return 0;
static int pre_call_resets_size() {
int size = 0;
Compile* C = Compile::current();
if (C->in_24_bit_fp_mode()) {
size += 6; // fldcw
}
if (C->max_vector_size() > 16) {
size += 3; // vzeroupper
}
return size;
}
static int preserve_SP_size() {
@@ -242,21 +248,21 @@ static int preserve_SP_size() {
// from the start of the call to the point where the return address
// will point.
int MachCallStaticJavaNode::ret_addr_offset() {
int offset = 5 + pre_call_FPU_size(); // 5 bytes from start of call to where return address points
int offset = 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points
if (_method_handle_invoke)
offset += preserve_SP_size();
return offset;
}
int MachCallDynamicJavaNode::ret_addr_offset() {
return 10 + pre_call_FPU_size(); // 10 bytes from start of call to where return address points
return 10 + pre_call_resets_size(); // 10 bytes from start of call to where return address points
}
static int sizeof_FFree_Float_Stack_All = -1;
int MachCallRuntimeNode::ret_addr_offset() {
assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already");
return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size();
return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size();
}
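
Worked example of the bookkeeping above: with both resets active (24-bit FP mode and wide vectors), pre_call_resets_size() returns 6 + 3 = 9 bytes, so MachCallStaticJavaNode::ret_addr_offset() reports 5 + 9 = 14 (plus preserve_SP_size() for a method-handle invoke) and MachCallDynamicJavaNode::ret_addr_offset() reports 10 + 9 = 19.
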
// Indicate if the safepoint node needs the polling page as an input.
@@ -272,7 +278,7 @@ bool SafePointNode::needs_polling_address_input() {
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
current_offset += pre_call_FPU_size(); // skip fldcw, if any
current_offset += pre_call_resets_size(); // skip fldcw, if any
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -280,7 +286,7 @@ int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
current_offset += pre_call_FPU_size(); // skip fldcw, if any
current_offset += pre_call_resets_size(); // skip fldcw, if any
current_offset += preserve_SP_size(); // skip mov rbp, rsp
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
@@ -289,7 +295,7 @@ int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
current_offset += pre_call_FPU_size(); // skip fldcw, if any
current_offset += pre_call_resets_size(); // skip fldcw, if any
current_offset += 5; // skip MOV instruction
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
@@ -583,16 +589,20 @@ void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
// Remove two words for return addr and rbp,
framesize -= 2*wordSize;
if( C->in_24_bit_fp_mode() ) {
if (C->max_vector_size() > 16) {
st->print("VZEROUPPER");
st->cr(); st->print("\t");
}
if (C->in_24_bit_fp_mode()) {
st->print("FLDCW standard control word");
st->cr(); st->print("\t");
}
if( framesize ) {
if (framesize) {
st->print("ADD ESP,%d\t# Destroy frame",framesize);
st->cr(); st->print("\t");
}
st->print_cr("POPL EBP"); st->print("\t");
if( do_polling() && C->is_method_compilation() ) {
if (do_polling() && C->is_method_compilation()) {
st->print("TEST PollPage,EAX\t! Poll Safepoint");
st->cr(); st->print("\t");
}
@@ -602,8 +612,14 @@ void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
Compile *C = ra_->C;
if (C->max_vector_size() > 16) {
// Clear upper bits of YMM registers when current compiled code uses
// wide vectors to avoid AVX <-> SSE transition penalty during call.
MacroAssembler masm(&cbuf);
masm.vzeroupper();
}
// If method set FPU control word, restore to standard control word
if( C->in_24_bit_fp_mode() ) {
if (C->in_24_bit_fp_mode()) {
MacroAssembler masm(&cbuf);
masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
}
@@ -615,12 +631,11 @@ void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
// Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
if( framesize >= 128 ) {
if (framesize >= 128) {
emit_opcode(cbuf, 0x81); // add SP, #framesize
emit_rm(cbuf, 0x3, 0x00, ESP_enc);
emit_d32(cbuf, framesize);
}
else if( framesize ) {
} else if (framesize) {
emit_opcode(cbuf, 0x83); // add SP, #framesize
emit_rm(cbuf, 0x3, 0x00, ESP_enc);
emit_d8(cbuf, framesize);
@@ -628,7 +643,7 @@ void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
emit_opcode(cbuf, 0x58 | EBP_enc);
if( do_polling() && C->is_method_compilation() ) {
if (do_polling() && C->is_method_compilation()) {
cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0);
emit_opcode(cbuf,0x85);
emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
@@ -640,7 +655,8 @@ uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
Compile *C = ra_->C;
// If method set FPU control word, restore to standard control word
int size = C->in_24_bit_fp_mode() ? 6 : 0;
if( do_polling() && C->is_method_compilation() ) size += 6;
if (C->max_vector_size() > 16) size += 3; // vzeroupper
if (do_polling() && C->is_method_compilation()) size += 6;
int framesize = C->frame_slots() << LogBytesPerInt;
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
@@ -649,7 +665,7 @@ uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
size++; // popl rbp,
if( framesize >= 128 ) {
if (framesize >= 128) {
size += 6;
} else {
size += framesize ? 3 : 0;
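
Worked example of the epilog size computation above: a method that uses wide vectors, polls on return, and pops a 128-byte frame needs 3 (vzeroupper) + 6 (poll test) + 1 (popl rbp) + 6 (add esp, imm32) = 16 bytes, plus 6 more if it also ran in 24-bit FP mode (the fldcw restore).
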
@@ -1853,20 +1869,26 @@ encode %{
%}
enc_class pre_call_FPU %{
enc_class pre_call_resets %{
// If method sets FPU control word restore it here
debug_only(int off0 = cbuf.insts_size());
if( Compile::current()->in_24_bit_fp_mode() ) {
MacroAssembler masm(&cbuf);
masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
if (ra_->C->in_24_bit_fp_mode()) {
MacroAssembler _masm(&cbuf);
__ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
}
if (ra_->C->max_vector_size() > 16) {
// Clear upper bits of YMM registers when current compiled code uses
// wide vectors to avoid AVX <-> SSE transition penalty during call.
MacroAssembler _masm(&cbuf);
__ vzeroupper();
}
debug_only(int off1 = cbuf.insts_size());
assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction");
assert(off1 - off0 == pre_call_resets_size(), "correct size prediction");
%}
enc_class post_call_FPU %{
// If method sets FPU control word do it here also
if( Compile::current()->in_24_bit_fp_mode() ) {
if (Compile::current()->in_24_bit_fp_mode()) {
MacroAssembler masm(&cbuf);
masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
}
@@ -1877,17 +1899,17 @@ encode %{
// who we intended to call.
cbuf.set_insts_mark();
$$$emit8$primary;
if ( !_method ) {
if (!_method) {
emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
runtime_call_Relocation::spec(), RELOC_IMM32 );
} else if(_optimized_virtual) {
} else if (_optimized_virtual) {
emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
opt_virtual_call_Relocation::spec(), RELOC_IMM32 );
} else {
emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
static_call_Relocation::spec(), RELOC_IMM32 );
}
if( _method ) { // Emit stub for static call
if (_method) { // Emit stub for static call
emit_java_to_interp(cbuf);
}
%}
@@ -12828,7 +12850,7 @@ instruct CallStaticJavaDirect(method meth) %{
ins_cost(300);
format %{ "CALL,static " %}
opcode(0xE8); /* E8 cd */
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
Java_Static_Call( meth ),
call_epilog,
post_call_FPU );
@@ -12849,7 +12871,7 @@ instruct CallStaticJavaHandle(method meth, eBPRegP ebp_mh_SP_save) %{
ins_cost(300);
format %{ "CALL,static/MethodHandle " %}
opcode(0xE8); /* E8 cd */
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
preserve_SP,
Java_Static_Call( meth ),
restore_SP,
@@ -12870,7 +12892,7 @@ instruct CallDynamicJavaDirect(method meth) %{
format %{ "MOV EAX,(oop)-1\n\t"
"CALL,dynamic" %}
opcode(0xE8); /* E8 cd */
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
Java_Dynamic_Call( meth ),
call_epilog,
post_call_FPU );
@@ -12887,7 +12909,7 @@ instruct CallRuntimeDirect(method meth) %{
format %{ "CALL,runtime " %}
opcode(0xE8); /* E8 cd */
// Use FFREEs to clear entries in float stack
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
FFree_Float_Stack_All,
Java_To_Runtime( meth ),
post_call_FPU );
@@ -12902,7 +12924,7 @@ instruct CallLeafDirect(method meth) %{
ins_cost(300);
format %{ "CALL_LEAF,runtime " %}
opcode(0xE8); /* E8 cd */
ins_encode( pre_call_FPU,
ins_encode( pre_call_resets,
FFree_Float_Stack_All,
Java_To_Runtime( meth ),
Verify_FPU_For_Leaf, post_call_FPU );
@@ -399,6 +399,9 @@ source %{
static int preserve_SP_size() {
return 3; // rex.w, op, rm(reg/reg)
}
static int clear_avx_size() {
return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
}
// !!!!! Special hack to get all types of calls to specify the byte offset
// from the start of the call to the point where the return address
@@ -406,6 +409,7 @@ static int preserve_SP_size() {
int MachCallStaticJavaNode::ret_addr_offset()
{
int offset = 5; // 5 bytes from start of call to where return address points
offset += clear_avx_size();
if (_method_handle_invoke)
offset += preserve_SP_size();
return offset;
@@ -413,11 +417,16 @@ int MachCallStaticJavaNode::ret_addr_offset()
int MachCallDynamicJavaNode::ret_addr_offset()
{
return 15; // 15 bytes from start of call to where return address points
int offset = 15; // 15 bytes from start of call to where return address points
offset += clear_avx_size();
return offset;
}
// In os_cpu .ad file
// int MachCallRuntimeNode::ret_addr_offset()
int MachCallRuntimeNode::ret_addr_offset() {
int offset = 13; // movq r10,#addr; callq (r10)
offset += clear_avx_size();
return offset;
}
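
Worked example: the 13 bytes break down as 10 for movq r10, #addr (REX.WB prefix, opcode 0xB8|r10, 8-byte immediate) and 3 for call (r10) (REX.B, 0xFF, ModRM), matching the byte-by-byte emitter this change deletes from the os_cpu .ad files below; with wide vectors, clear_avx_size() adds 3 for vzeroupper, giving 16.
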
// Indicate if the safepoint node needs the polling page as an input,
// it does if the polling page is more than disp32 away.
@@ -434,6 +443,7 @@ bool SafePointNode::needs_polling_address_input()
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaDirectNode::compute_padding(int current_offset) const
{
current_offset += clear_avx_size(); // skip vzeroupper
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -443,6 +453,7 @@ int CallStaticJavaDirectNode::compute_padding(int current_offset) const
int CallStaticJavaHandleNode::compute_padding(int current_offset) const
{
current_offset += preserve_SP_size(); // skip mov rbp, rsp
current_offset += clear_avx_size(); // skip vzeroupper
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -451,6 +462,7 @@ int CallStaticJavaHandleNode::compute_padding(int current_offset) const
// ensure that it does not span a cache line so that it can be patched.
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
{
current_offset += clear_avx_size(); // skip vzeroupper
current_offset += 11; // skip movq instruction + call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -764,6 +776,11 @@ int MachPrologNode::reloc() const
void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
Compile* C = ra_->C;
if (C->max_vector_size() > 16) {
st->print("vzeroupper");
st->cr(); st->print("\t");
}
int framesize = C->frame_slots() << LogBytesPerInt;
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
// Remove word for return adr already pushed
@@ -793,6 +810,13 @@ void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
Compile* C = ra_->C;
if (C->max_vector_size() > 16) {
// Clear upper bits of YMM registers when current compiled code uses
// wide vectors to avoid AVX <-> SSE transition penalty during call.
MacroAssembler _masm(&cbuf);
__ vzeroupper();
}
int framesize = C->frame_slots() << LogBytesPerInt;
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
// Remove word for return adr already pushed
@@ -2008,6 +2032,25 @@ encode %{
__ bind(miss);
%}
enc_class clear_avx %{
debug_only(int off0 = cbuf.insts_size());
if (ra_->C->max_vector_size() > 16) {
// Clear upper bits of YMM registers when current compiled code uses
// wide vectors to avoid AVX <-> SSE transition penalty during call.
MacroAssembler _masm(&cbuf);
__ vzeroupper();
}
debug_only(int off1 = cbuf.insts_size());
assert(off1 - off0 == clear_avx_size(), "correct size prediction");
%}
enc_class Java_To_Runtime(method meth) %{
// No relocation needed
MacroAssembler _masm(&cbuf);
__ mov64(r10, (int64_t) $meth$$method);
__ call(r10);
%}
enc_class Java_To_Interpreter(method meth)
%{
// CALL Java_To_Interpreter
@@ -11366,7 +11409,7 @@ instruct CallStaticJavaDirect(method meth) %{
ins_cost(300);
format %{ "call,static " %}
opcode(0xE8); /* E8 cd */
ins_encode(Java_Static_Call(meth), call_epilog);
ins_encode(clear_avx, Java_Static_Call(meth), call_epilog);
ins_pipe(pipe_slow);
ins_alignment(4);
%}
@@ -11384,7 +11427,7 @@ instruct CallStaticJavaHandle(method meth, rbp_RegP rbp_mh_SP_save) %{
ins_cost(300);
format %{ "call,static/MethodHandle " %}
opcode(0xE8); /* E8 cd */
ins_encode(preserve_SP,
ins_encode(clear_avx, preserve_SP,
Java_Static_Call(meth),
restore_SP,
call_epilog);
@@ -11403,7 +11446,7 @@ instruct CallDynamicJavaDirect(method meth)
ins_cost(300);
format %{ "movq rax, #Universe::non_oop_word()\n\t"
"call,dynamic " %}
ins_encode(Java_Dynamic_Call(meth), call_epilog);
ins_encode(clear_avx, Java_Dynamic_Call(meth), call_epilog);
ins_pipe(pipe_slow);
ins_alignment(4);
%}
@@ -11416,8 +11459,7 @@ instruct CallRuntimeDirect(method meth)
ins_cost(300);
format %{ "call,runtime " %}
opcode(0xE8); /* E8 cd */
ins_encode(Java_To_Runtime(meth));
ins_encode(clear_avx, Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
@@ -11429,8 +11471,7 @@ instruct CallLeafDirect(method meth)
ins_cost(300);
format %{ "call_leaf,runtime " %}
opcode(0xE8); /* E8 cd */
ins_encode(Java_To_Runtime(meth));
ins_encode(clear_avx, Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
@@ -11442,7 +11483,6 @@ instruct CallLeafNoFPDirect(method meth)
ins_cost(300);
format %{ "call_leaf_nofp,runtime " %}
opcode(0xE8); /* E8 cd */
ins_encode(Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
@@ -55,20 +55,6 @@ encode %{
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
enc_class Java_To_Runtime(method meth) %{
// No relocation needed
// movq r10, <meth>
emit_opcode(cbuf, Assembler::REX_WB);
emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
emit_d64(cbuf, (int64_t) $meth$$method);
// call (r10)
emit_opcode(cbuf, Assembler::REX_B);
emit_opcode(cbuf, 0xFF);
emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
%}
%}
@@ -76,8 +62,4 @@ encode %{
source %{
int MachCallRuntimeNode::ret_addr_offset() {
return 13; // movq r10,#addr; callq (r10)
}
%}
@@ -55,20 +55,6 @@ encode %{
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
enc_class Java_To_Runtime(method meth) %{
// No relocation needed
// movq r10, <meth>
emit_opcode(cbuf, Assembler::REX_WB);
emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
emit_d64(cbuf, (int64_t) $meth$$method);
// call (r10)
emit_opcode(cbuf, Assembler::REX_B);
emit_opcode(cbuf, 0xFF);
emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
%}
%}
@@ -76,8 +62,4 @@ encode %{
source %{
int MachCallRuntimeNode::ret_addr_offset() {
return 13; // movq r10,#addr; callq (r10)
}
%}
@@ -54,39 +54,10 @@ encode %{
// main source block for now. In future, we can generalize this by
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
enc_class Java_To_Runtime(method meth) %{
// No relocation needed
// movq r10, <meth>
emit_opcode(cbuf, Assembler::REX_WB);
emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
emit_d64(cbuf, (int64_t) $meth$$method);
// call (r10)
emit_opcode(cbuf, Assembler::REX_B);
emit_opcode(cbuf, 0xFF);
emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
%}
enc_class post_call_verify_mxcsr %{
MacroAssembler _masm(&cbuf);
if (RestoreMXCSROnJNICalls) {
__ ldmxcsr(ExternalAddress(StubRoutines::amd64::mxcsr_std()));
}
else if (CheckJNICalls) {
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::amd64::verify_mxcsr_entry())));
}
%}
%}
// Platform dependent source
source %{
int MachCallRuntimeNode::ret_addr_offset() {
return 13; // movq r10,#addr; callq (r10)
}
%}
@@ -53,30 +53,11 @@ encode %{
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
enc_class Java_To_Runtime (method meth) %{ // CALL Java_To_Runtime
// No relocation needed
// movq r10, <meth>
emit_opcode(cbuf, Assembler::REX_WB);
emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
emit_d64(cbuf, (int64_t) $meth$$method);
// call (r10)
emit_opcode(cbuf, Assembler::REX_B);
emit_opcode(cbuf, 0xFF);
emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
%}
%}
//
// Platform dependent source
//
source %{
int MachCallRuntimeNode::ret_addr_offset()
{
return 13; // movq r10,#addr; callq (r10)
}
source %{
%}