diff --git a/src/cpu/sparc/vm/assembler_sparc.hpp b/src/cpu/sparc/vm/assembler_sparc.hpp index 33f8edfea8be9e0ce00f092d6f4a05e66f664224..4462ee5d00f212679995d62924f86e398b78d81f 100644 --- a/src/cpu/sparc/vm/assembler_sparc.hpp +++ b/src/cpu/sparc/vm/assembler_sparc.hpp @@ -886,7 +886,11 @@ class Assembler : public AbstractAssembler { enum ASIs { // page 72, v9 ASI_PRIMARY = 0x80, - ASI_PRIMARY_LITTLE = 0x88 + ASI_PRIMARY_LITTLE = 0x88, + // Block initializing store + ASI_ST_BLKINIT_PRIMARY = 0xE2, + // Most-Recently-Used (MRU) BIS variant + ASI_ST_BLKINIT_MRU_PRIMARY = 0xF2 // add more from book as needed }; diff --git a/src/cpu/sparc/vm/sparc.ad b/src/cpu/sparc/vm/sparc.ad index ac20818530282cd1a13529616e317ec0fffede0a..9eac222b9f6c668ca57ad7e8df2e6df1d14c371d 100644 --- a/src/cpu/sparc/vm/sparc.ad +++ b/src/cpu/sparc/vm/sparc.ad @@ -471,9 +471,6 @@ extern bool can_branch_register( Node *bol, Node *cmp ); source %{ #define __ _masm. -// Block initializing store -#define ASI_BLK_INIT_QUAD_LDD_P 0xE2 - // tertiary op of a LoadP or StoreP encoding #define REGP_OP true @@ -2819,10 +2816,10 @@ enc_class Fast_Unlock(iRegP oop, iRegP box, o7RegP scratch, iRegP scratch2) %{ Register nof_bytes_arg = reg_to_register_object($cnt$$reg); Register nof_bytes_tmp = reg_to_register_object($temp$$reg); Register base_pointer_arg = reg_to_register_object($base$$reg); - + Label loop; __ mov(nof_bytes_arg, nof_bytes_tmp); - + // Loop and clear, walking backwards through the array. // nof_bytes_tmp (if >0) is always the number of bytes to zero __ bind(loop); @@ -6269,6 +6266,7 @@ instruct loadConD(regD dst, immD con, o7RegI tmp) %{ instruct prefetchr( memory mem ) %{ match( PrefetchRead mem ); ins_cost(MEMORY_REF_COST); + size(4); format %{ "PREFETCH $mem,0\t! 
Prefetch read-many" %} opcode(Assembler::prefetch_op3); @@ -6277,9 +6275,9 @@ instruct prefetchr( memory mem ) %{ %} instruct prefetchw( memory mem ) %{ - predicate(AllocatePrefetchStyle != 3 ); match( PrefetchWrite mem ); ins_cost(MEMORY_REF_COST); + size(4); format %{ "PREFETCH $mem,2\t! Prefetch write-many (and read)" %} opcode(Assembler::prefetch_op3); @@ -6287,24 +6285,62 @@ instruct prefetchw( memory mem ) %{ ins_pipe(iload_mem); %} -// Use BIS instruction to prefetch. -instruct prefetchw_bis( memory mem ) %{ - predicate(AllocatePrefetchStyle == 3); - match( PrefetchWrite mem ); +// Prefetch instructions for allocation. + +instruct prefetchAlloc( memory mem ) %{ + predicate(AllocatePrefetchInstr == 0); + match( PrefetchAllocation mem ); + ins_cost(MEMORY_REF_COST); + size(4); + + format %{ "PREFETCH $mem,2\t! Prefetch allocation" %} + opcode(Assembler::prefetch_op3); + ins_encode( form3_mem_prefetch_write( mem ) ); + ins_pipe(iload_mem); +%} + +// Use BIS instruction to prefetch for allocation. +// Could fault, need space at the end of TLAB. +instruct prefetchAlloc_bis( iRegP dst ) %{ + predicate(AllocatePrefetchInstr == 1); + match( PrefetchAllocation dst ); ins_cost(MEMORY_REF_COST); + size(4); - format %{ "STXA G0,$mem\t! // Block initializing store" %} + format %{ "STXA [$dst]\t! // Prefetch allocation using BIS" %} ins_encode %{ - Register base = as_Register($mem$$base); - int disp = $mem$$disp; - if (disp != 0) { - __ add(base, AllocatePrefetchStepSize, base); - } - __ stxa(G0, base, G0, ASI_BLK_INIT_QUAD_LDD_P); + __ stxa(G0, $dst$$Register, G0, Assembler::ASI_ST_BLKINIT_PRIMARY); %} ins_pipe(istore_mem_reg); %} +// Next code is used for finding next cache line address to prefetch. +#ifndef _LP64 +instruct cacheLineAdr( iRegP dst, iRegP src, immI13 mask ) %{ + match(Set dst (CastX2P (AndI (CastP2X src) mask))); + ins_cost(DEFAULT_COST); + size(4); + + format %{ "AND $src,$mask,$dst\t! 
next cache line address" %} + ins_encode %{ + __ and3($src$$Register, $mask$$constant, $dst$$Register); + %} + ins_pipe(ialu_reg_imm); +%} +#else +instruct cacheLineAdr( iRegP dst, iRegP src, immL13 mask ) %{ + match(Set dst (CastX2P (AndL (CastP2X src) mask))); + ins_cost(DEFAULT_COST); + size(4); + + format %{ "AND $src,$mask,$dst\t! next cache line address" %} + ins_encode %{ + __ and3($src$$Register, $mask$$constant, $dst$$Register); + %} + ins_pipe(ialu_reg_imm); +%} +#endif + //----------Store Instructions------------------------------------------------- // Store Byte instruct storeB(memory mem, iRegI src) %{ diff --git a/src/cpu/sparc/vm/vm_version_sparc.cpp b/src/cpu/sparc/vm/vm_version_sparc.cpp index 2637b1b9253ad447d468cfecda54402432a52aad..6f9ca4ea9c2542cadaf6ee5cca90256ab8688c17 100644 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp @@ -44,20 +44,31 @@ void VM_Version::initialize() { PrefetchScanIntervalInBytes = prefetch_scan_interval_in_bytes(); PrefetchFieldsAhead = prefetch_fields_ahead(); + assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 1, "invalid value"); + if( AllocatePrefetchInstr < 0 ) AllocatePrefetchInstr = 0; + if( AllocatePrefetchInstr > 1 ) AllocatePrefetchInstr = 0; + // Allocation prefetch settings - intx cache_line_size = L1_data_cache_line_size(); + intx cache_line_size = prefetch_data_size(); if( cache_line_size > AllocatePrefetchStepSize ) AllocatePrefetchStepSize = cache_line_size; - if( FLAG_IS_DEFAULT(AllocatePrefetchLines) ) - AllocatePrefetchLines = 3; // Optimistic value - assert( AllocatePrefetchLines > 0, "invalid value"); - if( AllocatePrefetchLines < 1 ) // set valid value in product VM - AllocatePrefetchLines = 1; // Conservative value + + assert(AllocatePrefetchLines > 0, "invalid value"); + if( AllocatePrefetchLines < 1 ) // set valid value in product VM + AllocatePrefetchLines = 3; + assert(AllocateInstancePrefetchLines > 0, "invalid value"); + if( 
AllocateInstancePrefetchLines < 1 ) // set valid value in product VM + AllocateInstancePrefetchLines = 1; AllocatePrefetchDistance = allocate_prefetch_distance(); AllocatePrefetchStyle = allocate_prefetch_style(); - assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value"); + assert((AllocatePrefetchDistance % AllocatePrefetchStepSize) == 0 && + (AllocatePrefetchDistance > 0), "invalid value"); + if ((AllocatePrefetchDistance % AllocatePrefetchStepSize) != 0 || + (AllocatePrefetchDistance <= 0)) { + AllocatePrefetchDistance = AllocatePrefetchStepSize; + } if (AllocatePrefetchStyle == 3 && !has_blk_init()) { warning("BIS instructions are not available on this CPU"); @@ -66,7 +77,7 @@ void VM_Version::initialize() { UseSSE = 0; // Only on x86 and x64 - _supports_cx8 = has_v9(); + _supports_cx8 = has_v9(); if (is_niagara()) { // Indirect branch is the same cost as direct @@ -99,19 +110,42 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(InteriorEntryAlignment, 4); } if (is_niagara_plus()) { - if (has_blk_init() && AllocatePrefetchStyle > 0 && - FLAG_IS_DEFAULT(AllocatePrefetchStyle)) { - // Use BIS instruction for allocation prefetch. - FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3); + if (has_blk_init() && UseTLAB && + FLAG_IS_DEFAULT(AllocatePrefetchInstr)) { + // Use BIS instruction for TLAB allocation prefetch. + FLAG_SET_ERGO(intx, AllocatePrefetchInstr, 1); + if (FLAG_IS_DEFAULT(AllocatePrefetchStyle)) { + FLAG_SET_ERGO(intx, AllocatePrefetchStyle, 3); + } if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { - // Use smaller prefetch distance on N2 with BIS + // Use smaller prefetch distance with BIS FLAG_SET_DEFAULT(AllocatePrefetchDistance, 64); } } + if (is_T4()) { + // Double number of prefetched cache lines on T4 + // since L2 cache line size is smaller (32 bytes). 
+ if (FLAG_IS_DEFAULT(AllocatePrefetchLines)) { + FLAG_SET_ERGO(intx, AllocatePrefetchLines, AllocatePrefetchLines*2); + } + if (FLAG_IS_DEFAULT(AllocateInstancePrefetchLines)) { + FLAG_SET_ERGO(intx, AllocateInstancePrefetchLines, AllocateInstancePrefetchLines*2); + } + } if (AllocatePrefetchStyle != 3 && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { // Use different prefetch distance without BIS FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256); } + if (AllocatePrefetchInstr == 1) { + // Need a space at the end of TLAB for BIS since it + // will fault when accessing memory outside of heap. + + // +1 for rounding up to next cache line, +1 to be safe + int lines = AllocatePrefetchLines + 2; + int step_size = AllocatePrefetchStepSize; + int distance = AllocatePrefetchDistance; + _reserve_for_allocation_prefetch = (distance + step_size*lines)/(int)HeapWordSize; + } } #endif } @@ -185,14 +219,20 @@ void VM_Version::initialize() { #ifndef PRODUCT if (PrintMiscellaneous && Verbose) { - tty->print("Allocation: "); + tty->print("Allocation"); if (AllocatePrefetchStyle <= 0) { - tty->print_cr("no prefetching"); + tty->print_cr(": no prefetching"); } else { + tty->print(" prefetching: "); + if (AllocatePrefetchInstr == 0) { + tty->print("PREFETCH"); + } else if (AllocatePrefetchInstr == 1) { + tty->print("BIS"); + } if (AllocatePrefetchLines > 1) { - tty->print_cr("PREFETCH %d, %d lines of size %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize); + tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize); } else { - tty->print_cr("PREFETCH %d, one line", AllocatePrefetchDistance); + tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize); } } if (PrefetchCopyIntervalInBytes > 0) { diff --git a/src/cpu/sparc/vm/vm_version_sparc.hpp b/src/cpu/sparc/vm/vm_version_sparc.hpp index 
fa475bde65258c4b13a3d1d623e416bc7d4adc41..9c9b8fc6f3efcf009a3c6e2a9b159c0b821c3514 100644 --- a/src/cpu/sparc/vm/vm_version_sparc.hpp +++ b/src/cpu/sparc/vm/vm_version_sparc.hpp @@ -121,6 +121,7 @@ public: // Returns true if the platform is in the niagara line (T series) // and newer than the niagara1. static bool is_niagara_plus() { return is_T_family(_features) && !is_T1_model(_features); } + static bool is_T4() { return is_T_family(_features) && has_cbcond(); } // Fujitsu SPARC64 static bool is_sparc64() { return (_features & sparc64_family_m) != 0; } @@ -130,13 +131,17 @@ public: static bool has_fast_fxtof() { return is_niagara() || is_sparc64() || has_v9() && !is_ultra3(); } static bool has_fast_idiv() { return is_niagara_plus() || is_sparc64(); } + // T4 and newer Sparc have fast RDPC instruction. - static bool has_fast_rdpc() { return is_niagara_plus() && has_cbcond(); } + static bool has_fast_rdpc() { return is_T4(); } + + // T4 and newer Sparc have Most-Recently-Used (MRU) BIS. + static bool has_mru_blk_init() { return has_blk_init() && is_T4(); } static const char* cpu_features() { return _features_str; } - static intx L1_data_cache_line_size() { - return 64; // default prefetch block size on sparc + static intx prefetch_data_size() { + return is_T4() ? 
32 : 64; // default prefetch block size on sparc } // Prefetch diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp index 639d0000ebd596e29fa3471fb2ceac42f4bd2d0e..af2c3a96e21c73cc2d502413ad8a38b6f4d5644b 100644 --- a/src/cpu/x86/vm/assembler_x86.cpp +++ b/src/cpu/x86/vm/assembler_x86.cpp @@ -2315,7 +2315,7 @@ void Assembler::prefetchnta(Address src) { } void Assembler::prefetchr(Address src) { - NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support")); + assert(VM_Version::supports_3dnow_prefetch(), "must support"); InstructionMark im(this); prefetch_prefix(src); emit_byte(0x0D); @@ -2347,7 +2347,7 @@ void Assembler::prefetcht2(Address src) { } void Assembler::prefetchw(Address src) { - NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support")); + assert(VM_Version::supports_3dnow_prefetch(), "must support"); InstructionMark im(this); prefetch_prefix(src); emit_byte(0x0D); diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp index d140f36fe20d97535925886601b7de672257903b..8a75f68ac2732a61ea398b49bfdcb09ff0e070ab 100644 --- a/src/cpu/x86/vm/vm_version_x86.cpp +++ b/src/cpu/x86/vm/vm_version_x86.cpp @@ -557,14 +557,16 @@ void VM_Version::get_processor_features() { if( !supports_sse() && supports_3dnow_prefetch() ) AllocatePrefetchInstr = 3; // Allocation prefetch settings - intx cache_line_size = L1_data_cache_line_size(); + intx cache_line_size = prefetch_data_size(); if( cache_line_size > AllocatePrefetchStepSize ) AllocatePrefetchStepSize = cache_line_size; - if( FLAG_IS_DEFAULT(AllocatePrefetchLines) ) - AllocatePrefetchLines = 3; // Optimistic value + assert(AllocatePrefetchLines > 0, "invalid value"); - if( AllocatePrefetchLines < 1 ) // set valid value in product VM - AllocatePrefetchLines = 1; // Conservative value + if( AllocatePrefetchLines < 1 ) // set valid value in product VM + AllocatePrefetchLines = 3; + assert(AllocateInstancePrefetchLines > 0, "invalid value"); + 
if( AllocateInstancePrefetchLines < 1 ) // set valid value in product VM + AllocateInstancePrefetchLines = 1; AllocatePrefetchDistance = allocate_prefetch_distance(); AllocatePrefetchStyle = allocate_prefetch_style(); @@ -601,10 +603,11 @@ void VM_Version::get_processor_features() { tty->print_cr("Logical CPUs per core: %u", logical_processors_per_package()); tty->print_cr("UseSSE=%d",UseSSE); - tty->print("Allocation: "); + tty->print("Allocation"); if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) { - tty->print_cr("no prefetching"); + tty->print_cr(": no prefetching"); } else { + tty->print(" prefetching: "); if (UseSSE == 0 && supports_3dnow_prefetch()) { tty->print("PREFETCHW"); } else if (UseSSE >= 1) { @@ -619,9 +622,9 @@ void VM_Version::get_processor_features() { } } if (AllocatePrefetchLines > 1) { - tty->print_cr(" %d, %d lines with step %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize); + tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize); } else { - tty->print_cr(" %d, one line", AllocatePrefetchDistance); + tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize); } } diff --git a/src/cpu/x86/vm/vm_version_x86.hpp b/src/cpu/x86/vm/vm_version_x86.hpp index ba268227690790c5d6bcddd3b9365435fb7ef3fa..a1fd5130d71d3db21e9dde088ac7f0bf18e46904 100644 --- a/src/cpu/x86/vm/vm_version_x86.hpp +++ b/src/cpu/x86/vm/vm_version_x86.hpp @@ -419,7 +419,7 @@ public: return result; } - static intx L1_data_cache_line_size() { + static intx prefetch_data_size() { intx result = 0; if (is_intel()) { result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1); diff --git a/src/cpu/x86/vm/x86_32.ad b/src/cpu/x86/vm/x86_32.ad index 0e075218990507986f7960ca694ad10339b11699..b40375234d83ad771cae4c3b4d64d2e8d9910a6e 100644 --- a/src/cpu/x86/vm/x86_32.ad +++ b/src/cpu/x86/vm/x86_32.ad @@ 
-7325,8 +7325,9 @@ instruct prefetchr( memory mem ) %{ ins_cost(100); format %{ "PREFETCHR $mem\t! Prefetch into level 1 cache for read" %} - opcode(0x0F, 0x0d); /* Opcode 0F 0d /0 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem)); + ins_encode %{ + __ prefetchr($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -7336,8 +7337,9 @@ instruct prefetchrNTA( memory mem ) %{ ins_cost(100); format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem)); + ins_encode %{ + __ prefetchnta($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -7347,8 +7349,9 @@ instruct prefetchrT0( memory mem ) %{ ins_cost(100); format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem)); + ins_encode %{ + __ prefetcht0($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -7358,8 +7361,9 @@ instruct prefetchrT2( memory mem ) %{ ins_cost(100); format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem)); + ins_encode %{ + __ prefetcht2($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -7374,46 +7378,86 @@ instruct prefetchw0( memory mem ) %{ %} instruct prefetchw( memory mem ) %{ - predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch() || AllocatePrefetchInstr==3); + predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch()); match( PrefetchWrite mem ); ins_cost(100); format %{ "PREFETCHW $mem\t! Prefetch into L1 cache and mark modified" %} - opcode(0x0F, 0x0D); /* Opcode 0F 0D /1 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem)); + ins_encode %{ + __ prefetchw($mem$$Address); + %} ins_pipe(ialu_mem); %} instruct prefetchwNTA( memory mem ) %{ - predicate(UseSSE>=1 && AllocatePrefetchInstr==0); + predicate(UseSSE>=1); match(PrefetchWrite mem); ins_cost(100); format %{ "PREFETCHNTA $mem\t! 
Prefetch into non-temporal cache for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem)); + ins_encode %{ + __ prefetchnta($mem$$Address); + %} + ins_pipe(ialu_mem); +%} + +// Prefetch instructions for allocation. + +instruct prefetchAlloc0( memory mem ) %{ + predicate(UseSSE==0 && AllocatePrefetchInstr!=3); + match(PrefetchAllocation mem); + ins_cost(0); + size(0); + format %{ "Prefetch allocation (non-SSE is empty encoding)" %} + ins_encode(); + ins_pipe(empty); +%} + +instruct prefetchAlloc( memory mem ) %{ + predicate(AllocatePrefetchInstr==3); + match( PrefetchAllocation mem ); + ins_cost(100); + + format %{ "PREFETCHW $mem\t! Prefetch allocation into L1 cache and mark modified" %} + ins_encode %{ + __ prefetchw($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwT0( memory mem ) %{ +instruct prefetchAllocNTA( memory mem ) %{ + predicate(UseSSE>=1 && AllocatePrefetchInstr==0); + match(PrefetchAllocation mem); + ins_cost(100); + + format %{ "PREFETCHNTA $mem\t! Prefetch allocation into non-temporal cache for write" %} + ins_encode %{ + __ prefetchnta($mem$$Address); + %} + ins_pipe(ialu_mem); +%} + +instruct prefetchAllocT0( memory mem ) %{ predicate(UseSSE>=1 && AllocatePrefetchInstr==1); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(100); - format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem)); + format %{ "PREFETCHT0 $mem\t! Prefetch allocation into L1 and L2 caches for write" %} + ins_encode %{ + __ prefetcht0($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwT2( memory mem ) %{ +instruct prefetchAllocT2( memory mem ) %{ predicate(UseSSE>=1 && AllocatePrefetchInstr==2); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(100); - format %{ "PREFETCHT2 $mem\t! 
Prefetch into L2 cache for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem)); + format %{ "PREFETCHT2 $mem\t! Prefetch allocation into L2 cache for write" %} + ins_encode %{ + __ prefetcht2($mem$$Address); + %} ins_pipe(ialu_mem); %} diff --git a/src/cpu/x86/vm/x86_64.ad b/src/cpu/x86/vm/x86_64.ad index c1220fc883c0691012962b431e2923c1953ed39c..4f6fa3cbe482f24a23d3d5cf75148983d53844b7 100644 --- a/src/cpu/x86/vm/x86_64.ad +++ b/src/cpu/x86/vm/x86_64.ad @@ -6617,8 +6617,9 @@ instruct prefetchr( memory mem ) %{ ins_cost(125); format %{ "PREFETCHR $mem\t# Prefetch into level 1 cache" %} - opcode(0x0F, 0x0D); /* Opcode 0F 0D /0 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem)); + ins_encode %{ + __ prefetchr($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -6628,8 +6629,9 @@ instruct prefetchrNTA( memory mem ) %{ ins_cost(125); format %{ "PREFETCHNTA $mem\t# Prefetch into non-temporal cache for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem)); + ins_encode %{ + __ prefetchnta($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -6639,8 +6641,9 @@ instruct prefetchrT0( memory mem ) %{ ins_cost(125); format %{ "PREFETCHT0 $mem\t# prefetch into L1 and L2 caches for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem)); + ins_encode %{ + __ prefetcht0($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -6650,52 +6653,70 @@ instruct prefetchrT2( memory mem ) %{ ins_cost(125); format %{ "PREFETCHT2 $mem\t# prefetch into L2 caches for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem)); + ins_encode %{ + __ prefetcht2($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchw( memory mem ) %{ - predicate(AllocatePrefetchInstr==3); +instruct prefetchwNTA( memory mem ) %{ match(PrefetchWrite mem); ins_cost(125); - format %{ "PREFETCHW 
$mem\t# Prefetch into level 1 cache and mark modified" %} - opcode(0x0F, 0x0D); /* Opcode 0F 0D /1 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem)); + format %{ "PREFETCHNTA $mem\t# Prefetch to non-temporal cache for write" %} + ins_encode %{ + __ prefetchnta($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwNTA( memory mem ) %{ +// Prefetch instructions for allocation. + +instruct prefetchAlloc( memory mem ) %{ + predicate(AllocatePrefetchInstr==3); + match(PrefetchAllocation mem); + ins_cost(125); + + format %{ "PREFETCHW $mem\t# Prefetch allocation into level 1 cache and mark modified" %} + ins_encode %{ + __ prefetchw($mem$$Address); + %} + ins_pipe(ialu_mem); +%} + +instruct prefetchAllocNTA( memory mem ) %{ predicate(AllocatePrefetchInstr==0); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(125); - format %{ "PREFETCHNTA $mem\t# Prefetch to non-temporal cache for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem)); + format %{ "PREFETCHNTA $mem\t# Prefetch allocation to non-temporal cache for write" %} + ins_encode %{ + __ prefetchnta($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwT0( memory mem ) %{ +instruct prefetchAllocT0( memory mem ) %{ predicate(AllocatePrefetchInstr==1); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(125); - format %{ "PREFETCHT0 $mem\t# Prefetch to level 1 and 2 caches for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem)); + format %{ "PREFETCHT0 $mem\t# Prefetch allocation to level 1 and 2 caches for write" %} + ins_encode %{ + __ prefetcht0($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwT2( memory mem ) %{ +instruct prefetchAllocT2( memory mem ) %{ predicate(AllocatePrefetchInstr==2); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(125); - format %{ "PREFETCHT2 $mem\t# Prefetch 
to level 2 cache for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem)); + format %{ "PREFETCHT2 $mem\t# Prefetch allocation to level 2 cache for write" %} + ins_encode %{ + __ prefetcht2($mem$$Address); + %} ins_pipe(ialu_mem); %} diff --git a/src/share/vm/adlc/formssel.cpp b/src/share/vm/adlc/formssel.cpp index d0363a4d57e52d39089e62f60a79fc933c5eb70a..85118e87fe75b48a3c809752cfc4bbdd1fc19bf4 100644 --- a/src/share/vm/adlc/formssel.cpp +++ b/src/share/vm/adlc/formssel.cpp @@ -3390,7 +3390,9 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const { "ClearArray" }; int cnt = sizeof(needs_ideal_memory_list)/sizeof(char*); - if( strcmp(_opType,"PrefetchRead")==0 || strcmp(_opType,"PrefetchWrite")==0 ) + if( strcmp(_opType,"PrefetchRead")==0 || + strcmp(_opType,"PrefetchWrite")==0 || + strcmp(_opType,"PrefetchAllocation")==0 ) return 1; if( _lChild ) { const char *opType = _lChild->_opType; diff --git a/src/share/vm/memory/threadLocalAllocBuffer.hpp b/src/share/vm/memory/threadLocalAllocBuffer.hpp index 4b88835db72211d9a328f889b5e08158d80704dc..1b8bb0a3ca7020e3ee1581ebf4fbb22984447be0 100644 --- a/src/share/vm/memory/threadLocalAllocBuffer.hpp +++ b/src/share/vm/memory/threadLocalAllocBuffer.hpp @@ -124,16 +124,7 @@ public: // Reserve space at the end of TLAB static size_t end_reserve() { int reserve_size = typeArrayOopDesc::header_size(T_INT); - if (AllocatePrefetchStyle == 3) { - // BIS is used to prefetch - we need a space for it. 
- // +1 for rounding up to next cache line +1 to be safe - int lines = AllocatePrefetchLines + 2; - int step_size = AllocatePrefetchStepSize; - int distance = AllocatePrefetchDistance; - int prefetch_end = (distance + step_size*lines)/(int)HeapWordSize; - reserve_size = MAX2(reserve_size, prefetch_end); - } - return reserve_size; + return MAX2(reserve_size, VM_Version::reserve_for_allocation_prefetch()); } static size_t alignment_reserve() { return align_object_size(end_reserve()); } static size_t alignment_reserve_in_bytes() { return alignment_reserve() * HeapWordSize; } diff --git a/src/share/vm/opto/classes.hpp b/src/share/vm/opto/classes.hpp index d60637e3da4306abedfc0cdb938c86a4df7e5d44..dbece7611c18dc0d08798c189db107a3fbd8a1dc 100644 --- a/src/share/vm/opto/classes.hpp +++ b/src/share/vm/opto/classes.hpp @@ -196,6 +196,7 @@ macro(Phi) macro(PopCountI) macro(PopCountL) macro(PowD) +macro(PrefetchAllocation) macro(PrefetchRead) macro(PrefetchWrite) macro(Proj) diff --git a/src/share/vm/opto/macro.cpp b/src/share/vm/opto/macro.cpp index 61c113bc3b5854a760aeab0feef297deeaace973..1e12e36602fbeda8136996a07785e123fdca69dd 100644 --- a/src/share/vm/opto/macro.cpp +++ b/src/share/vm/opto/macro.cpp @@ -1590,7 +1590,7 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false, prefetch_adr = new (C, 4) AddPNode( old_pf_wm, new_pf_wmt, _igvn.MakeConX(distance) ); transform_later(prefetch_adr); - prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr ); + prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr ); transform_later(prefetch); distance += step_size; i_o = prefetch; @@ -1611,13 +1611,14 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false, contended_phi_rawmem = pf_phi_rawmem; i_o = pf_phi_abio; } else if( UseTLAB && AllocatePrefetchStyle == 3 ) { - // Insert a prefetch for each allocation only on the fast-path + // Insert a prefetch for each allocation. + // This code is used for Sparc with BIS. 
Node *pf_region = new (C, 3) RegionNode(3); Node *pf_phi_rawmem = new (C, 3) PhiNode( pf_region, Type::MEMORY, TypeRawPtr::BOTTOM ); - // Generate several prefetch instructions only for arrays. - uint lines = (length != NULL) ? AllocatePrefetchLines : 1; + // Generate several prefetch instructions. + uint lines = (length != NULL) ? AllocatePrefetchLines : AllocateInstancePrefetchLines; uint step_size = AllocatePrefetchStepSize; uint distance = AllocatePrefetchDistance; @@ -1634,7 +1635,7 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false, transform_later(cache_adr); // Prefetch - Node *prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, cache_adr ); + Node *prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, cache_adr ); prefetch->set_req(0, needgc_false); transform_later(prefetch); contended_phi_rawmem = prefetch; @@ -1644,7 +1645,7 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false, prefetch_adr = new (C, 4) AddPNode( cache_adr, cache_adr, _igvn.MakeConX(distance) ); transform_later(prefetch_adr); - prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, prefetch_adr ); + prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, prefetch_adr ); transform_later(prefetch); distance += step_size; contended_phi_rawmem = prefetch; @@ -1653,15 +1654,15 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false, // Insert a prefetch for each allocation only on the fast-path Node *prefetch_adr; Node *prefetch; - // Generate several prefetch instructions only for arrays. - uint lines = (length != NULL) ? AllocatePrefetchLines : 1; + // Generate several prefetch instructions. + uint lines = (length != NULL) ? 
AllocatePrefetchLines : AllocateInstancePrefetchLines; uint step_size = AllocatePrefetchStepSize; uint distance = AllocatePrefetchDistance; for ( uint i = 0; i < lines; i++ ) { prefetch_adr = new (C, 4) AddPNode( old_eden_top, new_eden_top, _igvn.MakeConX(distance) ); transform_later(prefetch_adr); - prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr ); + prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr ); // Do not let it float too high, since if eden_top == eden_end, // both might be null. if( i == 0 ) { // Set control for first prefetch, next follows it diff --git a/src/share/vm/opto/matcher.cpp b/src/share/vm/opto/matcher.cpp index 7a1358c744006055ca6f40cc9f7cded56b64a7d9..1faf3b90808adac3f73839cc03015cedfe301d2d 100644 --- a/src/share/vm/opto/matcher.cpp +++ b/src/share/vm/opto/matcher.cpp @@ -826,6 +826,7 @@ static void match_alias_type(Compile* C, Node* n, Node* m) { switch (n->Opcode()) { case Op_PrefetchRead: case Op_PrefetchWrite: + case Op_PrefetchAllocation: nidx = Compile::AliasIdxRaw; nat = TypeRawPtr::BOTTOM; break; diff --git a/src/share/vm/opto/memnode.hpp b/src/share/vm/opto/memnode.hpp index 5a8f3b5bc7091b1e536aba619eab559dea3c4cb9..a9eacdd8c9bb4461cc05d58835d937e7dd5b78db 100644 --- a/src/share/vm/opto/memnode.hpp +++ b/src/share/vm/opto/memnode.hpp @@ -1278,6 +1278,16 @@ public: virtual int Opcode() const; virtual uint ideal_reg() const { return NotAMachineReg; } virtual uint match_edge(uint idx) const { return idx==2; } + virtual const Type *bottom_type() const { return Type::ABIO; } +}; + +// Allocation prefetch which may fault, TLAB size has to be adjusted. +class PrefetchAllocationNode : public Node { +public: + PrefetchAllocationNode(Node *mem, Node *adr) : Node(0,mem,adr) {} + virtual int Opcode() const; + virtual uint ideal_reg() const { return NotAMachineReg; } + virtual uint match_edge(uint idx) const { return idx==2; } virtual const Type *bottom_type() const { return ( AllocatePrefetchStyle == 3 ) ? 
Type::MEMORY : Type::ABIO; } }; diff --git a/src/share/vm/runtime/globals.hpp b/src/share/vm/runtime/globals.hpp index 42981e5e47381b5082fabe729cc9bf0bbc36cec1..e044933344ad557fde78b640b85dc510d83877cd 100644 --- a/src/share/vm/runtime/globals.hpp +++ b/src/share/vm/runtime/globals.hpp @@ -2897,8 +2897,11 @@ class CommandLineFlags { product(intx, AllocatePrefetchDistance, -1, \ "Distance to prefetch ahead of allocation pointer") \ \ - product(intx, AllocatePrefetchLines, 1, \ - "Number of lines to prefetch ahead of allocation pointer") \ + product(intx, AllocatePrefetchLines, 3, \ + "Number of lines to prefetch ahead of array allocation pointer") \ + \ + product(intx, AllocateInstancePrefetchLines, 1, \ + "Number of lines to prefetch ahead of instance allocation pointer") \ \ product(intx, AllocatePrefetchStepSize, 16, \ "Step size in bytes of sequential prefetch instructions") \ diff --git a/src/share/vm/runtime/vm_version.cpp b/src/share/vm/runtime/vm_version.cpp index cd44c03da2697dbb3df0fa65bec8c39ad9d718cc..524ec49c836afb180fe803036eca6e75a5d0ccef 100644 --- a/src/share/vm/runtime/vm_version.cpp +++ b/src/share/vm/runtime/vm_version.cpp @@ -46,6 +46,7 @@ const char* Abstract_VM_Version::_s_vm_release = Abstract_VM_Version::vm_release const char* Abstract_VM_Version::_s_internal_vm_info_string = Abstract_VM_Version::internal_vm_info_string(); bool Abstract_VM_Version::_supports_cx8 = false; unsigned int Abstract_VM_Version::_logical_processors_per_package = 1U; +int Abstract_VM_Version::_reserve_for_allocation_prefetch = 0; #ifndef HOTSPOT_RELEASE_VERSION #error HOTSPOT_RELEASE_VERSION must be defined diff --git a/src/share/vm/runtime/vm_version.hpp b/src/share/vm/runtime/vm_version.hpp index 6fcbb6ac4df5d1ba2119fcb4be97dd3e76c7341a..60e71424f228c1cfaf01b977e8437cd2335548e8 100644 --- a/src/share/vm/runtime/vm_version.hpp +++ b/src/share/vm/runtime/vm_version.hpp @@ -44,6 +44,7 @@ class Abstract_VM_Version: AllStatic { static bool _initialized; static int 
_parallel_worker_threads; static bool _parallel_worker_threads_initialized; + static int _reserve_for_allocation_prefetch; static unsigned int nof_parallel_worker_threads(unsigned int num, unsigned int dem, @@ -77,6 +78,12 @@ class Abstract_VM_Version: AllStatic { return _logical_processors_per_package; } + // Need a space at the end of TLAB for prefetch instructions + // which may fault when accessing memory outside of heap. + static int reserve_for_allocation_prefetch() { + return _reserve_for_allocation_prefetch; + } + // ARCH specific policy for the BiasedLocking static bool use_biased_locking() { return true; }