From db6f7d55c1235073366760fd5caf97644d2fa160 Mon Sep 17 00:00:00 2001
From: kvn
Date: Fri, 26 Aug 2011 08:52:22 -0700
Subject: [PATCH] 7059037: Use BIS for zeroing on T4

Summary: Use BIS for zeroing newly allocated big (2KB and larger) objects and arrays.
Reviewed-by: never, twisti, ysr
---
 src/cpu/sparc/vm/assembler_sparc.cpp          | 62 ++++++++++++
 src/cpu/sparc/vm/assembler_sparc.hpp          | 14 ++-
 src/cpu/sparc/vm/copy_sparc.hpp               |  8 ++
 src/cpu/sparc/vm/sparc.ad                     | 97 ++++++++++++++-----
 src/cpu/sparc/vm/stubGenerator_sparc.cpp      | 32 ++++++
 src/cpu/sparc/vm/templateTable_sparc.cpp      |  8 +-
 src/cpu/sparc/vm/vm_version_sparc.cpp         | 10 ++
 src/cpu/sparc/vm/vm_version_sparc.hpp         |  4 +-
 src/share/vm/gc_interface/collectedHeap.cpp   | 10 +-
 .../vm/gc_interface/collectedHeap.inline.hpp  |  5 +-
 src/share/vm/oops/cpCacheKlass.cpp            |  6 +-
 src/share/vm/runtime/globals.hpp              |  6 ++
 src/share/vm/runtime/stubRoutines.cpp         |  1 +
 src/share/vm/runtime/stubRoutines.hpp         |  4 +
 14 files changed, 232 insertions(+), 35 deletions(-)

diff --git a/src/cpu/sparc/vm/assembler_sparc.cpp b/src/cpu/sparc/vm/assembler_sparc.cpp
index 4e5d90eaa..3ee5b72ca 100644
--- a/src/cpu/sparc/vm/assembler_sparc.cpp
+++ b/src/cpu/sparc/vm/assembler_sparc.cpp
@@ -4973,3 +4973,65 @@ void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
   // Caller should set it:
   //   add(G0, 1, result); // equals
 }
+
+// Use BIS for zeroing (count is in bytes).
+void MacroAssembler::bis_zeroing(Register to, Register count, Register temp, Label& Ldone) {
+  assert(UseBlockZeroing && VM_Version::has_block_zeroing(), "only works with BIS zeroing");
+  Register end = count;
+  int cache_line_size = VM_Version::prefetch_data_size();
+  // Minimum count when BIS zeroing can be used since
+  // it needs membar which is expensive.
+  int block_zero_size = MAX2(cache_line_size*3, (int)BlockZeroingLowLimit);
+
+  Label small_loop;
+  // Check if count is negative (dead code) or zero.
+  // Note, count uses 64bit in 64 bit VM.
+  cmp_and_brx_short(count, 0, Assembler::lessEqual, Assembler::pn, Ldone);
+
+  // Use BIS zeroing only for big arrays since it requires membar.
+  if (Assembler::is_simm13(block_zero_size)) { // < 4096
+    cmp(count, block_zero_size);
+  } else {
+    set(block_zero_size, temp);
+    cmp(count, temp);
+  }
+  br(Assembler::lessUnsigned, false, Assembler::pt, small_loop);
+  delayed()->add(to, count, end);
+
+  // Note: size is >= three (32 bytes) cache lines.
+
+  // Clean the beginning of space up to next cache line.
+  for (int offs = 0; offs < cache_line_size; offs += 8) {
+    stx(G0, to, offs);
+  }
+
+  // align to next cache line
+  add(to, cache_line_size, to);
+  and3(to, -cache_line_size, to);
+
+  // Note: size left >= two (32 bytes) cache lines.
+
+  // BIS should not be used to zero tail (64 bytes)
+  // to avoid zeroing a header of the following object.
+  sub(end, (cache_line_size*2)-8, end);
+
+  Label bis_loop;
+  bind(bis_loop);
+  stxa(G0, to, G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
+  add(to, cache_line_size, to);
+  cmp_and_brx_short(to, end, Assembler::lessUnsigned, Assembler::pt, bis_loop);
+
+  // BIS needs membar.
+  membar(Assembler::StoreLoad);
+
+  add(end, (cache_line_size*2)-8, end); // restore end
+  cmp_and_brx_short(to, end, Assembler::greaterEqualUnsigned, Assembler::pn, Ldone);
+
+  // Clean the tail.
+  bind(small_loop);
+  stx(G0, to, 0);
+  add(to, 8, to);
+  cmp_and_brx_short(to, end, Assembler::lessUnsigned, Assembler::pt, small_loop);
+  nop(); // Separate short branches
+}
+
diff --git a/src/cpu/sparc/vm/assembler_sparc.hpp b/src/cpu/sparc/vm/assembler_sparc.hpp
index 4462ee5d0..f82d29a4a 100644
--- a/src/cpu/sparc/vm/assembler_sparc.hpp
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp
@@ -885,8 +885,9 @@ class Assembler : public AbstractAssembler {
   }
 
   enum ASIs { // page 72, v9
-    ASI_PRIMARY        = 0x80,
-    ASI_PRIMARY_LITTLE = 0x88,
+    ASI_PRIMARY            = 0x80,
+    ASI_PRIMARY_NOFAULT    = 0x82,
+    ASI_PRIMARY_LITTLE     = 0x88,
     // Block initializing store
     ASI_ST_BLKINIT_PRIMARY = 0xE2,
     // Most-Recently-Used (MRU) BIS variant
@@ -1786,9 +1787,12 @@ public:
                                             rs1(s) |
                                             op3(wrreg_op3) |
                                             u_field(2, 29, 25) |
-                                            u_field(1, 13, 13) |
+                                            immed(true) |
                                             simm(simm13a, 13)); }
-  inline void wrasi(  Register d) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(3, 29, 25)); }
+  inline void wrasi(Register d) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(3, 29, 25)); }
+  // wrasi(d, imm) stores (d xor imm) to asi
+  inline void wrasi(Register d, int simm13a) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) |
+                                               u_field(3, 29, 25) | immed(true) | simm(simm13a, 13)); }
   inline void wrfprs( Register d) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); }
 
 
@@ -2631,6 +2635,8 @@ public:
   void char_arrays_equals(Register ary1, Register ary2,
                           Register limit, Register result,
                           Register chr1, Register chr2, Label& Ldone);
+  // Use BIS for zeroing
+  void bis_zeroing(Register to, Register count, Register temp, Label& Ldone);
 
 #undef VIRTUAL
 
diff --git a/src/cpu/sparc/vm/copy_sparc.hpp b/src/cpu/sparc/vm/copy_sparc.hpp
index 0267d25a6..176ed041c 100644
--- a/src/cpu/sparc/vm/copy_sparc.hpp
+++ b/src/cpu/sparc/vm/copy_sparc.hpp
@@ -156,9 +156,16 @@ static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
 #endif // _LP64
 }
 
+typedef void (*_zero_Fn)(HeapWord* to, size_t count);
+
 static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
   assert(MinObjAlignmentInBytes >= BytesPerLong, "need alternate implementation");
 
+  if (value == 0 && UseBlockZeroing &&
+      (count > (BlockZeroingLowLimit >> LogHeapWordSize))) {
+    // Call it only when block zeroing is used
+    ((_zero_Fn)StubRoutines::zero_aligned_words())(tohw, count);
+  } else {
   julong* to = (julong*)tohw;
   julong  v  = ((julong)value << 32) | value;
   // If count is odd, odd will be equal to 1 on 32-bit platform
@@ -176,6 +183,7 @@ static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value)
     *((juint*)to) = value;
   }
+  }
 }
 
 static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
diff --git a/src/cpu/sparc/vm/sparc.ad b/src/cpu/sparc/vm/sparc.ad
index cd57f2c7d..b999716c0 100644
--- a/src/cpu/sparc/vm/sparc.ad
+++ b/src/cpu/sparc/vm/sparc.ad
@@ -460,6 +460,8 @@ source_hpp %{
 // Must be visible to the DFA in dfa_sparc.cpp
 extern bool can_branch_register( Node *bol, Node *cmp );
 
+extern bool use_block_zeroing(Node* count);
+
 // Macros to extract hi & lo halves from a long pair.
 // G0 is not part of any long pair, so assert on that.
 // Prevents accidentally using G1 instead of G0.
@@ -521,6 +523,12 @@ bool can_branch_register( Node *bol, Node *cmp ) {
   return false;
 }
 
+bool use_block_zeroing(Node* count) {
+  // Use BIS for zeroing if count is not constant
+  // or it is >= BlockZeroingLowLimit.
+  return UseBlockZeroing && (count->find_intptr_t_con(BlockZeroingLowLimit) >= BlockZeroingLowLimit);
+}
+
 // ****************************************************************************
 // REQUIRED FUNCTIONALITY
 
@@ -2810,25 +2818,6 @@ enc_class Fast_Unlock(iRegP oop, iRegP box, o7RegP scratch, iRegP scratch2) %{
     __ float_cmp( $primary, -1, Fsrc1, Fsrc2, Rdst);
   %}
 
-  // Compiler ensures base is doubleword aligned and cnt is count of doublewords
-  enc_class enc_Clear_Array(iRegX cnt, iRegP base, iRegX temp) %{
-    MacroAssembler _masm(&cbuf);
-    Register nof_bytes_arg    = reg_to_register_object($cnt$$reg);
-    Register nof_bytes_tmp    = reg_to_register_object($temp$$reg);
-    Register base_pointer_arg = reg_to_register_object($base$$reg);
-
-    Label loop;
-    __ mov(nof_bytes_arg, nof_bytes_tmp);
-
-    // Loop and clear, walking backwards through the array.
-    // nof_bytes_tmp (if >0) is always the number of bytes to zero
-    __ bind(loop);
-    __ deccc(nof_bytes_tmp, 8);
-    __ br(Assembler::greaterEqual, true, Assembler::pt, loop);
-    __ delayed()-> stx(G0, base_pointer_arg, nof_bytes_tmp);
-    // %%%% this mini-loop must not cross a cache boundary!
-  %}
-
   enc_class enc_String_Compare(o0RegP str1, o1RegP str2, g3RegI cnt1, g4RegI cnt2, notemp_iRegI result) %{
     Label Ldone, Lloop;
 
@@ -10257,9 +10246,9 @@ instruct cmpFastUnlock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, o
   ins_pipe(long_memory_op);
 %}
 
-// Count and Base registers are fixed because the allocator cannot
-// kill unknown registers.  The encodings are generic.
+// The encodings are generic.
 instruct clear_array(iRegX cnt, iRegP base, iRegX temp, Universe dummy, flagsReg ccr) %{
+  predicate(!use_block_zeroing(n->in(2)));
   match(Set dummy (ClearArray cnt base));
   effect(TEMP temp, KILL ccr);
   ins_cost(300);
@@ -10267,7 +10256,71 @@ instruct clear_array(iRegX cnt, iRegP base, iRegX temp, Universe dummy, flagsReg
          "loop:   SUBcc  $temp,8,$temp\t! Count down a dword of bytes\n"
          "        BRge   loop\t\t! Clearing loop\n"
          "        STX    G0,[$base+$temp]\t! delay slot" %}
-  ins_encode( enc_Clear_Array(cnt, base, temp) );
+
+  ins_encode %{
+    // Compiler ensures base is doubleword aligned and cnt is count of doublewords
+    Register nof_bytes_arg    = $cnt$$Register;
+    Register nof_bytes_tmp    = $temp$$Register;
+    Register base_pointer_arg = $base$$Register;
+
+    Label loop;
+    __ mov(nof_bytes_arg, nof_bytes_tmp);
+
+    // Loop and clear, walking backwards through the array.
+    // nof_bytes_tmp (if >0) is always the number of bytes to zero
+    __ bind(loop);
+    __ deccc(nof_bytes_tmp, 8);
+    __ br(Assembler::greaterEqual, true, Assembler::pt, loop);
+    __ delayed()-> stx(G0, base_pointer_arg, nof_bytes_tmp);
+    // %%%% this mini-loop must not cross a cache boundary!
+  %}
+  ins_pipe(long_memory_op);
+%}
+
+instruct clear_array_bis(g1RegX cnt, o0RegP base, Universe dummy, flagsReg ccr) %{
+  predicate(use_block_zeroing(n->in(2)));
+  match(Set dummy (ClearArray cnt base));
+  effect(USE_KILL cnt, USE_KILL base, KILL ccr);
+  ins_cost(300);
+  format %{ "CLEAR  [$base, $cnt]\t! ClearArray" %}
+
+  ins_encode %{
+
+    assert(MinObjAlignmentInBytes >= BytesPerLong, "need alternate implementation");
+    Register to    = $base$$Register;
+    Register count = $cnt$$Register;
+
+    Label Ldone;
+    __ nop(); // Separate short branches
+    // Use BIS for zeroing (temp is not used).
+    __ bis_zeroing(to, count, G0, Ldone);
+    __ bind(Ldone);
+
+  %}
+  ins_pipe(long_memory_op);
+%}
+
+instruct clear_array_bis_2(g1RegX cnt, o0RegP base, iRegX tmp, Universe dummy, flagsReg ccr) %{
+  predicate(use_block_zeroing(n->in(2)) && !Assembler::is_simm13((int)BlockZeroingLowLimit));
+  match(Set dummy (ClearArray cnt base));
+  effect(TEMP tmp, USE_KILL cnt, USE_KILL base, KILL ccr);
+  ins_cost(300);
+  format %{ "CLEAR  [$base, $cnt]\t! ClearArray" %}
+
+  ins_encode %{
+
+    assert(MinObjAlignmentInBytes >= BytesPerLong, "need alternate implementation");
+    Register to    = $base$$Register;
+    Register count = $cnt$$Register;
+    Register temp  = $tmp$$Register;
+
+    Label Ldone;
+    __ nop(); // Separate short branches
+    // Use BIS for zeroing
+    __ bis_zeroing(to, count, temp, Ldone);
+    __ bind(Ldone);
+
+  %}
   ins_pipe(long_memory_op);
 %}
 
diff --git a/src/cpu/sparc/vm/stubGenerator_sparc.cpp b/src/cpu/sparc/vm/stubGenerator_sparc.cpp
index 3c149bf94..9f6274d5a 100644
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp
@@ -3069,6 +3069,34 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  //
+  //  Generate stub for heap zeroing.
+  //  "to" address is aligned to jlong (8 bytes).
+  //
+  // Arguments for generated stub:
+  //      to:    O0
+  //      count: O1 treated as signed (count of HeapWord)
+  //             count could be 0
+  //
+  address generate_zero_aligned_words(const char* name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    const Register to    = O0;   // source array address
+    const Register count = O1;   // HeapWords count
+    const Register temp  = O2;   // scratch
+
+    Label Ldone;
+    __ sllx(count, LogHeapWordSize, count); // to bytes count
+    // Use BIS for zeroing
+    __ bis_zeroing(to, count, temp, Ldone);
+    __ bind(Ldone);
+    __ retl();
+    __ delayed()->nop();
+    return start;
+  }
+
   void generate_arraycopy_stubs() {
     address entry;
     address entry_jbyte_arraycopy;
@@ -3195,6 +3223,10 @@
     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
+
+    if (UseBlockZeroing) {
+      StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
+    }
   }
 
   void generate_initial() {
diff --git a/src/cpu/sparc/vm/templateTable_sparc.cpp b/src/cpu/sparc/vm/templateTable_sparc.cpp
index 25b2256ff..850dbe62a 100644
--- a/src/cpu/sparc/vm/templateTable_sparc.cpp
+++ b/src/cpu/sparc/vm/templateTable_sparc.cpp
@@ -3374,7 +3374,7 @@ void TemplateTable::_new() {
 
   if(UseTLAB) {
     Register RoldTopValue = RallocatedObject;
-    Register RtopAddr = G3_scratch, RtlabWasteLimitValue = G3_scratch;
+    Register RtlabWasteLimitValue = G3_scratch;
     Register RnewTopValue = G1_scratch;
     Register RendValue = Rscratch;
     Register RfreeValue = RnewTopValue;
@@ -3455,7 +3455,11 @@ void TemplateTable::_new() {
     __ delayed()->add(RallocatedObject, sizeof(oopDesc), G3_scratch);
 
     // initialize remaining object fields
-    { Label loop;
+    if (UseBlockZeroing) {
+      // Use BIS for zeroing
+      __ bis_zeroing(G3_scratch, Roffset, G1_scratch, initialize_header);
+    } else {
+      Label loop;
       __ subcc(Roffset, wordSize, Roffset);
       __ bind(loop);
       //__ subcc(Roffset, wordSize, Roffset);  // executed above loop or in delay slot
diff --git a/src/cpu/sparc/vm/vm_version_sparc.cpp b/src/cpu/sparc/vm/vm_version_sparc.cpp
index 6f9ca4ea9..2aae43c28 100644
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp
@@ -170,6 +170,16 @@ void VM_Version::initialize() {
     FLAG_SET_DEFAULT(UseCBCond, false);
   }
 
+  assert(BlockZeroingLowLimit > 0, "invalid value");
+  if (has_block_zeroing()) {
+    if (FLAG_IS_DEFAULT(UseBlockZeroing)) {
+      FLAG_SET_DEFAULT(UseBlockZeroing, true);
+    }
+  } else if (UseBlockZeroing) {
+    warning("BIS zeroing instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseBlockZeroing, false);
+  }
+
 #ifdef COMPILER2
   // T4 and newer Sparc cpus have fast RDPC.
   if (has_fast_rdpc() && FLAG_IS_DEFAULT(UseRDPCForConstantTableBase)) {
diff --git a/src/cpu/sparc/vm/vm_version_sparc.hpp b/src/cpu/sparc/vm/vm_version_sparc.hpp
index 9c9b8fc6f..db3343b51 100644
--- a/src/cpu/sparc/vm/vm_version_sparc.hpp
+++ b/src/cpu/sparc/vm/vm_version_sparc.hpp
@@ -135,8 +135,8 @@ public:
   // T4 and newer Sparc have fast RDPC instruction.
   static bool has_fast_rdpc()      { return is_T4(); }
 
-  // T4 and newer Sparc have Most-Recently-Used (MRU) BIS.
-  static bool has_mru_blk_init()   { return has_blk_init() && is_T4(); }
+  // On T4 and newer Sparc BIS to the beginning of cache line always zeros it.
+  static bool has_block_zeroing()  { return has_blk_init() && is_T4(); }
 
   static const char* cpu_features() { return _features_str; }
 
diff --git a/src/share/vm/gc_interface/collectedHeap.cpp b/src/share/vm/gc_interface/collectedHeap.cpp
index 6b99a8f62..1ead9eef7 100644
--- a/src/share/vm/gc_interface/collectedHeap.cpp
+++ b/src/share/vm/gc_interface/collectedHeap.cpp
@@ -157,8 +157,14 @@ HeapWord* CollectedHeap::allocate_from_tlab_slow(Thread* thread, size_t size) {
     // ..and clear it.
     Copy::zero_to_words(obj, new_tlab_size);
   } else {
-    // ...and clear just the allocated object.
-    Copy::zero_to_words(obj, size);
+    // ...and zap just allocated object.
+#ifdef ASSERT
+    // Skip mangling the space corresponding to the object header to
+    // ensure that the returned space is not considered parsable by
+    // any concurrent GC thread.
+    size_t hdr_size = oopDesc::header_size();
+    Copy::fill_to_words(obj + hdr_size, new_tlab_size - hdr_size, badHeapWordVal);
+#endif // ASSERT
   }
   thread->tlab().fill(obj, obj + size, new_tlab_size);
   return obj;
diff --git a/src/share/vm/gc_interface/collectedHeap.inline.hpp b/src/share/vm/gc_interface/collectedHeap.inline.hpp
index 68003b75b..cf9946d79 100644
--- a/src/share/vm/gc_interface/collectedHeap.inline.hpp
+++ b/src/share/vm/gc_interface/collectedHeap.inline.hpp
@@ -287,7 +287,10 @@ oop CollectedHeap::permanent_obj_allocate_no_klass_install(KlassHandle klass,
   assert(size >= 0, "int won't convert to size_t");
   HeapWord* obj = common_permanent_mem_allocate_init(size, CHECK_NULL);
   post_allocation_setup_no_klass_install(klass, obj, size);
-  NOT_PRODUCT(Universe::heap()->check_for_bad_heap_word_value(obj, size));
+#ifndef PRODUCT
+  const size_t hs = oopDesc::header_size();
+  Universe::heap()->check_for_bad_heap_word_value(obj+hs, size-hs);
+#endif
   return (oop)obj;
 }
 
diff --git a/src/share/vm/oops/cpCacheKlass.cpp b/src/share/vm/oops/cpCacheKlass.cpp
index 86d9dc083..843f095d8 100644
--- a/src/share/vm/oops/cpCacheKlass.cpp
+++ b/src/share/vm/oops/cpCacheKlass.cpp
@@ -63,8 +63,10 @@ constantPoolCacheOop constantPoolCacheKlass::allocate(int length,
   // CollectedHeap::permanent_obj_allocate(klass, size, CHECK_NULL);
   oop obj = CollectedHeap::permanent_obj_allocate_no_klass_install(klass, size, CHECK_NULL);
-  NOT_PRODUCT(Universe::heap()->check_for_bad_heap_word_value((HeapWord*) obj,
-                                                              size));
+#ifndef PRODUCT
+  const size_t hs = oopDesc::header_size();
+  Universe::heap()->check_for_bad_heap_word_value(((HeapWord*) obj)+hs, size-hs);
+#endif
   constantPoolCacheOop cache = (constantPoolCacheOop) obj;
   assert(!UseConcMarkSweepGC || obj->klass_or_null() == NULL,
          "klass should be NULL here when using CMS");
diff --git a/src/share/vm/runtime/globals.hpp b/src/share/vm/runtime/globals.hpp
index e04493334..905379a72 100644
--- a/src/share/vm/runtime/globals.hpp
+++ b/src/share/vm/runtime/globals.hpp
@@ -1979,6 +1979,12 @@ class CommandLineFlags {
   product(bool, TLABStats, true,                                            \
           "Print various TLAB related information")                         \
                                                                             \
+  product(bool, UseBlockZeroing, false,                                     \
+          "Use special cpu instructions for block zeroing")                 \
+                                                                            \
+  product(intx, BlockZeroingLowLimit, 2048,                                 \
+          "Minimum size in bytes when block zeroing will be used")          \
+                                                                            \
   product(bool, PrintRevisitStats, false,                                   \
           "Print revisit (klass and MDO) stack related information")        \
                                                                             \
diff --git a/src/share/vm/runtime/stubRoutines.cpp b/src/share/vm/runtime/stubRoutines.cpp
index 6dd3a9f17..5f77761a2 100644
--- a/src/share/vm/runtime/stubRoutines.cpp
+++ b/src/share/vm/runtime/stubRoutines.cpp
@@ -108,6 +108,7 @@ address StubRoutines::_arrayof_jlong_disjoint_arraycopy = CAST_FROM_FN_PTR(addr
 address StubRoutines::_arrayof_oop_disjoint_arraycopy        = CAST_FROM_FN_PTR(address, StubRoutines::arrayof_oop_copy);
 address StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = CAST_FROM_FN_PTR(address, StubRoutines::arrayof_oop_copy_uninit);
 
+address StubRoutines::_zero_aligned_words = CAST_FROM_FN_PTR(address, Copy::zero_to_words);
 address StubRoutines::_checkcast_arraycopy               = NULL;
 address StubRoutines::_checkcast_arraycopy_uninit        = NULL;
 
diff --git a/src/share/vm/runtime/stubRoutines.hpp b/src/share/vm/runtime/stubRoutines.hpp
index fff494e31..e2ca75c81 100644
--- a/src/share/vm/runtime/stubRoutines.hpp
+++ b/src/share/vm/runtime/stubRoutines.hpp
@@ -199,6 +199,9 @@ class StubRoutines: AllStatic {
   static address _arrayof_jshort_fill;
   static address _arrayof_jint_fill;
+
+  // zero heap space aligned to jlong (8 bytes)
+  static address _zero_aligned_words;
 
   // These are versions of the java.lang.Math methods which perform
   // the same operations as the intrinsic version.  They are used for
   // constant folding in the compiler to ensure equivalence.  If the
@@ -332,6 +335,7 @@ class StubRoutines: AllStatic {
   static address select_fill_function(BasicType t, bool aligned, const char* &name);
 
+  static address zero_aligned_words()   { return _zero_aligned_words; }
 
   static double  intrinsic_log(double d) {
     assert(_intrinsic_log != NULL, "must be defined");
 
-- 
GitLab
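
Note (not part of the patch): for readers who do not read SPARC assembly, the sketch below is a rough C++ model of the strategy bis_zeroing() implements: reject small requests, zero a head with ordinary stores, align to a cache line, zero whole lines (where the real code uses one BIS store per line), then finish the tail with ordinary stores. The helper name model_bis_zeroing, the 64-byte cache-line default, and the use of std::memset in place of stx/stxa are assumptions made for illustration only; the membar and the BIS ASI have no portable C++ equivalent and are noted in comments.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>

// count_bytes must be a multiple of 8 and 'to' 8-byte aligned, mirroring the
// patch's assumption that cnt is a count of doublewords.
static void model_bis_zeroing(void* to, size_t count_bytes,
                              size_t cache_line = 64, size_t low_limit = 2048) {
  uint8_t* p   = static_cast<uint8_t*>(to);
  uint8_t* end = p + count_bytes;

  // Small blocks: plain 8-byte store loop (the small_loop path); BIS is not
  // worth the trailing membar below this threshold.
  if (count_bytes < std::max(cache_line * 3, low_limit)) {
    for (; p < end; p += 8) std::memset(p, 0, 8);
    return;
  }

  // Head: zero one cache line with ordinary stores, then round p up to the
  // next cache-line boundary (the add + and3 pair in the assembly).
  std::memset(p, 0, cache_line);
  p = reinterpret_cast<uint8_t*>(
        (reinterpret_cast<uintptr_t>(p) + cache_line) & ~(uintptr_t)(cache_line - 1));

  // Body: whole cache lines. On T4 each iteration is a single BIS store
  // (stxa ... ASI_ST_BLKINIT_PRIMARY) that zeroes the line without fetching it.
  uint8_t* bis_end = end - (2 * cache_line - 8);  // keep a tail for ordinary stores
  for (; p < bis_end; p += cache_line) std::memset(p, 0, cache_line);
  // The real code issues membar(StoreLoad) here because BIS stores are weakly ordered.

  // Tail: ordinary 8-byte stores, so a whole-line block store never spills
  // into the header of the object that follows the cleared range.
  for (; p < end; p += 8) std::memset(p, 0, 8);
}

The tail handling mirrors the comment in bis_zeroing(): a block-initializing store always touches a full cache line, so running it to the very end of the range could wipe the header of the next heap object.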