7059037: Use BIS for zeroing on T4

Summary: Use BIS for zeroing new allocated big (2Kb and more) objects and arrays. Reviewed-by: never, twisti, ysr

7059037: Use BIS for zeroing on T4
Summary: Use BIS for zeroing new allocated big (2Kb and more) objects and arrays. Reviewed-by: never, twisti, ysr
db6f7d55 · kvn · 117f2450 · db6f7d55 · db6f7d55 · db6f7d55
14 changed file
--- a/src/cpu/sparc/vm/assembler_sparc.cpp
+++ b/src/cpu/sparc/vm/assembler_sparc.cpp
@@ -4973,3 +4973,65 @@ void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
  // Caller should set it:
  // add(G0, 1, result); // equals
 }
+
+// Use BIS for zeroing (count is in bytes).
+void MacroAssembler::bis_zeroing(Register to, Register count, Register temp, Label& Ldone) {
+  assert(UseBlockZeroing && VM_Version::has_block_zeroing(), "only works with BIS zeroing");
+  Register end = count;
+  int cache_line_size = VM_Version::prefetch_data_size();
+  // Minimum count when BIS zeroing can be used since
+  // it needs membar which is expensive.
+  int block_zero_size  = MAX2(cache_line_size*3, (int)BlockZeroingLowLimit);
+
+  Label small_loop;
+  // Check if count is negative (dead code) or zero.
+  // Note, count uses 64bit in 64 bit VM.
+  cmp_and_brx_short(count, 0, Assembler::lessEqual, Assembler::pn, Ldone);
+
+  // Use BIS zeroing only for big arrays since it requires membar.
+  if (Assembler::is_simm13(block_zero_size)) { // < 4096
+    cmp(count, block_zero_size);
+  } else {
+    set(block_zero_size, temp);
+    cmp(count, temp);
+  }
+  br(Assembler::lessUnsigned, false, Assembler::pt, small_loop);
+  delayed()->add(to, count, end);
+
+  // Note: size is >= three (32 bytes) cache lines.
+
+  // Clean the beginning of space up to next cache line.
+  for (int offs = 0; offs < cache_line_size; offs += 8) {
+    stx(G0, to, offs);
+  }
+
+  // align to next cache line
+  add(to, cache_line_size, to);
+  and3(to, -cache_line_size, to);
+
+  // Note: size left >= two (32 bytes) cache lines.
+
+  // BIS should not be used to zero tail (64 bytes)
+  // to avoid zeroing a header of the following object.
+  sub(end, (cache_line_size*2)-8, end);
+
+  Label bis_loop;
+  bind(bis_loop);
+  stxa(G0, to, G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
+  add(to, cache_line_size, to);
+  cmp_and_brx_short(to, end, Assembler::lessUnsigned, Assembler::pt, bis_loop);
+
+  // BIS needs membar.
+  membar(Assembler::StoreLoad);
+
+  add(end, (cache_line_size*2)-8, end); // restore end
+  cmp_and_brx_short(to, end, Assembler::greaterEqualUnsigned, Assembler::pn, Ldone);
+
+  // Clean the tail.
+  bind(small_loop);
+  stx(G0, to, 0);
+  add(to, 8, to);
+  cmp_and_brx_short(to, end, Assembler::lessUnsigned, Assembler::pt, small_loop);
+  nop(); // Separate short branches
+}
+
--- a/src/cpu/sparc/vm/assembler_sparc.hpp
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp
@@ -886,6 +886,7 @@ class Assembler : public AbstractAssembler  {

  enum ASIs { // page 72, v9
    ASI_PRIMARY            = 0x80,
+    ASI_PRIMARY_NOFAULT    = 0x82,
    ASI_PRIMARY_LITTLE     = 0x88,
    // Block initializing store
    ASI_ST_BLKINIT_PRIMARY = 0xE2,
@@ -1786,9 +1787,12 @@ public:
                                                                           rs1(s) |
                                                                           op3(wrreg_op3) |
                                                                           u_field(2, 29, 25) |
-                                                                           u_field(1, 13, 13) |
+                                                                           immed(true) |
                                                                           simm(simm13a, 13)); }
-  inline void wrasi(  Register d) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(3, 29, 25)); }
+  inline void wrasi(Register d) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(3, 29, 25)); }
+  // wrasi(d, imm) stores (d xor imm) to asi
+  inline void wrasi(Register d, int simm13a) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) |
+                                               u_field(3, 29, 25) | immed(true) | simm(simm13a, 13)); }
  inline void wrfprs( Register d) { v9_only(); emit_long( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); }


@@ -2631,6 +2635,8 @@ public:
  void char_arrays_equals(Register ary1, Register ary2,
                          Register limit, Register result,
                          Register chr1, Register chr2, Label& Ldone);
+  // Use BIS for zeroing
+  void bis_zeroing(Register to, Register count, Register temp, Label& Ldone);

 #undef VIRTUAL


--- a/src/cpu/sparc/vm/copy_sparc.hpp
+++ b/src/cpu/sparc/vm/copy_sparc.hpp
@@ -156,9 +156,16 @@ static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
 #endif // _LP64
 }

+typedef void (*_zero_Fn)(HeapWord* to, size_t count);
+
 static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  assert(MinObjAlignmentInBytes >= BytesPerLong, "need alternate implementation");

+  if (value == 0 && UseBlockZeroing &&
+      (count > (BlockZeroingLowLimit >> LogHeapWordSize))) {
+   // Call it only when block zeroing is used
+   ((_zero_Fn)StubRoutines::zero_aligned_words())(tohw, count);
+  } else {
   julong* to = (julong*)tohw;
   julong  v  = ((julong)value << 32) | value;
   // If count is odd, odd will be equal to 1 on 32-bit platform
@@ -176,6 +183,7 @@ static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value)
     *((juint*)to) = value;

   }
+  }
 }

 static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {

--- a/src/cpu/sparc/vm/sparc.ad
+++ b/src/cpu/sparc/vm/sparc.ad
@@ -460,6 +460,8 @@ source_hpp %{
 // Must be visible to the DFA in dfa_sparc.cpp
 extern bool can_branch_register( Node *bol, Node *cmp );

+extern bool use_block_zeroing(Node* count);
+
 // Macros to extract hi & lo halves from a long pair.
 // G0 is not part of any long pair, so assert on that.
 // Prevents accidentally using G1 instead of G0.
@@ -521,6 +523,12 @@ bool can_branch_register( Node *bol, Node *cmp ) {
  return false;
 }

+bool use_block_zeroing(Node* count) {
+  // Use BIS for zeroing if count is not constant
+  // or it is >= BlockZeroingLowLimit.
+  return UseBlockZeroing && (count->find_intptr_t_con(BlockZeroingLowLimit) >= BlockZeroingLowLimit);
+}
+
 // ****************************************************************************

 // REQUIRED FUNCTIONALITY
@@ -2810,25 +2818,6 @@ enc_class Fast_Unlock(iRegP oop, iRegP box, o7RegP scratch, iRegP scratch2) %{
    __ float_cmp( $primary, -1, Fsrc1, Fsrc2, Rdst);
  %}

-  // Compiler ensures base is doubleword aligned and cnt is count of doublewords
-  enc_class enc_Clear_Array(iRegX cnt, iRegP base, iRegX temp) %{
-    MacroAssembler _masm(&cbuf);
-    Register    nof_bytes_arg   = reg_to_register_object($cnt$$reg);
-    Register    nof_bytes_tmp    = reg_to_register_object($temp$$reg);
-    Register    base_pointer_arg = reg_to_register_object($base$$reg);
-  
-    Label loop;
-    __ mov(nof_bytes_arg, nof_bytes_tmp);
-  
-    // Loop and clear, walking backwards through the array.
-    // nof_bytes_tmp (if >0) is always the number of bytes to zero
-    __ bind(loop);
-    __ deccc(nof_bytes_tmp, 8);
-    __ br(Assembler::greaterEqual, true, Assembler::pt, loop);
-    __ delayed()-> stx(G0, base_pointer_arg, nof_bytes_tmp);
-    // %%%% this mini-loop must not cross a cache boundary!
-  %}
-

  enc_class enc_String_Compare(o0RegP str1, o1RegP str2, g3RegI cnt1, g4RegI cnt2, notemp_iRegI result) %{
    Label Ldone, Lloop;
@@ -10257,9 +10246,9 @@ instruct cmpFastUnlock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, o
  ins_pipe(long_memory_op);
 %}

-// Count and Base registers are fixed because the allocator cannot
-// kill unknown registers.  The encodings are generic.
+// The encodings are generic.
 instruct clear_array(iRegX cnt, iRegP base, iRegX temp, Universe dummy, flagsReg ccr) %{
+  predicate(!use_block_zeroing(n->in(2)) );
  match(Set dummy (ClearArray cnt base));
  effect(TEMP temp, KILL ccr);
  ins_cost(300);
@@ -10267,7 +10256,71 @@ instruct clear_array(iRegX cnt, iRegP base, iRegX temp, Universe dummy, flagsReg
    "loop:   SUBcc  $temp,8,$temp\t! Count down a dword of bytes\n"
    "        BRge   loop\t\t! Clearing loop\n"
    "        STX    G0,[$base+$temp]\t! delay slot" %}
-  ins_encode( enc_Clear_Array(cnt, base, temp) );
+
+  ins_encode %{
+    // Compiler ensures base is doubleword aligned and cnt is count of doublewords
+    Register nof_bytes_arg    = $cnt$$Register;
+    Register nof_bytes_tmp    = $temp$$Register;
+    Register base_pointer_arg = $base$$Register;
+
+    Label loop;
+    __ mov(nof_bytes_arg, nof_bytes_tmp);
+
+    // Loop and clear, walking backwards through the array.
+    // nof_bytes_tmp (if >0) is always the number of bytes to zero
+    __ bind(loop);
+    __ deccc(nof_bytes_tmp, 8);
+    __ br(Assembler::greaterEqual, true, Assembler::pt, loop);
+    __ delayed()-> stx(G0, base_pointer_arg, nof_bytes_tmp);
+    // %%%% this mini-loop must not cross a cache boundary!
+  %}
+  ins_pipe(long_memory_op);
+%}
+
+instruct clear_array_bis(g1RegX cnt, o0RegP base, Universe dummy, flagsReg ccr) %{
+  predicate(use_block_zeroing(n->in(2)));
+  match(Set dummy (ClearArray cnt base));
+  effect(USE_KILL cnt, USE_KILL base, KILL ccr);
+  ins_cost(300);
+  format %{ "CLEAR  [$base, $cnt]\t! ClearArray" %}
+
+  ins_encode %{
+
+    assert(MinObjAlignmentInBytes >= BytesPerLong, "need alternate implementation");
+    Register to    = $base$$Register;
+    Register count = $cnt$$Register;
+
+    Label Ldone;
+    __ nop(); // Separate short branches
+    // Use BIS for zeroing (temp is not used).
+    __ bis_zeroing(to, count, G0, Ldone);
+    __ bind(Ldone);
+
+  %}
+  ins_pipe(long_memory_op);
+%}
+
+instruct clear_array_bis_2(g1RegX cnt, o0RegP base, iRegX tmp, Universe dummy, flagsReg ccr) %{
+  predicate(use_block_zeroing(n->in(2)) && !Assembler::is_simm13((int)BlockZeroingLowLimit));
+  match(Set dummy (ClearArray cnt base));
+  effect(TEMP tmp, USE_KILL cnt, USE_KILL base, KILL ccr);
+  ins_cost(300);
+  format %{ "CLEAR  [$base, $cnt]\t! ClearArray" %}
+
+  ins_encode %{
+
+    assert(MinObjAlignmentInBytes >= BytesPerLong, "need alternate implementation");
+    Register to    = $base$$Register;
+    Register count = $cnt$$Register;
+    Register temp  = $tmp$$Register;
+
+    Label Ldone;
+    __ nop(); // Separate short branches
+    // Use BIS for zeroing
+    __ bis_zeroing(to, count, temp, Ldone);
+    __ bind(Ldone);
+
+  %}
  ins_pipe(long_memory_op);
 %}


--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp
@@ -3069,6 +3069,34 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

+  //
+  //  Generate stub for heap zeroing.
+  //  "to" address is aligned to jlong (8 bytes).
+  //
+  // Arguments for generated stub:
+  //      to:    O0
+  //      count: O1 treated as signed (count of HeapWord)
+  //             count could be 0
+  //
+  address generate_zero_aligned_words(const char* name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    const Register to    = O0;   // source array address
+    const Register count = O1;   // HeapWords count
+    const Register temp  = O2;   // scratch
+
+    Label Ldone;
+    __ sllx(count, LogHeapWordSize, count); // to bytes count
+    // Use BIS for zeroing
+    __ bis_zeroing(to, count, temp, Ldone);
+    __ bind(Ldone);
+    __ retl();
+    __ delayed()->nop();
+    return start;
+}
+
  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
@@ -3195,6 +3223,10 @@ class StubGenerator: public StubCodeGenerator {
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
+
+    if (UseBlockZeroing) {
+      StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
+    }
  }

  void generate_initial() {

--- a/src/cpu/sparc/vm/templateTable_sparc.cpp
+++ b/src/cpu/sparc/vm/templateTable_sparc.cpp
@@ -3374,7 +3374,7 @@ void TemplateTable::_new() {

  if(UseTLAB) {
    Register RoldTopValue = RallocatedObject;
-    Register RtopAddr = G3_scratch, RtlabWasteLimitValue = G3_scratch;
+    Register RtlabWasteLimitValue = G3_scratch;
    Register RnewTopValue = G1_scratch;
    Register RendValue = Rscratch;
    Register RfreeValue = RnewTopValue;
@@ -3455,7 +3455,11 @@ void TemplateTable::_new() {
    __ delayed()->add(RallocatedObject, sizeof(oopDesc), G3_scratch);

    // initialize remaining object fields
-    { Label loop;
+    if (UseBlockZeroing) {
+      // Use BIS for zeroing
+      __ bis_zeroing(G3_scratch, Roffset, G1_scratch, initialize_header);
+    } else {
+      Label loop;
      __ subcc(Roffset, wordSize, Roffset);
      __ bind(loop);
      //__ subcc(Roffset, wordSize, Roffset);      // executed above loop or in delay slot

--- a/src/cpu/sparc/vm/vm_version_sparc.cpp
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp
@@ -170,6 +170,16 @@ void VM_Version::initialize() {
    FLAG_SET_DEFAULT(UseCBCond, false);
  }

+  assert(BlockZeroingLowLimit > 0, "invalid value");
+  if (has_block_zeroing()) {
+    if (FLAG_IS_DEFAULT(UseBlockZeroing)) {
+      FLAG_SET_DEFAULT(UseBlockZeroing, true);
+    }
+  } else if (UseBlockZeroing) {
+    warning("BIS zeroing instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseBlockZeroing, false);
+  }
+
 #ifdef COMPILER2
  // T4 and newer Sparc cpus have fast RDPC.
  if (has_fast_rdpc() && FLAG_IS_DEFAULT(UseRDPCForConstantTableBase)) {

--- a/src/cpu/sparc/vm/vm_version_sparc.hpp
+++ b/src/cpu/sparc/vm/vm_version_sparc.hpp
@@ -135,8 +135,8 @@ public:
  // T4 and newer Sparc have fast RDPC instruction.
  static bool has_fast_rdpc()           { return is_T4(); }

-  // T4 and newer Sparc have Most-Recently-Used (MRU) BIS.
-  static bool has_mru_blk_init()        { return has_blk_init() && is_T4(); }
+  // On T4 and newer Sparc BIS to the beginning of cache line always zeros it.
+  static bool has_block_zeroing()       { return has_blk_init() && is_T4(); }

  static const char* cpu_features()     { return _features_str; }


--- a/src/share/vm/gc_interface/collectedHeap.cpp
+++ b/src/share/vm/gc_interface/collectedHeap.cpp
@@ -157,8 +157,14 @@ HeapWord* CollectedHeap::allocate_from_tlab_slow(Thread* thread, size_t size) {
    // ..and clear it.
    Copy::zero_to_words(obj, new_tlab_size);
  } else {
-    // ...and clear just the allocated object.
-    Copy::zero_to_words(obj, size);
+    // ...and zap just allocated object.
+#ifdef ASSERT
+    // Skip mangling the space corresponding to the object header to
+    // ensure that the returned space is not considered parsable by
+    // any concurrent GC thread.
+    size_t hdr_size = oopDesc::header_size();
+    Copy::fill_to_words(obj + hdr_size, new_tlab_size - hdr_size, badHeapWordVal);
+#endif // ASSERT
  }
  thread->tlab().fill(obj, obj + size, new_tlab_size);
  return obj;

--- a/src/share/vm/gc_interface/collectedHeap.inline.hpp
+++ b/src/share/vm/gc_interface/collectedHeap.inline.hpp
@@ -287,7 +287,10 @@ oop CollectedHeap::permanent_obj_allocate_no_klass_install(KlassHandle klass,
  assert(size >= 0, "int won't convert to size_t");
  HeapWord* obj = common_permanent_mem_allocate_init(size, CHECK_NULL);
  post_allocation_setup_no_klass_install(klass, obj, size);
-  NOT_PRODUCT(Universe::heap()->check_for_bad_heap_word_value(obj, size));
+#ifndef PRODUCT
+  const size_t hs = oopDesc::header_size();
+  Universe::heap()->check_for_bad_heap_word_value(obj+hs, size-hs);
+#endif
  return (oop)obj;
 }


--- a/src/share/vm/oops/cpCacheKlass.cpp
+++ b/src/share/vm/oops/cpCacheKlass.cpp
@@ -63,8 +63,10 @@ constantPoolCacheOop constantPoolCacheKlass::allocate(int length,
  //   CollectedHeap::permanent_obj_allocate(klass, size, CHECK_NULL);

  oop obj = CollectedHeap::permanent_obj_allocate_no_klass_install(klass, size, CHECK_NULL);
-  NOT_PRODUCT(Universe::heap()->check_for_bad_heap_word_value((HeapWord*) obj,
-                                                              size));
+#ifndef PRODUCT
+  const size_t hs = oopDesc::header_size();
+  Universe::heap()->check_for_bad_heap_word_value(((HeapWord*) obj)+hs, size-hs);
+#endif
  constantPoolCacheOop cache = (constantPoolCacheOop) obj;
  assert(!UseConcMarkSweepGC || obj->klass_or_null() == NULL,
         "klass should be NULL here when using CMS");

--- a/src/share/vm/runtime/globals.hpp
+++ b/src/share/vm/runtime/globals.hpp
@@ -1979,6 +1979,12 @@ class CommandLineFlags {
  product(bool, TLABStats, true,                                            \
          "Print various TLAB related information")                         \
                                                                            \
+  product(bool, UseBlockZeroing, false,                                     \
+          "Use special cpu instructions for block zeroing")                 \
+                                                                            \
+  product(intx, BlockZeroingLowLimit, 2048,                                 \
+          "Minimum size in bytes when block zeroing will be used")          \
+                                                                            \
  product(bool, PrintRevisitStats, false,                                   \
          "Print revisit (klass and MDO) stack related information")        \
                                                                            \

--- a/src/share/vm/runtime/stubRoutines.cpp
+++ b/src/share/vm/runtime/stubRoutines.cpp
@@ -108,6 +108,7 @@ address StubRoutines::_arrayof_jlong_disjoint_arraycopy  = CAST_FROM_FN_PTR(addr
 address StubRoutines::_arrayof_oop_disjoint_arraycopy    = CAST_FROM_FN_PTR(address, StubRoutines::arrayof_oop_copy);
 address StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit  = CAST_FROM_FN_PTR(address, StubRoutines::arrayof_oop_copy_uninit);

+address StubRoutines::_zero_aligned_words = CAST_FROM_FN_PTR(address, Copy::zero_to_words);

 address StubRoutines::_checkcast_arraycopy               = NULL;
 address StubRoutines::_checkcast_arraycopy_uninit        = NULL;

--- a/src/share/vm/runtime/stubRoutines.hpp
+++ b/src/share/vm/runtime/stubRoutines.hpp
@@ -199,6 +199,9 @@ class StubRoutines: AllStatic {
  static address _arrayof_jshort_fill;
  static address _arrayof_jint_fill;

+  // zero heap space aligned to jlong (8 bytes)
+  static address _zero_aligned_words;
+
  // These are versions of the java.lang.Math methods which perform
  // the same operations as the intrinsic version.  They are used for
  // constant folding in the compiler to ensure equivalence.  If the
@@ -332,6 +335,7 @@ class StubRoutines: AllStatic {

  static address select_fill_function(BasicType t, bool aligned, const char* &name);

+  static address zero_aligned_words()   { return _zero_aligned_words; }

  static double  intrinsic_log(double d) {
    assert(_intrinsic_log != NULL, "must be defined");