From e80d667cefc2a1e8b8ace228c478485e5ea4b1e1 Mon Sep 17 00:00:00 2001 From: kvn Date: Fri, 2 Sep 2011 12:13:33 -0700 Subject: [PATCH] 7039731: arraycopy could use prefetch on SPARC Summary: Use BIS and prefetch in arraycopy stubs for Sparc (BIS for T4 only). Reviewed-by: never, iveresov --- src/cpu/sparc/vm/stubGenerator_sparc.cpp | 334 +++++++++++++++++------ src/cpu/sparc/vm/vm_version_sparc.cpp | 28 ++ src/share/vm/runtime/globals.hpp | 12 + 3 files changed, 284 insertions(+), 90 deletions(-) diff --git a/src/cpu/sparc/vm/stubGenerator_sparc.cpp b/src/cpu/sparc/vm/stubGenerator_sparc.cpp index 9f6274d5a..a7fe854e0 100644 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp @@ -1124,6 +1124,126 @@ class StubGenerator: public StubCodeGenerator { } } + // + // Generate main code for disjoint arraycopy + // + typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec, + Label& L_loop, bool use_prefetch, bool use_bis); + + void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size, + int iter_size, CopyLoopFunc copy_loop_func) { + Label L_copy; + + assert(log2_elem_size <= 3, "the following code should be changed"); + int count_dec = 16>>log2_elem_size; + + int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance); + assert(prefetch_dist < 4096, "invalid value"); + prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size + int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count + + if (UseBlockCopy) { + Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy; + + // 64 bytes tail + bytes copied in one loop iteration + int tail_size = 64 + iter_size; + int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size; + // Use BIS copy only for big arrays since it requires membar. + __ set(block_copy_count, O4); + __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy); + // This code is for disjoint source and destination: + // to <= from || to >= from+count + // but BIS will stomp over 'from' if (to > from-tail_size && to <= from) + __ sub(from, to, O4); + __ srax(O4, 4, O4); // divide by 16 since following short branch have only 5 bits for imm. + __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy); + + __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY); + // BIS should not be used to copy tail (64 bytes+iter_size) + // to avoid zeroing of following values. + __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0 + + if (prefetch_count > 0) { // rounded up to one iteration count + // Do prefetching only if copy size is bigger + // than prefetch distance. + __ set(prefetch_count, O4); + __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy); + __ sub(count, prefetch_count, count); + + (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true); + __ add(count, prefetch_count, count); // restore count + + } // prefetch_count > 0 + + (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true); + __ add(count, (tail_size>>log2_elem_size), count); // restore count + + __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT); + // BIS needs membar. + __ membar(Assembler::StoreLoad); + // Copy tail + __ ba_short(L_copy); + + __ BIND(L_skip_block_copy); + } // UseBlockCopy + + if (prefetch_count > 0) { // rounded up to one iteration count + // Do prefetching only if copy size is bigger + // than prefetch distance. + __ set(prefetch_count, O4); + __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy); + __ sub(count, prefetch_count, count); + + Label L_copy_prefetch; + (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false); + __ add(count, prefetch_count, count); // restore count + + } // prefetch_count > 0 + + (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false); + } + + + + // + // Helper methods for copy_16_bytes_forward_with_shift() + // + void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec, + Label& L_loop, bool use_prefetch, bool use_bis) { + + const Register left_shift = G1; // left shift bit counter + const Register right_shift = G5; // right shift bit counter + + __ align(OptoLoopAlignment); + __ BIND(L_loop); + if (use_prefetch) { + if (ArraycopySrcPrefetchDistance > 0) { + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); + } + if (ArraycopyDstPrefetchDistance > 0) { + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); + } + } + __ ldx(from, 0, O4); + __ ldx(from, 8, G4); + __ inc(to, 16); + __ inc(from, 16); + __ deccc(count, count_dec); // Can we do next iteration after this one? + __ srlx(O4, right_shift, G3); + __ bset(G3, O3); + __ sllx(O4, left_shift, O4); + __ srlx(G4, right_shift, G3); + __ bset(G3, O4); + if (use_bis) { + __ stxa(O3, to, -16); + __ stxa(O4, to, -8); + } else { + __ stx(O3, to, -16); + __ stx(O4, to, -8); + } + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); + __ delayed()->sllx(G4, left_shift, O3); + } // Copy big chunks forward with shift // @@ -1135,64 +1255,51 @@ class StubGenerator: public StubCodeGenerator { // L_copy_bytes - copy exit label // void copy_16_bytes_forward_with_shift(Register from, Register to, - Register count, int count_dec, Label& L_copy_bytes) { - Label L_loop, L_aligned_copy, L_copy_last_bytes; + Register count, int log2_elem_size, Label& L_copy_bytes) { + Label L_aligned_copy, L_copy_last_bytes; + assert(log2_elem_size <= 3, "the following code should be changed"); + int count_dec = 16>>log2_elem_size; // if both arrays have the same alignment mod 8, do 8 bytes aligned copy - __ andcc(from, 7, G1); // misaligned bytes - __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); - __ delayed()->nop(); + __ andcc(from, 7, G1); // misaligned bytes + __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); + __ delayed()->nop(); const Register left_shift = G1; // left shift bit counter const Register right_shift = G5; // right shift bit counter - __ sll(G1, LogBitsPerByte, left_shift); - __ mov(64, right_shift); - __ sub(right_shift, left_shift, right_shift); + __ sll(G1, LogBitsPerByte, left_shift); + __ mov(64, right_shift); + __ sub(right_shift, left_shift, right_shift); // // Load 2 aligned 8-bytes chunks and use one from previous iteration // to form 2 aligned 8-bytes chunks to store. // - __ deccc(count, count_dec); // Pre-decrement 'count' - __ andn(from, 7, from); // Align address - __ ldx(from, 0, O3); - __ inc(from, 8); - __ align(OptoLoopAlignment); - __ BIND(L_loop); - __ ldx(from, 0, O4); - __ deccc(count, count_dec); // Can we do next iteration after this one? - __ ldx(from, 8, G4); - __ inc(to, 16); - __ inc(from, 16); - __ sllx(O3, left_shift, O3); - __ srlx(O4, right_shift, G3); - __ bset(G3, O3); - __ stx(O3, to, -16); - __ sllx(O4, left_shift, O4); - __ srlx(G4, right_shift, G3); - __ bset(G3, O4); - __ stx(O4, to, -8); - __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); - __ delayed()->mov(G4, O3); - - __ inccc(count, count_dec>>1 ); // + 8 bytes - __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); - __ delayed()->inc(count, count_dec>>1); // restore 'count' - - // copy 8 bytes, part of them already loaded in O3 - __ ldx(from, 0, O4); - __ inc(to, 8); - __ inc(from, 8); - __ sllx(O3, left_shift, O3); - __ srlx(O4, right_shift, G3); - __ bset(O3, G3); - __ stx(G3, to, -8); + __ dec(count, count_dec); // Pre-decrement 'count' + __ andn(from, 7, from); // Align address + __ ldx(from, 0, O3); + __ inc(from, 8); + __ sllx(O3, left_shift, O3); + + disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop); + + __ inccc(count, count_dec>>1 ); // + 8 bytes + __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); + __ delayed()->inc(count, count_dec>>1); // restore 'count' + + // copy 8 bytes, part of them already loaded in O3 + __ ldx(from, 0, O4); + __ inc(to, 8); + __ inc(from, 8); + __ srlx(O4, right_shift, G3); + __ bset(O3, G3); + __ stx(G3, to, -8); __ BIND(L_copy_last_bytes); - __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes - __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); - __ delayed()->sub(from, right_shift, from); // restore address + __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes + __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); + __ delayed()->sub(from, right_shift, from); // restore address __ BIND(L_aligned_copy); } @@ -1348,7 +1455,7 @@ class StubGenerator: public StubCodeGenerator { // The compare above (count >= 23) guarantes 'count' >= 16 bytes. // Also jump over aligned copy after the copy with shift completed. - copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte); + copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte); } // Both array are 8 bytes aligned, copy 16 bytes at a time @@ -1576,7 +1683,7 @@ class StubGenerator: public StubCodeGenerator { // The compare above (count >= 11) guarantes 'count' >= 16 bytes. // Also jump over aligned copy after the copy with shift completed. - copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes); + copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes); } // Both array are 8 bytes aligned, copy 16 bytes at a time @@ -1949,6 +2056,45 @@ class StubGenerator: public StubCodeGenerator { return start; } + // + // Helper methods for generate_disjoint_int_copy_core() + // + void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, + Label& L_loop, bool use_prefetch, bool use_bis) { + + __ align(OptoLoopAlignment); + __ BIND(L_loop); + if (use_prefetch) { + if (ArraycopySrcPrefetchDistance > 0) { + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); + } + if (ArraycopyDstPrefetchDistance > 0) { + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); + } + } + __ ldx(from, 4, O4); + __ ldx(from, 12, G4); + __ inc(to, 16); + __ inc(from, 16); + __ deccc(count, 4); // Can we do next iteration after this one? + + __ srlx(O4, 32, G3); + __ bset(G3, O3); + __ sllx(O4, 32, O4); + __ srlx(G4, 32, G3); + __ bset(G3, O4); + if (use_bis) { + __ stxa(O3, to, -16); + __ stxa(O4, to, -8); + } else { + __ stx(O3, to, -16); + __ stx(O4, to, -8); + } + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); + __ delayed()->sllx(G4, 32, O3); + + } + // // Generate core code for disjoint int copy (and oop copy on 32-bit). // If "aligned" is true, the "from" and "to" addresses are assumed @@ -1962,7 +2108,7 @@ class StubGenerator: public StubCodeGenerator { void generate_disjoint_int_copy_core(bool aligned) { Label L_skip_alignment, L_aligned_copy; - Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; + Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; const Register from = O0; // source array address const Register to = O1; // destination array address @@ -2013,30 +2159,16 @@ class StubGenerator: public StubCodeGenerator { // copy with shift 4 elements (16 bytes) at a time __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 + __ sllx(O3, 32, O3); - __ align(OptoLoopAlignment); - __ BIND(L_copy_16_bytes); - __ ldx(from, 4, O4); - __ deccc(count, 4); // Can we do next iteration after this one? - __ ldx(from, 12, G4); - __ inc(to, 16); - __ inc(from, 16); - __ sllx(O3, 32, O3); - __ srlx(O4, 32, G3); - __ bset(G3, O3); - __ stx(O3, to, -16); - __ sllx(O4, 32, O4); - __ srlx(G4, 32, G3); - __ bset(G3, O4); - __ stx(O4, to, -8); - __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); - __ delayed()->mov(G4, O3); + disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop); __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); __ delayed()->inc(count, 4); // restore 'count' __ BIND(L_aligned_copy); - } + } // !aligned + // copy 4 elements (16 bytes) at a time __ and3(count, 1, G4); // Save __ srl(count, 1, count); @@ -2222,6 +2354,38 @@ class StubGenerator: public StubCodeGenerator { return start; } + // + // Helper methods for generate_disjoint_long_copy_core() + // + void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, + Label& L_loop, bool use_prefetch, bool use_bis) { + __ align(OptoLoopAlignment); + __ BIND(L_loop); + for (int off = 0; off < 64; off += 16) { + if (use_prefetch && (off & 31) == 0) { + if (ArraycopySrcPrefetchDistance > 0) { + __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); + } + if (ArraycopyDstPrefetchDistance > 0) { + __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); + } + } + __ ldx(from, off+0, O4); + __ ldx(from, off+8, O5); + if (use_bis) { + __ stxa(O4, to, off+0); + __ stxa(O5, to, off+8); + } else { + __ stx(O4, to, off+0); + __ stx(O5, to, off+8); + } + } + __ deccc(count, 8); + __ inc(from, 64); + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); + __ delayed()->inc(to, 64); + } + // // Generate core code for disjoint long copy (and oop copy on 64-bit). // "aligned" is ignored, because we must make the stronger @@ -2261,38 +2425,28 @@ class StubGenerator: public StubCodeGenerator { const Register offset0 = O4; // element offset const Register offset8 = O5; // next element offset - __ deccc(count, 2); - __ mov(G0, offset0); // offset from start of arrays (0) - __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); - __ delayed()->add(offset0, 8, offset8); + __ deccc(count, 2); + __ mov(G0, offset0); // offset from start of arrays (0) + __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); + __ delayed()->add(offset0, 8, offset8); // Copy by 64 bytes chunks - Label L_copy_64_bytes; + const Register from64 = O3; // source address const Register to64 = G3; // destination address - __ subcc(count, 6, O3); - __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); - __ delayed()->mov(to, to64); - // Now we can use O4(offset0), O5(offset8) as temps - __ mov(O3, count); - __ mov(from, from64); + __ subcc(count, 6, O3); + __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); + __ delayed()->mov(to, to64); + // Now we can use O4(offset0), O5(offset8) as temps + __ mov(O3, count); + // count >= 0 (original count - 8) + __ mov(from, from64); - __ align(OptoLoopAlignment); - __ BIND(L_copy_64_bytes); - for( int off = 0; off < 64; off += 16 ) { - __ ldx(from64, off+0, O4); - __ ldx(from64, off+8, O5); - __ stx(O4, to64, off+0); - __ stx(O5, to64, off+8); - } - __ deccc(count, 8); - __ inc(from64, 64); - __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes); - __ delayed()->inc(to64, 64); + disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop); // Restore O4(offset0), O5(offset8) __ sub(from64, from, offset0); - __ inccc(count, 6); + __ inccc(count, 6); // restore count __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); __ delayed()->add(offset0, 8, offset8); diff --git a/src/cpu/sparc/vm/vm_version_sparc.cpp b/src/cpu/sparc/vm/vm_version_sparc.cpp index 2aae43c28..e1429c68d 100644 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp @@ -75,6 +75,24 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(AllocatePrefetchStyle, 1); } + if (has_v9()) { + assert(ArraycopySrcPrefetchDistance < 4096, "invalid value"); + if (ArraycopySrcPrefetchDistance >= 4096) + ArraycopySrcPrefetchDistance = 4064; + assert(ArraycopyDstPrefetchDistance < 4096, "invalid value"); + if (ArraycopyDstPrefetchDistance >= 4096) + ArraycopyDstPrefetchDistance = 4064; + } else { + if (ArraycopySrcPrefetchDistance > 0) { + warning("prefetch instructions are not available on this CPU"); + FLAG_SET_DEFAULT(ArraycopySrcPrefetchDistance, 0); + } + if (ArraycopyDstPrefetchDistance > 0) { + warning("prefetch instructions are not available on this CPU"); + FLAG_SET_DEFAULT(ArraycopyDstPrefetchDistance, 0); + } + } + UseSSE = 0; // Only on x86 and x64 _supports_cx8 = has_v9(); @@ -180,6 +198,16 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UseBlockZeroing, false); } + assert(BlockCopyLowLimit > 0, "invalid value"); + if (has_block_zeroing()) { // has_blk_init() && is_T4(): core's local L2 cache + if (FLAG_IS_DEFAULT(UseBlockCopy)) { + FLAG_SET_DEFAULT(UseBlockCopy, true); + } + } else if (UseBlockCopy) { + warning("BIS instructions are not available or expensive on this CPU"); + FLAG_SET_DEFAULT(UseBlockCopy, false); + } + #ifdef COMPILER2 // T4 and newer Sparc cpus have fast RDPC. if (has_fast_rdpc() && FLAG_IS_DEFAULT(UseRDPCForConstantTableBase)) { diff --git a/src/share/vm/runtime/globals.hpp b/src/share/vm/runtime/globals.hpp index 905379a72..1f3d086fb 100644 --- a/src/share/vm/runtime/globals.hpp +++ b/src/share/vm/runtime/globals.hpp @@ -1985,6 +1985,12 @@ class CommandLineFlags { product(intx, BlockZeroingLowLimit, 2048, \ "Minimum size in bytes when block zeroing will be used") \ \ + product(bool, UseBlockCopy, false, \ + "Use special cpu instructions for block copy") \ + \ + product(intx, BlockCopyLowLimit, 2048, \ + "Minimum size in bytes when block copy will be used") \ + \ product(bool, PrintRevisitStats, false, \ "Print revisit (klass and MDO) stack related information") \ \ @@ -2918,6 +2924,12 @@ class CommandLineFlags { product(intx, ReadPrefetchInstr, 0, \ "Prefetch instruction to prefetch ahead") \ \ + product(uintx, ArraycopySrcPrefetchDistance, 0, \ + "Distance to prefetch source array in arracopy") \ + \ + product(uintx, ArraycopyDstPrefetchDistance, 0, \ + "Distance to prefetch destination array in arracopy") \ + \ /* deoptimization */ \ develop(bool, TraceDeoptimization, false, \ "Trace deoptimization") \ -- GitLab