From 68f60c445e85a31d6c6a8947ecf4680bb8477520 Mon Sep 17 00:00:00 2001
From: mhorie
Date: Mon, 20 May 2019 16:05:27 -0400
Subject: [PATCH] 8158232: PPC64: improve byte, int and long array copy stubs
 by using VSX instructions

Reviewed-by: goetz, mdoerr
Contributed-by: Kazunori Ogata
---
 src/cpu/ppc/vm/stubGenerator_ppc.cpp | 138 +++++++++++++++++++++++++--
 1 file changed, 132 insertions(+), 6 deletions(-)

diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
index 3d302c4b0..01fd08d4a 100644
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
@@ -1131,8 +1131,11 @@ class StubGenerator: public StubCodeGenerator {
     Register tmp3 = R8_ARG6;
     Register tmp4 = R9_ARG7;
 
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
-    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
 
     // Don't try anything fancy if arrays don't have many elements.
     __ li(tmp3, 0);
     __ cmpwi(CCR0, R5_ARG3, 17);
@@ -1186,6 +1189,8 @@ class StubGenerator: public StubCodeGenerator {
       __ andi_(R5_ARG3, R5_ARG3, 31);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
+
       __ bind(l_8);
       // Use unrolled version for mass copying (copy 32 elements a time)
       // Load feeding store gets zero latency on Power6, however not on Power5.
@@ -1201,7 +1206,44 @@ class StubGenerator: public StubCodeGenerator {
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_8);
-    }
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. Not 16-byte align as
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_10);
+        // Use loop with VSX load/store instructions to
+        // copy 32 elements a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
+        __ bdnz(l_10);                       // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      } // VSX
+    } // FasterArrayCopy
 
     __ bind(l_6);
 
@@ -1570,7 +1612,11 @@ class StubGenerator: public StubCodeGenerator {
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
-    Label l_1, l_2, l_3, l_4, l_5, l_6;
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
+
     // for short arrays, just do single element copy
     __ li(tmp3, 0);
     __ cmpwi(CCR0, R5_ARG3, 5);
@@ -1605,6 +1651,8 @@ class StubGenerator: public StubCodeGenerator {
       __ andi_(R5_ARG3, R5_ARG3, 7);
      __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
+
       __ bind(l_6);
       // Use unrolled version for mass copying (copy 8 elements a time).
       // Load feeding store gets zero latency on power6, however not on power 5.
@@ -1620,7 +1668,44 @@ class StubGenerator: public StubCodeGenerator {
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_6);
-    }
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. Not 16-byte align as
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_7);
+        // Use loop with VSX load/store instructions to
+        // copy 8 elements a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
+        __ bdnz(l_7);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      } // VSX
+    } // FasterArrayCopy
 
     // copy 1 element at a time
     __ bind(l_2);
@@ -1772,7 +1857,10 @@ class StubGenerator: public StubCodeGenerator {
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
-    Label l_1, l_2, l_3, l_4;
+    Label l_1, l_2, l_3, l_4, l_5;
+
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
 
     { // FasterArrayCopy
       __ cmpwi(CCR0, R5_ARG3, 3);
@@ -1782,6 +1870,7 @@ class StubGenerator: public StubCodeGenerator {
       __ andi_(R5_ARG3, R5_ARG3, 3);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
       __ bind(l_4);
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
@@ -1797,7 +1886,44 @@ class StubGenerator: public StubCodeGenerator {
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_4);
-    }
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. Not 16-byte align as
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_5);
+        // Use loop with VSX load/store instructions to
+        // copy 4 elements a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
+        __ bdnz(l_5);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      } // VSX
+    } // FasterArrayCopy
 
     // copy 1 element at a time
     __ bind(l_3);
-- 
GitLab
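
For context only, and not part of the patch itself: below is a minimal C++ sketch of what the VSX loop emitted by the byte-copy stub does at runtime. Each iteration moves 32 bytes as two 16-byte transfers, mirroring the paired lxvd2x/stxvd2x instructions in the hunks above, and any remainder is copied one element at a time, as the stub's tail loop does. The function name and structure are hypothetical and chosen purely for illustration.

#include <cstddef>
#include <cstring>

// Hypothetical illustration of the stub's copy pattern; not HotSpot code.
static void disjoint_byte_copy_sketch(const unsigned char* src,
                                      unsigned char* dst, size_t count) {
  size_t iters = count / 32;                 // like the CTR value: 32 elements per pass
  for (size_t i = 0; i < iters; ++i) {
    std::memcpy(dst,      src,      16);     // analogue of lxvd2x/stxvd2x at offset 0
    std::memcpy(dst + 16, src + 16, 16);     // analogue of lxvd2x/stxvd2x at offset 16
    src += 32;                               // src += 32
    dst += 32;                               // dst += 32
  }
  for (size_t i = 0; i < count % 32; ++i) {  // tail: one element at a time
    dst[i] = src[i];
  }
}

The int and long variants in the patch follow the same shape; only the element size changes (8 ints or 4 longs per 32-byte iteration), so the vector loop body is identical across the three stubs.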