提交 68f60c44 编写于 作者: M mhorie

8158232: PPC64: improve byte, int and long array copy stubs by using VSX instructions

Reviewed-by: goetz, mdoerr
Contributed-by: NKazunori Ogata <ogatak@jp.ibm.com>
上级 b314bffc
......@@ -1131,8 +1131,11 @@ class StubGenerator: public StubCodeGenerator {
Register tmp3 = R8_ARG6;
Register tmp4 = R9_ARG7;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
// Don't try anything fancy if arrays don't have many elements.
__ li(tmp3, 0);
__ cmpwi(CCR0, R5_ARG3, 17);
......@@ -1186,6 +1189,8 @@ class StubGenerator: public StubCodeGenerator {
__ andi_(R5_ARG3, R5_ARG3, 31);
__ mtctr(tmp1);
if (!VM_Version::has_vsx()) {
__ bind(l_8);
// Use unrolled version for mass copying (copy 32 elements a time)
// Load feeding store gets zero latency on Power6, however not on Power5.
......@@ -1201,8 +1206,45 @@ class StubGenerator: public StubCodeGenerator {
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_8);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
__ align(32);
__ bind(l_10);
// Use loop with VSX load/store instructions to
// copy 32 elements a time.
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
__ bdnz(l_10); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // VSX
} // FasterArrayCopy
__ bind(l_6);
// copy 4 elements at a time
......@@ -1570,7 +1612,11 @@ class StubGenerator: public StubCodeGenerator {
Register tmp3 = R8_ARG6;
Register tmp4 = R0;
Label l_1, l_2, l_3, l_4, l_5, l_6;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
// for short arrays, just do single element copy
__ li(tmp3, 0);
__ cmpwi(CCR0, R5_ARG3, 5);
......@@ -1605,6 +1651,8 @@ class StubGenerator: public StubCodeGenerator {
__ andi_(R5_ARG3, R5_ARG3, 7);
__ mtctr(tmp1);
if (!VM_Version::has_vsx()) {
__ bind(l_6);
// Use unrolled version for mass copying (copy 8 elements a time).
// Load feeding store gets zero latency on power6, however not on power 5.
......@@ -1620,8 +1668,45 @@ class StubGenerator: public StubCodeGenerator {
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_6);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
__ align(32);
__ bind(l_7);
// Use loop with VSX load/store instructions to
// copy 8 elements a time.
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
__ bdnz(l_7); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // VSX
} // FasterArrayCopy
// copy 1 element at a time
__ bind(l_2);
__ cmpwi(CCR0, R5_ARG3, 0);
......@@ -1772,7 +1857,10 @@ class StubGenerator: public StubCodeGenerator {
Register tmp3 = R8_ARG6;
Register tmp4 = R0;
Label l_1, l_2, l_3, l_4;
Label l_1, l_2, l_3, l_4, l_5;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
{ // FasterArrayCopy
__ cmpwi(CCR0, R5_ARG3, 3);
......@@ -1782,6 +1870,7 @@ class StubGenerator: public StubCodeGenerator {
__ andi_(R5_ARG3, R5_ARG3, 3);
__ mtctr(tmp1);
if (!VM_Version::has_vsx()) {
__ bind(l_4);
// Use unrolled version for mass copying (copy 4 elements a time).
// Load feeding store gets zero latency on Power6, however not on Power5.
......@@ -1797,8 +1886,45 @@ class StubGenerator: public StubCodeGenerator {
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_4);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
__ align(32);
__ bind(l_5);
// Use loop with VSX load/store instructions to
// copy 4 elements a time.
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
__ bdnz(l_5); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // VSX
} // FasterArrayCopy
// copy 1 element at a time
__ bind(l_3);
__ cmpwi(CCR0, R5_ARG3, 0);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册