提交 1b55810b 编写于 作者: M mdoerr

8164920: ppc: enhancement of CRC32 intrinsic

Reviewed-by: goetz, mdoerr
Contributed-by: NHiroshi H Horii <horii@jp.ibm.com>
上级 d9bd1ec1
......@@ -468,6 +468,10 @@ class Assembler : public AbstractAssembler {
LVSL_OPCODE = (31u << OPCODE_SHIFT | 6u << 1),
LVSR_OPCODE = (31u << OPCODE_SHIFT | 38u << 1),
// Vector-Scalar (VSX) instruction support.
MTVSRD_OPCODE = (31u << OPCODE_SHIFT | 179u << 1),
MFVSRD_OPCODE = (31u << OPCODE_SHIFT | 51u << 1),
// Vector Permute and Formatting
VPKPX_OPCODE = (4u << OPCODE_SHIFT | 782u ),
VPKSHSS_OPCODE = (4u << OPCODE_SHIFT | 398u ),
......@@ -1938,6 +1942,10 @@ class Assembler : public AbstractAssembler {
inline void mtvscr( VectorRegister b);
inline void mfvscr( VectorRegister d);
// Vector-Scalar (VSX) instructions.
inline void mtvrd( VectorRegister d, Register a);
inline void mfvrd( Register a, VectorRegister d);
// AES (introduced with Power 8)
inline void vcipher( VectorRegister d, VectorRegister a, VectorRegister b);
inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);
......
......@@ -623,6 +623,10 @@ inline void Assembler::stvxl( VectorRegister d, Register s1, Register s2) { emit
inline void Assembler::lvsl( VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
inline void Assembler::lvsr( VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
// Vector-Scalar (VSX) instructions.
// mtvrd/mfvrd emit the mtvsrd/mfvsrd encodings (move a doubleword between a GPR and a
// vector-scalar register). Both OR in the low instruction bit (1u) so that the 5-bit
// register field produced by vrt(d) names VMX/Altivec register d.
// mtvrd: GPR a -> vector register d.
inline void Assembler::mtvrd( VectorRegister d, Register a) { emit_int32( MTVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
// mfvrd: vector register d -> GPR a.
inline void Assembler::mfvrd( Register a, VectorRegister d) { emit_int32( MFVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
inline void Assembler::vpkpx( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE | vrt(d) | vra(a) | vrb(b)); }
inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
inline void Assembler::vpkswss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSWSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
......
......@@ -3423,6 +3423,565 @@ void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len
BLOCK_COMMENT("} kernel_crc32_1byte");
}
/**
 * Emits code that updates a CRC32 value over a byte buffer, using the vector
 * (vpmsum) kernel for the 128-byte-aligned middle part of long inputs.
 *
 * Inputs shorter than 384 bytes are handed to kernel_crc32_1word() as a whole.
 * Longer inputs are split into three parts:
 *   - head:   bytes up to the first 128-byte-aligned address (update_byteLoop_crc32),
 *   - middle: a multiple of 128 bytes (kernel_crc32_1word_aligned),
 *   - tail:   the remaining (len % 128) bytes (update_byteLoop_crc32).
 * On the long path the CRC is inverted before and after processing; on the short
 * path nothing is inverted here and kernel_crc32_1word() is called directly.
 *
 * @param crc             register containing existing CRC (32-bit); receives the result
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes (only the low 32 bits are used)
 * @param table           register pointing to CRC table
 * @param constants       register pointing to the constants table used by the aligned kernel;
 *                        reused as a temp register on the short path
 * @param barretConstants register pointing to table for barrett reduction;
 *                        reused as a temp register on the short path
 * @param t0              volatile register (holds the head/tail byte count on the long path)
 * @param t1              volatile register
 * @param t2              volatile register
 * @param t3              volatile register
 * @param t4              volatile register (only used as a temp on the short path)
 */
void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
                                                Register constants, Register barretConstants,
                                                Register t0, Register t1, Register t2, Register t3, Register t4) {
  assert_different_registers(crc, buf, len, table);

  Label L_alignedHead, L_tail, L_start, L_end;

  // The head count is dead before the tail count is computed, so both share t0.
  // t0 is not passed to kernel_crc32_1word_aligned(), which keeps the tail count alive across it.
  Register prealign  = t0;
  Register postalign = t0;

  BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");

  // 1. Use kernel_crc32_1word for inputs shorter than 384 bytes.
  clrldi(len, len, 32);                 // Keep only the low 32 bits of len.
  cmpdi(CCR0, len, 384);
  bge(CCR0, L_start);

    // The vector-path-only registers are free on this path and serve as temps.
    Register tc0 = t4;
    Register tc1 = constants;
    Register tc2 = barretConstants;
    kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table);
    b(L_end);

  BIND(L_start);

    // 2. ~c
    nand(crc, crc, crc);

    // 3. Calculate from the start of the buffer to the first 128-byte-aligned address.
    clrldi_(prealign, buf, 57);         // prealign = buf % 128; the record form sets CR0.
    beq(CCR0, L_alignedHead);           // Already aligned: no head to process.

    subfic(prealign, prealign, 128);    // Number of bytes up to the next 128-byte boundary.

    subf(len, prealign, len);
    update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);

    // 4. Calculate from the first 128-byte-aligned address to the last 128-byte-aligned address.
    BIND(L_alignedHead);

    clrldi(postalign, len, 57);         // postalign = len % 128 (tail bytes).
    subf(len, postalign, len);          // len is now a multiple of 128.

    // len is at least 256 bytes here: it was >= 384 and at most 127 bytes
    // were consumed by the head before rounding down to a multiple of 128.
    kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);

    // 5. Calculate the remaining tail bytes, if any.
    cmpdi(CCR0, postalign, 0);
    beq(CCR0, L_tail);

    update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);

    BIND(L_tail);

    // 6. ~c
    nand(crc, crc, crc);

  BIND(L_end);

  BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
}
/**
 * Emits the vpmsum-based CRC32 kernel. The input is consumed in blocks of at
 * most 32768 bytes, 128 bytes (eight 16-byte vectors) per main-loop iteration,
 * followed by a reduction of the eight partial results and a Barrett reduction
 * down to 32 bits.
 *
 * The visible caller passes a 128-byte-aligned buf and a len that is a multiple
 * of 128 and at least 256 bytes; all vector loads use lvx on buf.
 * NOTE(review): behavior for len < 256 is not handled here - confirm with callers.
 *
 * Side effects visible in this block:
 *   - buf is advanced past the consumed data,
 *   - len is reduced to (len & 127),
 *   - barretConstants is advanced by 16,
 *   - VR0-VR19 are used as scratch without being saved,
 *   - VR20-VR28 and R22-R31 are saved at negative offsets from R1_SP (no frame
 *     is pushed) and restored before the emitted code ends.
 *
 * @param crc             register containing existing CRC (32-bit); receives the result
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes
 * @param constants       register pointing to the table of folding constants
 * @param barretConstants register pointing to table for barrett reduction
 * @param t0              volatile register
 * @param t1              volatile register
 * @param t2              volatile register
 */
void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
    Register constants, Register barretConstants, Register t0, Register t1, Register t2) {

  Label L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR;
  Label L_1, L_2, L_3, L_4;

  Register rLoaded      = t0;
  Register rTmp1        = t1;
  Register rTmp2        = t2;
  Register off16        = R22;
  Register off32        = R23;
  Register off48        = R24;
  Register off64        = R25;
  Register off80        = R26;
  Register off96        = R27;
  Register off112       = R28;
  Register rIdx         = R29;
  Register rMax         = R30;
  Register constantsPos = R31;

  VectorRegister mask_32bit = VR24;
  VectorRegister mask_64bit = VR25;
  VectorRegister zeroes     = VR26;
  VectorRegister const1     = VR27;
  VectorRegister const2     = VR28;

  // Save non-volatile vector registers (frameless).
  // 'offset' shares t1 with rTmp1; it is only needed in the save and restore sequences.
  // offsetInt mirrors the running offset so the GPR slots follow the vector slots.
  Register offset = t1; int offsetInt = 0;
  offsetInt -= 16; li(offset, -16);           stvx(VR20, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
  // Save the non-volatile general purpose registers used below.
  offsetInt -= 8; std(R22, offsetInt, R1_SP);
  offsetInt -= 8; std(R23, offsetInt, R1_SP);
  offsetInt -= 8; std(R24, offsetInt, R1_SP);
  offsetInt -= 8; std(R25, offsetInt, R1_SP);
  offsetInt -= 8; std(R26, offsetInt, R1_SP);
  offsetInt -= 8; std(R27, offsetInt, R1_SP);
  offsetInt -= 8; std(R28, offsetInt, R1_SP);
  offsetInt -= 8; std(R29, offsetInt, R1_SP);
  offsetInt -= 8; std(R30, offsetInt, R1_SP);
  offsetInt -= 8; std(R31, offsetInt, R1_SP);

  // Set constants
  li(off16, 16);
  li(off32, 32);
  li(off48, 48);
  li(off64, 64);
  li(off80, 80);
  li(off96, 96);
  li(off112, 112);

  clrldi(crc, crc, 32);

  vxor(zeroes, zeroes, zeroes);
  vspltisw(VR0, -1);

  vsldoi(mask_32bit, zeroes, VR0, 4);
  vsldoi(mask_64bit, zeroes, VR0, -8);

  // Get the initial value into VR8
  vxor(VR8, VR8, VR8);
  mtvrd(VR8, crc);
  vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits

  li (rLoaded, 0);

  rldicr(rIdx, len, 0, 56);     // rIdx = len with the low 7 bits cleared

  {
    BIND(L_1);
    // Checksum in blocks of MAX_SIZE (32768)
    lis(rMax, 0);
    ori(rMax, rMax, 32768);
    mr(rTmp2, rMax);
    cmpd(CCR0, rIdx, rMax);
    bgt(CCR0, L_2);
    mr(rMax, rIdx);

    BIND(L_2);
    subf(rIdx, rMax, rIdx);

    // our main loop does 128 bytes at a time
    srdi(rMax, rMax, 7);

    /*
     * Work out the offset into the constants table to start at. Each
     * constant is 16 bytes, and it is used against 128 bytes of input
     * data - 128 / 16 = 8
     */
    sldi(rTmp1, rMax, 4);
    srdi(rTmp2, rTmp2, 3);
    subf(rTmp1, rTmp1, rTmp2);

    // We reduce our final 128 bytes in a separate step
    addi(rMax, rMax, -1);
    mtctr(rMax);

    // Find the start of our constants
    add(constantsPos, constants, rTmp1);

    // zero VR0-VR7 which will contain our checksums
    vxor(VR0, VR0, VR0);
    vxor(VR1, VR1, VR1);
    vxor(VR2, VR2, VR2);
    vxor(VR3, VR3, VR3);
    vxor(VR4, VR4, VR4);
    vxor(VR5, VR5, VR5);
    vxor(VR6, VR6, VR6);
    vxor(VR7, VR7, VR7);

    lvx(const1, constantsPos);

    /*
     * If we are looping back to consume more data we use the values
     * already in VR16-VR23.
     */
    cmpdi(CCR0, rLoaded, 1);
    beq(CCR0, L_3);
    {

      // First warm up pass
      lvx(VR16, buf);
      lvx(VR17, off16, buf);
      lvx(VR18, off32, buf);
      lvx(VR19, off48, buf);
      lvx(VR20, off64, buf);
      lvx(VR21, off80, buf);
      lvx(VR22, off96, buf);
      lvx(VR23, off112, buf);
      addi(buf, buf, 8*16);

      // xor in initial value
      vxor(VR16, VR16, VR8);
    }

    BIND(L_3);
    bdz(L_first_warm_up_done);

    addi(constantsPos, constantsPos, 16);
    lvx(const2, constantsPos);

    // Second warm up pass
    vpmsumd(VR8, VR16, const1);
    lvx(VR16, buf);

    vpmsumd(VR9, VR17, const1);
    lvx(VR17, off16, buf);

    vpmsumd(VR10, VR18, const1);
    lvx(VR18, off32, buf);

    vpmsumd(VR11, VR19, const1);
    lvx(VR19, off48, buf);

    vpmsumd(VR12, VR20, const1);
    lvx(VR20, off64, buf);

    vpmsumd(VR13, VR21, const1);
    lvx(VR21, off80, buf);

    vpmsumd(VR14, VR22, const1);
    lvx(VR22, off96, buf);

    vpmsumd(VR15, VR23, const1);
    lvx(VR23, off112, buf);

    addi(buf, buf, 8 * 16);

    bdz(L_first_cool_down);

    /*
     * main loop. We modulo schedule it such that it takes three iterations
     * to complete - first iteration load, second iteration vpmsum, third
     * iteration xor.
     */
    {
      BIND(L_4);
      lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);

      vxor(VR0, VR0, VR8);
      vpmsumd(VR8, VR16, const2);
      lvx(VR16, buf);

      vxor(VR1, VR1, VR9);
      vpmsumd(VR9, VR17, const2);
      lvx(VR17, off16, buf);

      vxor(VR2, VR2, VR10);
      vpmsumd(VR10, VR18, const2);
      lvx(VR18, off32, buf);

      vxor(VR3, VR3, VR11);
      vpmsumd(VR11, VR19, const2);
      lvx(VR19, off48, buf);
      lvx(const2, constantsPos);

      vxor(VR4, VR4, VR12);
      vpmsumd(VR12, VR20, const1);
      lvx(VR20, off64, buf);

      vxor(VR5, VR5, VR13);
      vpmsumd(VR13, VR21, const1);
      lvx(VR21, off80, buf);

      vxor(VR6, VR6, VR14);
      vpmsumd(VR14, VR22, const1);
      lvx(VR22, off96, buf);

      vxor(VR7, VR7, VR15);
      vpmsumd(VR15, VR23, const1);
      lvx(VR23, off112, buf);

      addi(buf, buf, 8 * 16);

      bdnz(L_4);
    }

    BIND(L_first_cool_down);

    // First cool down pass
    lvx(const1, constantsPos);
    addi(constantsPos, constantsPos, 16);

    vxor(VR0, VR0, VR8);
    vpmsumd(VR8, VR16, const1);

    vxor(VR1, VR1, VR9);
    vpmsumd(VR9, VR17, const1);

    vxor(VR2, VR2, VR10);
    vpmsumd(VR10, VR18, const1);

    vxor(VR3, VR3, VR11);
    vpmsumd(VR11, VR19, const1);

    vxor(VR4, VR4, VR12);
    vpmsumd(VR12, VR20, const1);

    vxor(VR5, VR5, VR13);
    vpmsumd(VR13, VR21, const1);

    vxor(VR6, VR6, VR14);
    vpmsumd(VR14, VR22, const1);

    vxor(VR7, VR7, VR15);
    vpmsumd(VR15, VR23, const1);

    BIND(L_second_cool_down);
    // Second cool down pass
    vxor(VR0, VR0, VR8);
    vxor(VR1, VR1, VR9);
    vxor(VR2, VR2, VR10);
    vxor(VR3, VR3, VR11);
    vxor(VR4, VR4, VR12);
    vxor(VR5, VR5, VR13);
    vxor(VR6, VR6, VR14);
    vxor(VR7, VR7, VR15);

    /*
     * vpmsumd produces a 96 bit result in the least significant bits
     * of the register. Since we are bit reflected we have to shift it
     * left 32 bits so it occupies the least significant bits in the
     * bit reflected domain.
     */
    vsldoi(VR0, VR0, zeroes, 4);
    vsldoi(VR1, VR1, zeroes, 4);
    vsldoi(VR2, VR2, zeroes, 4);
    vsldoi(VR3, VR3, zeroes, 4);
    vsldoi(VR4, VR4, zeroes, 4);
    vsldoi(VR5, VR5, zeroes, 4);
    vsldoi(VR6, VR6, zeroes, 4);
    vsldoi(VR7, VR7, zeroes, 4);

    // xor with last 1024 bits
    lvx(VR8, buf);
    lvx(VR9, off16, buf);
    lvx(VR10, off32, buf);
    lvx(VR11, off48, buf);
    lvx(VR12, off64, buf);
    lvx(VR13, off80, buf);
    lvx(VR14, off96, buf);
    lvx(VR15, off112, buf);
    addi(buf, buf, 8 * 16);

    vxor(VR16, VR0, VR8);
    vxor(VR17, VR1, VR9);
    vxor(VR18, VR2, VR10);
    vxor(VR19, VR3, VR11);
    vxor(VR20, VR4, VR12);
    vxor(VR21, VR5, VR13);
    vxor(VR22, VR6, VR14);
    vxor(VR23, VR7, VR15);

    // Loop back for the next block while input remains; the 128 bytes just
    // folded into VR16-VR23 then act as the already loaded first warm up pass.
    li(rLoaded, 1);
    cmpdi(CCR0, rIdx, 0);
    addi(rIdx, rIdx, 128);
    bne(CCR0, L_1);
  }

  // Work out how many bytes we have left
  andi_(len, len, 127);

  // Calculate where in the constant table we need to start
  subfic(rTmp1, len, 128);
  add(constantsPos, constantsPos, rTmp1);

  // How many 16 byte chunks are in the tail
  srdi(rIdx, len, 4);
  mtctr(rIdx);

  /*
   * Reduce the previously calculated 1024 bits to 64 bits, shifting
   * 32 bits to include the trailing 32 bits of zeros
   */
  lvx(VR0, constantsPos);
  lvx(VR1, off16, constantsPos);
  lvx(VR2, off32, constantsPos);
  lvx(VR3, off48, constantsPos);
  lvx(VR4, off64, constantsPos);
  lvx(VR5, off80, constantsPos);
  lvx(VR6, off96, constantsPos);
  lvx(VR7, off112, constantsPos);
  addi(constantsPos, constantsPos, 8 * 16);

  vpmsumw(VR0, VR16, VR0);
  vpmsumw(VR1, VR17, VR1);
  vpmsumw(VR2, VR18, VR2);
  vpmsumw(VR3, VR19, VR3);
  vpmsumw(VR4, VR20, VR4);
  vpmsumw(VR5, VR21, VR5);
  vpmsumw(VR6, VR22, VR6);
  vpmsumw(VR7, VR23, VR7);

  // Now reduce the tail (0 - 112 bytes)
  // CTR holds the number of 16 byte chunks. The zero case is tested explicitly;
  // after each chunk bdz decrements CTR and leaves once all chunks are done.
  // (A repeated beq on the unchanged CR0 would never leave early.)
  cmpdi(CCR0, rIdx, 0);
  beq(CCR0, L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off16, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off32, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off48,constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off64, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off80, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off96, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);

  // Now xor all the parallel chunks together
  BIND(L_XOR);
  vxor(VR0, VR0, VR1);
  vxor(VR2, VR2, VR3);
  vxor(VR4, VR4, VR5);
  vxor(VR6, VR6, VR7);

  vxor(VR0, VR0, VR2);
  vxor(VR4, VR4, VR6);

  vxor(VR0, VR0, VR4);

  b(L_barrett_reduction);

  // Reached when a block has only two 128 byte groups: there is no second
  // warm up pass and no main loop, so multiply the loaded data once and
  // continue with the second cool down pass.
  BIND(L_first_warm_up_done);
  lvx(const1, constantsPos);
  addi(constantsPos, constantsPos, 16);
  vpmsumd(VR8, VR16, const1);
  vpmsumd(VR9, VR17, const1);
  vpmsumd(VR10, VR18, const1);
  vpmsumd(VR11, VR19, const1);
  vpmsumd(VR12, VR20, const1);
  vpmsumd(VR13, VR21, const1);
  vpmsumd(VR14, VR22, const1);
  vpmsumd(VR15, VR23, const1);
  b(L_second_cool_down);

  BIND(L_barrett_reduction);

  lvx(const1, barretConstants);
  addi(barretConstants, barretConstants, 16);
  lvx(const2, barretConstants);

  vsldoi(VR1, VR0, VR0, -8);
  vxor(VR0, VR0, VR1);    // xor two 64 bit results together

  // shift left one bit
  vspltisb(VR1, 1);
  vsl(VR0, VR0, VR1);

  vand(VR0, VR0, mask_64bit);

  /*
   * The reflected version of Barrett reduction. Instead of bit
   * reflecting our data (which is expensive to do), we bit reflect our
   * constants and our algorithm, which means the intermediate data in
   * our vector registers goes from 0-63 instead of 63-0. We can reflect
   * the algorithm because we don't carry in mod 2 arithmetic.
   */
  vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
  vpmsumd(VR1, VR1, const1);   // ma
  vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
  vpmsumd(VR1, VR1, const2);   // qn
  vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)

  /*
   * Since we are bit reflected, the result (ie the low 32 bits) is in
   * the high 32 bits. We just need to shift it left 4 bytes
   * V0 [ 0 1 X 3 ]
   * V0 [ 0 X 2 3 ]
   */
  vsldoi(VR0, VR0, zeroes, 4);    // shift result left 4 bytes so mfvrd below can read it

  // Get it into the crc register
  mfvrd(crc, VR0);

  BIND(L_end);

  offsetInt = 0;
  // Restore non-volatile Vector registers (frameless).
  offsetInt -= 16; li(offset, -16);           lvx(VR20, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
  // Restore the non-volatile general purpose registers.
  offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
}
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
......
......@@ -656,6 +656,13 @@ class MacroAssembler: public Assembler {
Register tc0, Register tc1, Register tc2, Register tc3);
void kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3);
void kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2, Register t3, Register t4);
void kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2);
void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp);
//
......
......@@ -2482,9 +2482,7 @@ class StubGenerator: public StubCodeGenerator {
* R5_ARG3 - int length (of buffer)
*
* scratch:
* R6_ARG4 - crc table address
* R7_ARG5 - tmp1
* R8_ARG6 - tmp2
* R2, R6-R12
*
* Output:
* R3_RET - int crc result
......@@ -2496,28 +2494,62 @@ class StubGenerator: public StubCodeGenerator {
address start = __ function_entry(); // Remember stub start address (is rtn value).
// arguments to kernel_crc32:
Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
Register data = R4_ARG2; // source byte array
Register dataLen = R5_ARG3; // #bytes to process
Register table = R6_ARG4; // crc table address
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
const Register data = R4_ARG2; // source byte array
const Register dataLen = R5_ARG3; // #bytes to process
Register t0 = R9; // work reg for kernel* emitters
Register t1 = R10; // work reg for kernel* emitters
Register t2 = R11; // work reg for kernel* emitters
Register t3 = R12; // work reg for kernel* emitters
const Register table = R6; // crc table address
#ifdef VM_LITTLE_ENDIAN
if (VM_Version::has_vpmsumb()) {
const Register constants = R2; // constants address
const Register bconstants = R8; // barret table address
const Register t0 = R9;
const Register t1 = R10;
const Register t2 = R11;
const Register t3 = R12;
const Register t4 = R7;
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
__ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
BLOCK_COMMENT("} Stub body");
} else
#endif
{
const Register t0 = R2;
const Register t1 = R7;
const Register t2 = R8;
const Register t3 = R9;
const Register tc0 = R10;
const Register tc1 = R11;
const Register tc2 = R12;
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
__ kernel_crc32_1byte(crc, data, dataLen, table, t0, t1, t2, t3);
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
BLOCK_COMMENT("} Stub body");
}
return start;
}
......
......@@ -45,6 +45,8 @@ enum platform_dependent_constants {
#else
#define CRC32_TABLES 1
#endif
#define CRC32_CONSTANTS_SIZE 1084
#define CRC32_BARRET_CONSTANTS 10
class ppc64 {
friend class StubGenerator;
......@@ -53,11 +55,17 @@ class ppc64 {
// CRC32 Intrinsics.
static juint _crc_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
static juint* _constants;
static juint* _barret_constants;
public:
// CRC32 Intrinsics.
static void generate_load_crc_table_addr(MacroAssembler* masm, Register table);
static void generate_load_crc_constants_addr(MacroAssembler* masm, Register table);
static void generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table);
static juint* generate_crc_constants();
static juint* generate_crc_barret_constants();
};
......
......@@ -102,7 +102,7 @@ void VM_Version::initialize() {
// Create and print feature-string.
char buf[(num_features+1) * 16]; // Max 16 chars per feature.
jio_snprintf(buf, sizeof(buf),
"ppc64%s%s%s%s%s%s%s%s%s",
"ppc64%s%s%s%s%s%s%s%s%s%s",
(has_fsqrt() ? " fsqrt" : ""),
(has_isel() ? " isel" : ""),
(has_lxarxeh() ? " lxarxeh" : ""),
......@@ -112,7 +112,8 @@ void VM_Version::initialize() {
(has_popcntw() ? " popcntw" : ""),
(has_fcfids() ? " fcfids" : ""),
(has_vand() ? " vand" : ""),
(has_vcipher() ? " aes" : "")
(has_vcipher() ? " aes" : ""),
(has_vpmsumb() ? " vpmsumb" : "")
// Make sure number of %s matches num_features!
);
_features_str = strdup(buf);
......@@ -485,6 +486,7 @@ void VM_Version::determine_features() {
a->fcfids(F3, F4); // code[8] -> fcfids
a->vand(VR0, VR0, VR0); // code[9] -> vand
a->vcipher(VR0, VR1, VR2); // code[10] -> vcipher
a->vpmsumb(VR0, VR1, VR2); // code[11] -> vpmsumb
a->blr();
// Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
......@@ -529,6 +531,7 @@ void VM_Version::determine_features() {
if (code[feature_cntr++]) features |= fcfids_m;
if (code[feature_cntr++]) features |= vand_m;
if (code[feature_cntr++]) features |= vcipher_m;
if (code[feature_cntr++]) features |= vpmsumb_m;
// Print the detection code.
if (PrintAssembly) {
......
......@@ -43,6 +43,7 @@ protected:
vand,
dcba,
vcipher,
vpmsumb,
num_features // last entry to count features
};
enum Feature_Flag_Set {
......@@ -58,6 +59,7 @@ protected:
vand_m = (1 << vand ),
dcba_m = (1 << dcba ),
vcipher_m = (1 << vcipher),
vpmsumb_m = (1 << vpmsumb),
all_features_m = -1
};
static int _features;
......@@ -86,6 +88,7 @@ public:
static bool has_vand() { return (_features & vand_m) != 0; }
static bool has_dcba() { return (_features & dcba_m) != 0; }
static bool has_vcipher() { return (_features & vcipher_m) != 0; }
static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
static const char* cpu_features() { return _features_str; }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册