Commit 1b55810b authored by mdoerr

8164920: ppc: enhancement of CRC32 intrinsic

Reviewed-by: goetz, mdoerr
Contributed-by: Hiroshi H Horii <horii@jp.ibm.com>
Parent d9bd1ec1
@@ -468,6 +468,10 @@ class Assembler : public AbstractAssembler {
LVSL_OPCODE = (31u << OPCODE_SHIFT | 6u << 1),
LVSR_OPCODE = (31u << OPCODE_SHIFT | 38u << 1),
// Vector-Scalar (VSX) instruction support.
MTVSRD_OPCODE = (31u << OPCODE_SHIFT | 179u << 1),
MFVSRD_OPCODE = (31u << OPCODE_SHIFT | 51u << 1),
// Vector Permute and Formatting
VPKPX_OPCODE = (4u << OPCODE_SHIFT | 782u ),
VPKSHSS_OPCODE = (4u << OPCODE_SHIFT | 398u ),
@@ -1938,6 +1942,10 @@ class Assembler : public AbstractAssembler {
inline void mtvscr( VectorRegister b);
inline void mfvscr( VectorRegister d);
// Vector-Scalar (VSX) instructions.
inline void mtvrd( VectorRegister d, Register a);
inline void mfvrd( Register a, VectorRegister d);
// AES (introduced with Power 8)
inline void vcipher( VectorRegister d, VectorRegister a, VectorRegister b);
inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);
@@ -623,6 +623,10 @@ inline void Assembler::stvxl( VectorRegister d, Register s1, Register s2) { emit
inline void Assembler::lvsl( VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
inline void Assembler::lvsr( VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
// Vector-Scalar (VSX) instructions.
inline void Assembler::mtvrd( VectorRegister d, Register a) { emit_int32( MTVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
inline void Assembler::mfvrd( Register a, VectorRegister d) { emit_int32( MFVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
inline void Assembler::vpkpx( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE | vrt(d) | vra(a) | vrb(b)); }
inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
inline void Assembler::vpkswss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSWSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
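The `| 1u` in the new `mtvrd`/`mfvrd` emitters above sets the TX/SX bit of the XX1 instruction form; with that bit set, the VSX target/source is VSR 32+T, which aliases the VMX/Altivec register VR T, so the GPR value moves directly to or from a VectorRegister. Below is a minimal stand-alone sketch of that encoding, not HotSpot code: the field shifts are written out by hand instead of using the assembler's `vrt()`/`ra()` helpers, and the printed value is an assumption to be checked against the ISA.

```cpp
// Illustrative encoder for mtvrd (mtvsrd with TX = 1).
#include <cstdint>
#include <cstdio>

constexpr uint32_t OPCODE_SHIFT  = 26;
constexpr uint32_t MTVSRD_OPCODE = (31u << OPCODE_SHIFT) | (179u << 1);

// XX1 form: bits 6-10 = T, bits 11-15 = RA, bits 21-30 = XO, bit 31 = TX.
// TX = 1 selects VSR 32+T, i.e. the Altivec register VR T.
constexpr uint32_t encode_mtvrd(uint32_t vr, uint32_t gpr) {
  return MTVSRD_OPCODE | (vr << 21) | (gpr << 16) | 1u /* TX */;
}

int main() {
  std::printf("0x%08x\n", encode_mtvrd(8, 3));  // mtvrd VR8, R3 -> expect 0x7d030167 (mtvsrd vs40,r3)
  return 0;
}
```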
@@ -3423,6 +3423,565 @@ void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len
BLOCK_COMMENT("} kernel_crc32_1byte");
}
/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes
* @param table register pointing to CRC table
* @param constants register pointing to CRC table for 128-bit aligned memory
* @param barretConstants register pointing to the table for Barrett reduction
* @param t0 volatile register
* @param t1 volatile register
* @param t2 volatile register
* @param t3 volatile register
* @param t4 volatile register
*/
void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2, Register t3, Register t4) {
assert_different_registers(crc, buf, len, table);
Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
Register prealign = t0;
Register postalign = t0;
BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
// 1. use kernel_crc32_1word for inputs shorter than 384 bytes
clrldi(len, len, 32);
cmpdi(CCR0, len, 384);
bge(CCR0, L_start);
Register tc0 = t4;
Register tc1 = constants;
Register tc2 = barretConstants;
kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
b(L_end);
BIND(L_start);
// 2. ~c
nand(crc, crc, crc);
// 3. calculate from 0 to the first 128-byte-aligned address
clrldi_(prealign, buf, 57);
beq(CCR0, L_alignedHead);
subfic(prealign, prealign, 128);
subf(len, prealign, len);
update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
// 4. calculate from the first 128-byte-aligned address to the last 128-byte-aligned address
BIND(L_alignedHead);
clrldi(postalign, len, 57);
subf(len, postalign, len);
// len must be at least 256 bytes (and a multiple of 128) here
kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
// 5. calculate remaining
cmpdi(CCR0, postalign, 0);
beq(CCR0, L_tail);
update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
BIND(L_tail);
// 6. ~c
nand(crc, crc, crc);
BIND(L_end);
BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
}
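For orientation, the block structure emitted above corresponds to the scalar outline below. This is only a sketch: `crc32_bytewise`, `crc32_vector_aligned` and `crc32_1word` are hypothetical stand-ins for `update_byteLoop_crc32`, `kernel_crc32_1word_aligned` and `kernel_crc32_1word`, and the 384-byte cutoff mirrors the `cmpdi` above.

```cpp
#include <stddef.h>
#include <stdint.h>

// Byte-at-a-time CRC-32 (reflected polynomial), operating on a pre-inverted crc.
static uint32_t crc32_bytewise(uint32_t c, const uint8_t* p, size_t n) {
  while (n--) {
    c ^= *p++;
    for (int k = 0; k < 8; k++) c = (c >> 1) ^ (0xEDB88320u & (0u - (c & 1u)));
  }
  return c;
}

// Stand-ins for the macro-assembler kernels (declarations only).
uint32_t crc32_vector_aligned(uint32_t c, const uint8_t* p, size_t n);  // p 128-byte aligned, n % 128 == 0
uint32_t crc32_1word(uint32_t crc, const uint8_t* p, size_t n);         // handles its own inversion

uint32_t crc32_vpmsumd(uint32_t crc, const uint8_t* buf, size_t len) {
  if (len < 384)                                     // 1. short input: word-at-a-time kernel
    return crc32_1word(crc, buf, len);
  crc = ~crc;                                        // 2. invert once up front
  size_t prealign = (uintptr_t)buf & 127;
  if (prealign) {                                    // 3. bytes up to the first 128-byte boundary
    prealign = 128 - prealign;
    crc = crc32_bytewise(crc, buf, prealign);
    buf += prealign; len -= prealign;
  }
  size_t postalign = len & 127;                      // 4. aligned middle, a multiple of 128 bytes
  crc = crc32_vector_aligned(crc, buf, len - postalign);
  buf += len - postalign;
  if (postalign)                                     // 5. leftover tail bytes
    crc = crc32_bytewise(crc, buf, postalign);
  return ~crc;                                       // 6. final inversion
}
```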
/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes
* @param constants register pointing to CRC table for 128-bit aligned memory
* @param barretConstants register pointing to the table for Barrett reduction
* @param t0 volatile register
* @param t1 volatile register
* @param t2 volatile register
*/
void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
Label L_1, L_2, L_3, L_4;
Register rLoaded = t0;
Register rTmp1 = t1;
Register rTmp2 = t2;
Register off16 = R22;
Register off32 = R23;
Register off48 = R24;
Register off64 = R25;
Register off80 = R26;
Register off96 = R27;
Register off112 = R28;
Register rIdx = R29;
Register rMax = R30;
Register constantsPos = R31;
VectorRegister mask_32bit = VR24;
VectorRegister mask_64bit = VR25;
VectorRegister zeroes = VR26;
VectorRegister const1 = VR27;
VectorRegister const2 = VR28;
// Save non-volatile vector registers (frameless).
Register offset = t1; int offsetInt = 0;
offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
offsetInt -= 8; std(R22, offsetInt, R1_SP);
offsetInt -= 8; std(R23, offsetInt, R1_SP);
offsetInt -= 8; std(R24, offsetInt, R1_SP);
offsetInt -= 8; std(R25, offsetInt, R1_SP);
offsetInt -= 8; std(R26, offsetInt, R1_SP);
offsetInt -= 8; std(R27, offsetInt, R1_SP);
offsetInt -= 8; std(R28, offsetInt, R1_SP);
offsetInt -= 8; std(R29, offsetInt, R1_SP);
offsetInt -= 8; std(R30, offsetInt, R1_SP);
offsetInt -= 8; std(R31, offsetInt, R1_SP);
// Set constants
li(off16, 16);
li(off32, 32);
li(off48, 48);
li(off64, 64);
li(off80, 80);
li(off96, 96);
li(off112, 112);
clrldi(crc, crc, 32);
vxor(zeroes, zeroes, zeroes);
vspltisw(VR0, -1);
vsldoi(mask_32bit, zeroes, VR0, 4);
vsldoi(mask_64bit, zeroes, VR0, -8);
// Get the initial value into v8
vxor(VR8, VR8, VR8);
mtvrd(VR8, crc);
vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits
li (rLoaded, 0);
rldicr(rIdx, len, 0, 56);
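// rIdx = len rounded down to a multiple of 128: the amount processed in 128-byte blocks below.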
{
BIND(L_1);
// Checksum in blocks of MAX_SIZE (32768)
lis(rMax, 0);
ori(rMax, rMax, 32768);
mr(rTmp2, rMax);
cmpd(CCR0, rIdx, rMax);
bgt(CCR0, L_2);
mr(rMax, rIdx);
BIND(L_2);
subf(rIdx, rMax, rIdx);
// our main loop does 128 bytes at a time
srdi(rMax, rMax, 7);
/*
* Work out the offset into the constants table to start at. Each
* constant is 16 bytes, and it is used against 128 bytes of input
* data - 128 / 16 = 8
*/
sldi(rTmp1, rMax, 4);
srdi(rTmp2, rTmp2, 3);
subf(rTmp1, rTmp1, rTmp2);
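// Example: a full 32768-byte chunk has 256 blocks, so the offset is 4096 - 256*16 = 0
// (start of the table); a final 16384-byte chunk has 128 blocks and starts 2048 bytes in.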
// We reduce our final 128 bytes in a separate step
addi(rMax, rMax, -1);
mtctr(rMax);
// Find the start of our constants
add(constantsPos, constants, rTmp1);
// zero VR0-VR7 which will contain our checksums
vxor(VR0, VR0, VR0);
vxor(VR1, VR1, VR1);
vxor(VR2, VR2, VR2);
vxor(VR3, VR3, VR3);
vxor(VR4, VR4, VR4);
vxor(VR5, VR5, VR5);
vxor(VR6, VR6, VR6);
vxor(VR7, VR7, VR7);
lvx(const1, constantsPos);
/*
* If we are looping back to consume more data we use the values
* already in VR16-VR23.
*/
cmpdi(CCR0, rLoaded, 1);
beq(CCR0, L_3);
{
// First warm up pass
lvx(VR16, buf);
lvx(VR17, off16, buf);
lvx(VR18, off32, buf);
lvx(VR19, off48, buf);
lvx(VR20, off64, buf);
lvx(VR21, off80, buf);
lvx(VR22, off96, buf);
lvx(VR23, off112, buf);
addi(buf, buf, 8*16);
// xor in initial value
vxor(VR16, VR16, VR8);
}
BIND(L_3);
bdz(L_first_warm_up_done);
addi(constantsPos, constantsPos, 16);
lvx(const2, constantsPos);
// Second warm up pass
vpmsumd(VR8, VR16, const1);
lvx(VR16, buf);
vpmsumd(VR9, VR17, const1);
lvx(VR17, off16, buf);
vpmsumd(VR10, VR18, const1);
lvx(VR18, off32, buf);
vpmsumd(VR11, VR19, const1);
lvx(VR19, off48, buf);
vpmsumd(VR12, VR20, const1);
lvx(VR20, off64, buf);
vpmsumd(VR13, VR21, const1);
lvx(VR21, off80, buf);
vpmsumd(VR14, VR22, const1);
lvx(VR22, off96, buf);
vpmsumd(VR15, VR23, const1);
lvx(VR23, off112, buf);
addi(buf, buf, 8 * 16);
bdz(L_first_cool_down);
/*
* main loop. We modulo schedule it such that it takes three iterations
* to complete - first iteration load, second iteration vpmsum, third
* iteration xor.
*/
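// Concretely: each iteration xor-accumulates into VR0-VR7 the products computed in the
// previous iteration (VR8-VR15), issues vpmsumd on the data loaded in the previous
// iteration (VR16-VR23), and loads the next 128 bytes, so a block is fully folded
// two iterations after it was loaded.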
{
BIND(L_4);
lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
vxor(VR0, VR0, VR8);
vpmsumd(VR8, VR16, const2);
lvx(VR16, buf);
vxor(VR1, VR1, VR9);
vpmsumd(VR9, VR17, const2);
lvx(VR17, off16, buf);
vxor(VR2, VR2, VR10);
vpmsumd(VR10, VR18, const2);
lvx(VR18, off32, buf);
vxor(VR3, VR3, VR11);
vpmsumd(VR11, VR19, const2);
lvx(VR19, off48, buf);
lvx(const2, constantsPos);
vxor(VR4, VR4, VR12);
vpmsumd(VR12, VR20, const1);
lvx(VR20, off64, buf);
vxor(VR5, VR5, VR13);
vpmsumd(VR13, VR21, const1);
lvx(VR21, off80, buf);
vxor(VR6, VR6, VR14);
vpmsumd(VR14, VR22, const1);
lvx(VR22, off96, buf);
vxor(VR7, VR7, VR15);
vpmsumd(VR15, VR23, const1);
lvx(VR23, off112, buf);
addi(buf, buf, 8 * 16);
bdnz(L_4);
}
BIND(L_first_cool_down);
// First cool down pass
lvx(const1, constantsPos);
addi(constantsPos, constantsPos, 16);
vxor(VR0, VR0, VR8);
vpmsumd(VR8, VR16, const1);
vxor(VR1, VR1, VR9);
vpmsumd(VR9, VR17, const1);
vxor(VR2, VR2, VR10);
vpmsumd(VR10, VR18, const1);
vxor(VR3, VR3, VR11);
vpmsumd(VR11, VR19, const1);
vxor(VR4, VR4, VR12);
vpmsumd(VR12, VR20, const1);
vxor(VR5, VR5, VR13);
vpmsumd(VR13, VR21, const1);
vxor(VR6, VR6, VR14);
vpmsumd(VR14, VR22, const1);
vxor(VR7, VR7, VR15);
vpmsumd(VR15, VR23, const1);
BIND(L_second_cool_down);
// Second cool down pass
vxor(VR0, VR0, VR8);
vxor(VR1, VR1, VR9);
vxor(VR2, VR2, VR10);
vxor(VR3, VR3, VR11);
vxor(VR4, VR4, VR12);
vxor(VR5, VR5, VR13);
vxor(VR6, VR6, VR14);
vxor(VR7, VR7, VR15);
/*
* vpmsumd produces a 96 bit result in the least significant bits
* of the register. Since we are bit reflected we have to shift it
* left 32 bits so it occupies the least significant bits in the
* bit reflected domain.
*/
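// Note: vsldoi(d, a, zeroes, 4) selects bytes 4..19 of a||zeroes, i.e. it shifts the
// 128-bit value left by 4 bytes, filling with zero bytes on the right.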
vsldoi(VR0, VR0, zeroes, 4);
vsldoi(VR1, VR1, zeroes, 4);
vsldoi(VR2, VR2, zeroes, 4);
vsldoi(VR3, VR3, zeroes, 4);
vsldoi(VR4, VR4, zeroes, 4);
vsldoi(VR5, VR5, zeroes, 4);
vsldoi(VR6, VR6, zeroes, 4);
vsldoi(VR7, VR7, zeroes, 4);
// xor with last 1024 bits
lvx(VR8, buf);
lvx(VR9, off16, buf);
lvx(VR10, off32, buf);
lvx(VR11, off48, buf);
lvx(VR12, off64, buf);
lvx(VR13, off80, buf);
lvx(VR14, off96, buf);
lvx(VR15, off112, buf);
addi(buf, buf, 8 * 16);
vxor(VR16, VR0, VR8);
vxor(VR17, VR1, VR9);
vxor(VR18, VR2, VR10);
vxor(VR19, VR3, VR11);
vxor(VR20, VR4, VR12);
vxor(VR21, VR5, VR13);
vxor(VR22, VR6, VR14);
vxor(VR23, VR7, VR15);
li(rLoaded, 1);
cmpdi(CCR0, rIdx, 0);
addi(rIdx, rIdx, 128);
bne(CCR0, L_1);
}
// Work out how many bytes we have left
andi_(len, len, 127);
// Calculate where in the constant table we need to start
subfic(rTmp1, len, 128);
add(constantsPos, constantsPos, rTmp1);
// How many 16 byte chunks are in the tail
srdi(rIdx, len, 4);
mtctr(rIdx);
/*
* Reduce the previously calculated 1024 bits to 64 bits, shifting
* 32 bits to include the trailing 32 bits of zeros
*/
lvx(VR0, constantsPos);
lvx(VR1, off16, constantsPos);
lvx(VR2, off32, constantsPos);
lvx(VR3, off48, constantsPos);
lvx(VR4, off64, constantsPos);
lvx(VR5, off80, constantsPos);
lvx(VR6, off96, constantsPos);
lvx(VR7, off112, constantsPos);
addi(constantsPos, constantsPos, 8 * 16);
vpmsumw(VR0, VR16, VR0);
vpmsumw(VR1, VR17, VR1);
vpmsumw(VR2, VR18, VR2);
vpmsumw(VR3, VR19, VR3);
vpmsumw(VR4, VR20, VR4);
vpmsumw(VR5, VR21, VR5);
vpmsumw(VR6, VR22, VR6);
vpmsumw(VR7, VR23, VR7);
// Now reduce the tail (0 - 112 bytes)
cmpdi(CCR0, rIdx, 0);
beq(CCR0, L_XOR);
lvx(VR16, buf); addi(buf, buf, 16);
lvx(VR17, constantsPos);
vpmsumw(VR16, VR16, VR17);
vxor(VR0, VR0, VR16);
beq(CCR0, L_XOR);
lvx(VR16, buf); addi(buf, buf, 16);
lvx(VR17, off16, constantsPos);
vpmsumw(VR16, VR16, VR17);
vxor(VR0, VR0, VR16);
beq(CCR0, L_XOR);
lvx(VR16, buf); addi(buf, buf, 16);
lvx(VR17, off32, constantsPos);
vpmsumw(VR16, VR16, VR17);
vxor(VR0, VR0, VR16);
beq(CCR0, L_XOR);
lvx(VR16, buf); addi(buf, buf, 16);
lvx(VR17, off48, constantsPos);
vpmsumw(VR16, VR16, VR17);
vxor(VR0, VR0, VR16);
beq(CCR0, L_XOR);
lvx(VR16, buf); addi(buf, buf, 16);
lvx(VR17, off64, constantsPos);
vpmsumw(VR16, VR16, VR17);
vxor(VR0, VR0, VR16);
beq(CCR0, L_XOR);
lvx(VR16, buf); addi(buf, buf, 16);
lvx(VR17, off80, constantsPos);
vpmsumw(VR16, VR16, VR17);
vxor(VR0, VR0, VR16);
beq(CCR0, L_XOR);
lvx(VR16, buf); addi(buf, buf, 16);
lvx(VR17, off96, constantsPos);
vpmsumw(VR16, VR16, VR17);
vxor(VR0, VR0, VR16);
// Now xor all the parallel chunks together
BIND(L_XOR);
vxor(VR0, VR0, VR1);
vxor(VR2, VR2, VR3);
vxor(VR4, VR4, VR5);
vxor(VR6, VR6, VR7);
vxor(VR0, VR0, VR2);
vxor(VR4, VR4, VR6);
vxor(VR0, VR0, VR4);
b(L_barrett_reduction);
BIND(L_first_warm_up_done);
lvx(const1, constantsPos);
addi(constantsPos, constantsPos, 16);
vpmsumd(VR8, VR16, const1);
vpmsumd(VR9, VR17, const1);
vpmsumd(VR10, VR18, const1);
vpmsumd(VR11, VR19, const1);
vpmsumd(VR12, VR20, const1);
vpmsumd(VR13, VR21, const1);
vpmsumd(VR14, VR22, const1);
vpmsumd(VR15, VR23, const1);
b(L_second_cool_down);
BIND(L_barrett_reduction);
lvx(const1, barretConstants);
addi(barretConstants, barretConstants, 16);
lvx(const2, barretConstants);
vsldoi(VR1, VR0, VR0, -8);
vxor(VR0, VR0, VR1); // xor two 64 bit results together
// shift left one bit
vspltisb(VR1, 1);
vsl(VR0, VR0, VR1);
vand(VR0, VR0, mask_64bit);
/*
* The reflected version of Barrett reduction. Instead of bit
* reflecting our data (which is expensive to do), we bit reflect our
* constants and our algorithm, which means the intermediate data in
* our vector registers goes from 0-63 instead of 63-0. We can reflect
* the algorithm because we don't carry in mod 2 arithmetic.
*/
vand(VR1, VR0, mask_32bit); // bottom 32 bits of a
vpmsumd(VR1, VR1, const1); // ma
vand(VR1, VR1, mask_32bit); // bottom 32bits of ma
vpmsumd(VR1, VR1, const2); // qn
vxor(VR0, VR0, VR1); // a - qn, subtraction is xor in GF(2)
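// Reference (non-reflected) Barrett reduction of a 64-bit value a modulo the 33-bit CRC
// polynomial n: with m = floor(x^64 / n), q = floor(floor(a / x^32) * m / x^32) and the
// remainder is a xor q*n. const1 holds m and const2 holds n, both bit-reflected, and the
// two 32-bit masks above take over the role of the floor divisions.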
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in
* the high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
vsldoi(VR0, VR0, zeroes, 4); // shift result into the top 64 bits of the register
// Get it into r3
mfvrd(crc, VR0);
BIND(L_end);
offsetInt = 0;
// Restore non-volatile Vector registers (frameless).
offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
offsetInt -= 8; ld(R22, offsetInt, R1_SP);
offsetInt -= 8; ld(R23, offsetInt, R1_SP);
offsetInt -= 8; ld(R24, offsetInt, R1_SP);
offsetInt -= 8; ld(R25, offsetInt, R1_SP);
offsetInt -= 8; ld(R26, offsetInt, R1_SP);
offsetInt -= 8; ld(R27, offsetInt, R1_SP);
offsetInt -= 8; ld(R28, offsetInt, R1_SP);
offsetInt -= 8; ld(R29, offsetInt, R1_SP);
offsetInt -= 8; ld(R30, offsetInt, R1_SP);
offsetInt -= 8; ld(R31, offsetInt, R1_SP);
}
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
@@ -656,6 +656,13 @@ class MacroAssembler: public Assembler {
Register tc0, Register tc1, Register tc2, Register tc3);
void kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3);
void kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2, Register t3, Register t4);
void kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2);
void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp);
//
@@ -2482,9 +2482,7 @@ class StubGenerator: public StubCodeGenerator {
* R5_ARG3 - int length (of buffer)
*
* scratch:
* R6_ARG4 - crc table address
* R7_ARG5 - tmp1
* R8_ARG6 - tmp2
* R2, R6-R12
*
* Output:
* R3_RET - int crc result
@@ -2496,28 +2494,62 @@ class StubGenerator: public StubCodeGenerator {
address start = __ function_entry(); // Remember stub start address (is rtn value).
// arguments to kernel_crc32:
Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
Register data = R4_ARG2; // source byte array
Register dataLen = R5_ARG3; // #bytes to process
Register table = R6_ARG4; // crc table address
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
const Register data = R4_ARG2; // source byte array
const Register dataLen = R5_ARG3; // #bytes to process
Register t0 = R9; // work reg for kernel* emitters
Register t1 = R10; // work reg for kernel* emitters
Register t2 = R11; // work reg for kernel* emitters
Register t3 = R12; // work reg for kernel* emitters
const Register table = R6; // crc table address
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
#ifdef VM_LITTLE_ENDIAN
if (VM_Version::has_vpmsumb()) {
const Register constants = R2; // constants address
const Register bconstants = R8; // barret table address
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
const Register t0 = R9;
const Register t1 = R10;
const Register t2 = R11;
const Register t3 = R12;
const Register t4 = R7;
__ kernel_crc32_1byte(crc, data, dataLen, table, t0, t1, t2, t3);
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
__ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
BLOCK_COMMENT("} Stub body");
} else
#endif
{
const Register t0 = R2;
const Register t1 = R7;
const Register t2 = R8;
const Register t3 = R9;
const Register tc0 = R10;
const Register tc1 = R11;
const Register tc2 = R12;
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
BLOCK_COMMENT("} Stub body");
}
BLOCK_COMMENT("} Stub body");
return start;
}
@@ -45,6 +45,8 @@ enum platform_dependent_constants {
#else
#define CRC32_TABLES 1
#endif
#define CRC32_CONSTANTS_SIZE 1084
#define CRC32_BARRET_CONSTANTS 10
class ppc64 {
friend class StubGenerator;
@@ -53,11 +55,17 @@ class ppc64 {
// CRC32 Intrinsics.
static juint _crc_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
static juint* _constants;
static juint* _barret_constants;
public:
// CRC32 Intrinsics.
static void generate_load_crc_table_addr(MacroAssembler* masm, Register table);
static void generate_load_crc_constants_addr(MacroAssembler* masm, Register table);
static void generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table);
static juint* generate_crc_constants();
static juint* generate_crc_barret_constants();
};
@@ -102,7 +102,7 @@ void VM_Version::initialize() {
// Create and print feature-string.
char buf[(num_features+1) * 16]; // Max 16 chars per feature.
jio_snprintf(buf, sizeof(buf),
"ppc64%s%s%s%s%s%s%s%s%s",
"ppc64%s%s%s%s%s%s%s%s%s%s",
(has_fsqrt() ? " fsqrt" : ""),
(has_isel() ? " isel" : ""),
(has_lxarxeh() ? " lxarxeh" : ""),
@@ -112,7 +112,8 @@ void VM_Version::initialize() {
(has_popcntw() ? " popcntw" : ""),
(has_fcfids() ? " fcfids" : ""),
(has_vand() ? " vand" : ""),
(has_vcipher() ? " aes" : "")
(has_vcipher() ? " aes" : ""),
(has_vpmsumb() ? " vpmsumb" : "")
// Make sure number of %s matches num_features!
);
_features_str = strdup(buf);
@@ -485,6 +486,7 @@ void VM_Version::determine_features() {
a->fcfids(F3, F4); // code[8] -> fcfids
a->vand(VR0, VR0, VR0); // code[9] -> vand
a->vcipher(VR0, VR1, VR2); // code[10] -> vcipher
a->vpmsumb(VR0, VR1, VR2); // code[11] -> vpmsumb
a->blr();
// Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@@ -529,6 +531,7 @@ void VM_Version::determine_features() {
if (code[feature_cntr++]) features |= fcfids_m;
if (code[feature_cntr++]) features |= vand_m;
if (code[feature_cntr++]) features |= vcipher_m;
if (code[feature_cntr++]) features |= vpmsumb_m;
// Print the detection code.
if (PrintAssembly) {
@@ -43,6 +43,7 @@ protected:
vand,
dcba,
vcipher,
vpmsumb,
num_features // last entry to count features
};
enum Feature_Flag_Set {
@@ -58,6 +59,7 @@ protected:
vand_m = (1 << vand ),
dcba_m = (1 << dcba ),
vcipher_m = (1 << vcipher),
vpmsumb_m = (1 << vpmsumb),
all_features_m = -1
};
static int _features;
@@ -86,6 +88,7 @@ public:
static bool has_vand() { return (_features & vand_m) != 0; }
static bool has_dcba() { return (_features & dcba_m) != 0; }
static bool has_vcipher() { return (_features & vcipher_m) != 0; }
static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
static const char* cpu_features() { return _features_str; }