8154156: PPC64: improve array copy stubs by using vector instructions

Reviewed-by: goetz, mdoerr Contributed-by: N Kazunori Ogata <ogatak@jp.ibm.com>

8154156: PPC64: improve array copy stubs by using vector instructions
Reviewed-by: goetz, mdoerr Contributed-by: N Kazunori Ogata <ogatak@jp.ibm.com>
95c29028 · gromero · a44dcc97 · 95c29028 · 95c29028 · 95c29028
8 changed file
--- a/src/cpu/ppc/vm/assembler_ppc.hpp
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp
@@ -469,6 +469,8 @@ class Assembler : public AbstractAssembler {
    LVSR_OPCODE    = (31u << OPCODE_SHIFT |   38u << 1),

    // Vector-Scalar (VSX) instruction support.
+    LXVD2X_OPCODE  = (31u << OPCODE_SHIFT |  844u << 1),
+    STXVD2X_OPCODE = (31u << OPCODE_SHIFT |  972u << 1),
    MTVSRD_OPCODE  = (31u << OPCODE_SHIFT |  179u << 1),
    MFVSRD_OPCODE  = (31u << OPCODE_SHIFT |   51u << 1),

@@ -670,8 +672,10 @@ class Assembler : public AbstractAssembler {
    // Atomics.
    LWARX_OPCODE   = (31u << OPCODE_SHIFT |   20u << 1),
    LDARX_OPCODE   = (31u << OPCODE_SHIFT |   84u << 1),
+    LQARX_OPCODE   = (31u << OPCODE_SHIFT |  276u << 1),
    STWCX_OPCODE   = (31u << OPCODE_SHIFT |  150u << 1),
-    STDCX_OPCODE   = (31u << OPCODE_SHIFT |  214u << 1)
+    STDCX_OPCODE   = (31u << OPCODE_SHIFT |  214u << 1),
+    STQCX_OPCODE   = (31u << OPCODE_SHIFT |  182u << 1)

  };

@@ -1052,6 +1056,19 @@ class Assembler : public AbstractAssembler {
  static int vrs(   VectorRegister r)  { return  vrs(r->encoding());}
  static int vrt(   VectorRegister r)  { return  vrt(r->encoding());}

+  // Support Vector-Scalar (VSX) instructions.
+  static int vsra(      int         x)  { return  opp_u_field(x,            15, 11); }
+  static int vsrb(      int         x)  { return  opp_u_field(x,            20, 16); }
+  static int vsrc(      int         x)  { return  opp_u_field(x,            25, 21); }
+  static int vsrs(      int         x)  { return  opp_u_field(x,            10,  6); }
+  static int vsrt(      int         x)  { return  opp_u_field(x,            10,  6); }
+
+  static int vsra(   VectorSRegister r)  { return  vsra(r->encoding());}
+  static int vsrb(   VectorSRegister r)  { return  vsrb(r->encoding());}
+  static int vsrc(   VectorSRegister r)  { return  vsrc(r->encoding());}
+  static int vsrs(   VectorSRegister r)  { return  vsrs(r->encoding());}
+  static int vsrt(   VectorSRegister r)  { return  vsrt(r->encoding());}
+
  static int vsplt_uim( int        x)  { return  opp_u_field(x,             15, 12); } // for vsplt* instructions
  static int vsplti_sim(int        x)  { return  opp_u_field(x,             15, 11); } // for vsplti* instructions
  static int vsldoi_shb(int        x)  { return  opp_u_field(x,             25, 22); } // for vsldoi instruction
@@ -1663,11 +1680,14 @@ class Assembler : public AbstractAssembler {
  // atomics
  inline void lwarx_unchecked(Register d, Register a, Register b, int eh1 = 0);
  inline void ldarx_unchecked(Register d, Register a, Register b, int eh1 = 0);
+  inline void lqarx_unchecked(Register d, Register a, Register b, int eh1 = 0);
  inline bool lxarx_hint_exclusive_access();
  inline void lwarx(  Register d, Register a, Register b, bool hint_exclusive_access = false);
  inline void ldarx(  Register d, Register a, Register b, bool hint_exclusive_access = false);
+  inline void lqarx(  Register d, Register a, Register b, bool hint_exclusive_access = false);
  inline void stwcx_( Register s, Register a, Register b);
  inline void stdcx_( Register s, Register a, Register b);
+  inline void stqcx_( Register s, Register a, Register b);

  // Instructions for adjusting thread priority for simultaneous
  // multithreading (SMT) on Power5.
@@ -1943,6 +1963,8 @@ class Assembler : public AbstractAssembler {
  inline void mfvscr(   VectorRegister d);

  // Vector-Scalar (VSX) instructions.
+  inline void lxvd2x(   VectorSRegister d, Register a, Register b);
+  inline void stxvd2x(  VectorSRegister d, Register a, Register b);
  inline void mtvrd(    VectorRegister  d, Register a);
  inline void mfvrd(    Register        a, VectorRegister d);

@@ -2022,10 +2044,13 @@ class Assembler : public AbstractAssembler {
  // Atomics: use ra0mem to disallow R0 as base.
  inline void lwarx_unchecked(Register d, Register b, int eh1);
  inline void ldarx_unchecked(Register d, Register b, int eh1);
+  inline void lqarx_unchecked(Register d, Register b, int eh1);
  inline void lwarx( Register d, Register b, bool hint_exclusive_access);
  inline void ldarx( Register d, Register b, bool hint_exclusive_access);
+  inline void lqarx( Register d, Register b, bool hint_exclusive_access);
  inline void stwcx_(Register s, Register b);
  inline void stdcx_(Register s, Register b);
+  inline void stqcx_(Register s, Register b);
  inline void lfs(   FloatRegister d, int si16);
  inline void lfsx(  FloatRegister d, Register b);
  inline void lfd(   FloatRegister d, int si16);

--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp
 /*
- * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
+ * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -504,11 +504,14 @@ inline void Assembler::elemental_membar(int e) { assert(0 < e && e < 16, "invali
 // Use ra0mem to disallow R0 as base.
 inline void Assembler::lwarx_unchecked(Register d, Register a, Register b, int eh1)           { emit_int32( LWARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); }
 inline void Assembler::ldarx_unchecked(Register d, Register a, Register b, int eh1)           { emit_int32( LDARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); }
+inline void Assembler::lqarx_unchecked(Register d, Register a, Register b, int eh1)           { emit_int32( LQARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); }
 inline bool Assembler::lxarx_hint_exclusive_access()                                          { return VM_Version::has_lxarxeh(); }
 inline void Assembler::lwarx( Register d, Register a, Register b, bool hint_exclusive_access) { lwarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
 inline void Assembler::ldarx( Register d, Register a, Register b, bool hint_exclusive_access) { ldarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
+inline void Assembler::lqarx( Register d, Register a, Register b, bool hint_exclusive_access) { lqarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
 inline void Assembler::stwcx_(Register s, Register a, Register b)                             { emit_int32( STWCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); }
 inline void Assembler::stdcx_(Register s, Register a, Register b)                             { emit_int32( STDCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); }
+inline void Assembler::stqcx_(Register s, Register a, Register b)                             { emit_int32( STQCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); }

 // Instructions for adjusting thread priority
 // for simultaneous multithreading (SMT) on POWER5.
@@ -624,6 +627,8 @@ inline void Assembler::lvsl(  VectorRegister d, Register s1, Register s2) { emit
 inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }

 // Vector-Scalar (VSX) instructions.
+inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(s1) | rb(s2)); }
+inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); }
 inline void Assembler::mtvrd(  VectorRegister  d, Register a)               { emit_int32( MTVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
 inline void Assembler::mfvrd(  Register        a, VectorRegister d)         { emit_int32( MFVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).

@@ -833,10 +838,13 @@ inline void Assembler::dcbtstct(Register s2, int ct)  { emit_int32( DCBTST_OPCOD
 // ra0 version
 inline void Assembler::lwarx_unchecked(Register d, Register b, int eh1)          { emit_int32( LWARX_OPCODE | rt(d) | rb(b) | eh(eh1)); }
 inline void Assembler::ldarx_unchecked(Register d, Register b, int eh1)          { emit_int32( LDARX_OPCODE | rt(d) | rb(b) | eh(eh1)); }
+inline void Assembler::lqarx_unchecked(Register d, Register b, int eh1)          { emit_int32( LQARX_OPCODE | rt(d) | rb(b) | eh(eh1)); }
 inline void Assembler::lwarx( Register d, Register b, bool hint_exclusive_access){ lwarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
 inline void Assembler::ldarx( Register d, Register b, bool hint_exclusive_access){ ldarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
+inline void Assembler::lqarx( Register d, Register b, bool hint_exclusive_access){ lqarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
 inline void Assembler::stwcx_(Register s, Register b)                            { emit_int32( STWCX_OPCODE | rs(s) | rb(b) | rc(1)); }
 inline void Assembler::stdcx_(Register s, Register b)                            { emit_int32( STDCX_OPCODE | rs(s) | rb(b) | rc(1)); }
+inline void Assembler::stqcx_(Register s, Register b)                            { emit_int32( STQCX_OPCODE | rs(s) | rb(b) | rc(1)); }

 // ra0 version
 inline void Assembler::lfs( FloatRegister d, int si16)   { emit_int32( LFS_OPCODE  | frt(d) | simm(si16,16)); }

--- a/src/cpu/ppc/vm/globals_ppc.hpp
+++ b/src/cpu/ppc/vm/globals_ppc.hpp
 /*
- * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2012, 2013 SAP AG. All rights reserved.
+ * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, 2018 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -81,6 +81,19 @@ define_pd_global(uintx, TypeProfileLevel, 0);
  product(bool, ReoptimizeCallSequences, true,                              \
          "Reoptimize code-sequences of calls at runtime.")                 \
                                                                            \
+  /* Power 8: Configure Data Stream Control Register. */                    \
+  product(uint64_t,DSCR_PPC64, (uintx)-1,                                   \
+          "Power8 or later: Specify encoded value for Data Stream Control " \
+          "Register")                                                       \
+  product(uint64_t,DSCR_DPFD_PPC64, 8,                                      \
+          "Power8 or later: DPFD (default prefetch depth) value of the "    \
+          "Data Stream Control Register."                                   \
+          " 0: hardware default, 1: none, 2-7: min-max, 8: don't touch")    \
+  product(uint64_t,DSCR_URG_PPC64, 8,                                       \
+          "Power8 or later: URG (depth attainment urgency) value of the "   \
+          "Data Stream Control Register."                                   \
+          " 0: hardware default, 1: none, 2-7: min-max, 8: don't touch")    \
+                                                                            \
  product(bool, UseLoadInstructionsForStackBangingPPC64, false,             \
          "Use load instructions for stack banging.")                       \
                                                                            \

--- a/src/cpu/ppc/vm/register_ppc.cpp
+++ b/src/cpu/ppc/vm/register_ppc.cpp
 /*
- * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2012, 2013 SAP AG. All rights reserved.
+ * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, 2018 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -75,3 +75,14 @@ const char* VectorRegisterImpl::name() const {
  };
  return is_valid() ? names[encoding()] : "vnoreg";
 }
+
+const char* VectorSRegisterImpl::name() const {
+  const char* names[number_of_registers] = {
+    "VSR0",  "VSR1",  "VSR2",  "VSR3",  "VSR4",  "VSR5",  "VSR6",  "VSR7",
+    "VSR8",  "VSR9",  "VSR10", "VSR11", "VSR12", "VSR13", "VSR14", "VSR15",
+    "VSR16", "VSR17", "VSR18", "VSR19", "VSR20", "VSR21", "VSR22", "VSR23",
+    "VSR24", "VSR25", "VSR26", "VSR27", "VSR28", "VSR29", "VSR30", "VSR31"
+  };
+  return is_valid() ? names[encoding()] : "vsnoreg";
+}
+
--- a/src/cpu/ppc/vm/register_ppc.hpp
+++ b/src/cpu/ppc/vm/register_ppc.hpp
 /*
- * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2012, 2014 SAP AG. All rights reserved.
+ * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, 2018 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -492,6 +492,106 @@ CONSTANT_REGISTER_DECLARATION(VectorRegister, VR31, (31));
 #endif // DONT_USE_REGISTER_DEFINES


+// Use VectorSRegister as a shortcut.
+class VectorSRegisterImpl;
+typedef VectorSRegisterImpl* VectorSRegister;
+
+inline VectorSRegister as_VectorSRegister(int encoding) {
+  return (VectorSRegister)(intptr_t)encoding;
+}
+
+// The implementation of Vector-Scalar (VSX) registers on POWER architecture.
+class VectorSRegisterImpl: public AbstractRegisterImpl {
+ public:
+  enum {
+    number_of_registers = 32
+  };
+
+  // construction
+  inline friend VectorSRegister as_VectorSRegister(int encoding);
+
+  // accessors
+  int encoding() const { assert(is_valid(), "invalid register"); return value(); }
+
+  // testers
+  bool is_valid() const { return 0 <=  value() &&  value() < number_of_registers; }
+
+  const char* name() const;
+};
+
+// The Vector-Scalar (VSX) registers of the POWER architecture.
+
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, vsnoreg, (-1));
+
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR0,  ( 0));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR1,  ( 1));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR2,  ( 2));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR3,  ( 3));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR4,  ( 4));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR5,  ( 5));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR6,  ( 6));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR7,  ( 7));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR8,  ( 8));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR9,  ( 9));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR10, (10));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR11, (11));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR12, (12));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR13, (13));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR14, (14));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR15, (15));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR16, (16));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR17, (17));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR18, (18));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR19, (19));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR20, (20));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR21, (21));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR22, (22));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR23, (23));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR24, (24));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR25, (25));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR26, (26));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR27, (27));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR28, (28));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR29, (29));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR30, (30));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR31, (31));
+
+#ifndef DONT_USE_REGISTER_DEFINES
+#define vsnoregi ((VectorSRegister)(vsnoreg_VectorSRegisterEnumValue))
+#define VSR0    ((VectorSRegister)(   VSR0_VectorSRegisterEnumValue))
+#define VSR1    ((VectorSRegister)(   VSR1_VectorSRegisterEnumValue))
+#define VSR2    ((VectorSRegister)(   VSR2_VectorSRegisterEnumValue))
+#define VSR3    ((VectorSRegister)(   VSR3_VectorSRegisterEnumValue))
+#define VSR4    ((VectorSRegister)(   VSR4_VectorSRegisterEnumValue))
+#define VSR5    ((VectorSRegister)(   VSR5_VectorSRegisterEnumValue))
+#define VSR6    ((VectorSRegister)(   VSR6_VectorSRegisterEnumValue))
+#define VSR7    ((VectorSRegister)(   VSR7_VectorSRegisterEnumValue))
+#define VSR8    ((VectorSRegister)(   VSR8_VectorSRegisterEnumValue))
+#define VSR9    ((VectorSRegister)(   VSR9_VectorSRegisterEnumValue))
+#define VSR10   ((VectorSRegister)(  VSR10_VectorSRegisterEnumValue))
+#define VSR11   ((VectorSRegister)(  VSR11_VectorSRegisterEnumValue))
+#define VSR12   ((VectorSRegister)(  VSR12_VectorSRegisterEnumValue))
+#define VSR13   ((VectorSRegister)(  VSR13_VectorSRegisterEnumValue))
+#define VSR14   ((VectorSRegister)(  VSR14_VectorSRegisterEnumValue))
+#define VSR15   ((VectorSRegister)(  VSR15_VectorSRegisterEnumValue))
+#define VSR16   ((VectorSRegister)(  VSR16_VectorSRegisterEnumValue))
+#define VSR17   ((VectorSRegister)(  VSR17_VectorSRegisterEnumValue))
+#define VSR18   ((VectorSRegister)(  VSR18_VectorSRegisterEnumValue))
+#define VSR19   ((VectorSRegister)(  VSR19_VectorSRegisterEnumValue))
+#define VSR20   ((VectorSRegister)(  VSR20_VectorSRegisterEnumValue))
+#define VSR21   ((VectorSRegister)(  VSR21_VectorSRegisterEnumValue))
+#define VSR22   ((VectorSRegister)(  VSR22_VectorSRegisterEnumValue))
+#define VSR23   ((VectorSRegister)(  VSR23_VectorSRegisterEnumValue))
+#define VSR24   ((VectorSRegister)(  VSR24_VectorSRegisterEnumValue))
+#define VSR25   ((VectorSRegister)(  VSR25_VectorSRegisterEnumValue))
+#define VSR26   ((VectorSRegister)(  VSR26_VectorSRegisterEnumValue))
+#define VSR27   ((VectorSRegister)(  VSR27_VectorSRegisterEnumValue))
+#define VSR28   ((VectorSRegister)(  VSR28_VectorSRegisterEnumValue))
+#define VSR29   ((VectorSRegister)(  VSR29_VectorSRegisterEnumValue))
+#define VSR30   ((VectorSRegister)(  VSR30_VectorSRegisterEnumValue))
+#define VSR31   ((VectorSRegister)(  VSR31_VectorSRegisterEnumValue))
+#endif // DONT_USE_REGISTER_DEFINES
+
 // Maximum number of incoming arguments that can be passed in i registers.
 const int PPC_ARGS_IN_REGS_NUM = 8;


--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
 /*
- * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
+ * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -1352,9 +1352,13 @@ class StubGenerator: public StubCodeGenerator {
    Register tmp3 = R8_ARG6;
    Register tmp4 = R9_ARG7;

+    VectorSRegister tmp_vsr1  = VSR1;
+    VectorSRegister tmp_vsr2  = VSR2;
+
    address start = __ function_entry();

-      Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
+
    // don't try anything fancy if arrays don't have many elements
    __ li(tmp3, 0);
    __ cmpwi(CCR0, R5_ARG3, 9);
@@ -1412,22 +1416,60 @@ class StubGenerator: public StubCodeGenerator {
      __ andi_(R5_ARG3, R5_ARG3, 15);
      __ mtctr(tmp1);

-      __ bind(l_8);
-      // Use unrolled version for mass copying (copy 16 elements a time).
-      // Load feeding store gets zero latency on Power6, however not on Power5.
-      // Therefore, the following sequence is made for the good of both.
-      __ ld(tmp1, 0, R3_ARG1);
-      __ ld(tmp2, 8, R3_ARG1);
-      __ ld(tmp3, 16, R3_ARG1);
-      __ ld(tmp4, 24, R3_ARG1);
-      __ std(tmp1, 0, R4_ARG2);
-      __ std(tmp2, 8, R4_ARG2);
-      __ std(tmp3, 16, R4_ARG2);
-      __ std(tmp4, 24, R4_ARG2);
-      __ addi(R3_ARG1, R3_ARG1, 32);
-      __ addi(R4_ARG2, R4_ARG2, 32);
-      __ bdnz(l_8);
-    }
+      if (!VM_Version::has_vsx()) {
+
+        __ bind(l_8);
+        // Use unrolled version for mass copying (copy 16 elements a time).
+        // Load feeding store gets zero latency on Power6, however not on Power5.
+        // Therefore, the following sequence is made for the good of both.
+        __ ld(tmp1, 0, R3_ARG1);
+        __ ld(tmp2, 8, R3_ARG1);
+        __ ld(tmp3, 16, R3_ARG1);
+        __ ld(tmp4, 24, R3_ARG1);
+        __ std(tmp1, 0, R4_ARG2);
+        __ std(tmp2, 8, R4_ARG2);
+        __ std(tmp3, 16, R4_ARG2);
+        __ std(tmp4, 24, R4_ARG2);
+        __ addi(R3_ARG1, R3_ARG1, 32);
+        __ addi(R4_ARG2, R4_ARG2, 32);
+        __ bdnz(l_8);
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch src data into L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. It's not aligned 16-byte
+        // as loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_9);
+        // Use loop with VSX load/store instructions to
+        // copy 16 elements a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
+        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
+        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
+        __ bdnz(l_9);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      }
+    } // FasterArrayCopy
    __ bind(l_6);

    // copy 2 elements at a time

--- a/src/cpu/ppc/vm/vm_version_ppc.cpp
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2012, 2014 SAP AG. All rights reserved.
+ * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, 2018 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -45,7 +45,7 @@ int VM_Version::_features = VM_Version::unknown_m;
 int VM_Version::_measured_cache_line_size = 128; // default value
 const char* VM_Version::_features_str = "";
 bool VM_Version::_is_determine_features_test_running = false;
-
+uint64_t VM_Version::_dscr_val = 0;

 #define MSG(flag)   \
  if (flag && !FLAG_IS_DEFAULT(flag))                                  \
@@ -60,7 +60,9 @@ void VM_Version::initialize() {

  // If PowerArchitecturePPC64 hasn't been specified explicitly determine from features.
  if (FLAG_IS_DEFAULT(PowerArchitecturePPC64)) {
-    if (VM_Version::has_popcntw()) {
+    if (VM_Version::has_lqarx()) {
+      FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 8);
+    } else if (VM_Version::has_popcntw()) {
      FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 7);
    } else if (VM_Version::has_cmpb()) {
      FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 6);
@@ -71,8 +73,14 @@ void VM_Version::initialize() {
    }
  }
  guarantee(PowerArchitecturePPC64 == 0 || PowerArchitecturePPC64 == 5 ||
-            PowerArchitecturePPC64 == 6 || PowerArchitecturePPC64 == 7,
-            "PowerArchitecturePPC64 should be 0, 5, 6 or 7");
+            PowerArchitecturePPC64 == 6 || PowerArchitecturePPC64 == 7 ||
+            PowerArchitecturePPC64 == 8,
+            "PowerArchitecturePPC64 should be 0, 5, 6, 7, or 8");
+
+  // Power 8: Configure Data Stream Control Register.
+  if (PowerArchitecturePPC64 >= 8) {
+    config_dscr();
+  }

  if (!UseSIGTRAP) {
    MSG(TrapBasedICMissChecks);
@@ -102,7 +110,7 @@ void VM_Version::initialize() {
  // Create and print feature-string.
  char buf[(num_features+1) * 16]; // Max 16 chars per feature.
  jio_snprintf(buf, sizeof(buf),
-               "ppc64%s%s%s%s%s%s%s%s%s%s",
+               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s",
               (has_fsqrt()   ? " fsqrt"   : ""),
               (has_isel()    ? " isel"    : ""),
               (has_lxarxeh() ? " lxarxeh" : ""),
@@ -112,12 +120,17 @@ void VM_Version::initialize() {
               (has_popcntw() ? " popcntw" : ""),
               (has_fcfids()  ? " fcfids"  : ""),
               (has_vand()    ? " vand"    : ""),
+               (has_lqarx()   ? " lqarx"   : ""),
               (has_vcipher() ? " aes"     : ""),
-               (has_vpmsumb() ? " vpmsumb" : "")
+               (has_vpmsumb() ? " vpmsumb" : ""),
+               (has_mfdscr()  ? " mfdscr"  : ""),
+               (has_vsx()     ? " vsx"     : "")
               // Make sure number of %s matches num_features!
              );
  _features_str = strdup(buf);
-  NOT_PRODUCT(if (Verbose) print_features(););
+  if (Verbose) {
+    print_features();
+  }

  // PPC64 supports 8-byte compare-exchange operations (see
  // Atomic::cmpxchg and StubGenerator::generate_atomic_cmpxchg_ptr)
@@ -485,8 +498,11 @@ void VM_Version::determine_features() {
  a->popcntw(R7, R5);                          // code[7] -> popcntw
  a->fcfids(F3, F4);                           // code[8] -> fcfids
  a->vand(VR0, VR0, VR0);                      // code[9] -> vand
-  a->vcipher(VR0, VR1, VR2);                   // code[10] -> vcipher
-  a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
+  a->lqarx_unchecked(R7, R3_ARG1, R4_ARG2, 1); // code[10] -> lqarx_m
+  a->vcipher(VR0, VR1, VR2);                   // code[11] -> vcipher
+  a->vpmsumb(VR0, VR1, VR2);                   // code[12] -> vpmsumb
+  a->mfdscr(R0);                               // code[13] -> mfdscr
+  a->lxvd2x(VSR0, 0, R3_ARG1);                 // code[14] -> vsx
  a->blr();

  // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@@ -530,8 +546,11 @@ void VM_Version::determine_features() {
  if (code[feature_cntr++]) features |= popcntw_m;
  if (code[feature_cntr++]) features |= fcfids_m;
  if (code[feature_cntr++]) features |= vand_m;
+  if (code[feature_cntr++]) features |= lqarx_m;
  if (code[feature_cntr++]) features |= vcipher_m;
  if (code[feature_cntr++]) features |= vpmsumb_m;
+  if (code[feature_cntr++]) features |= mfdscr_m;
+  if (code[feature_cntr++]) features |= vsx_m;

  // Print the detection code.
  if (PrintAssembly) {
@@ -543,6 +562,69 @@ void VM_Version::determine_features() {
  _features = features;
 }

+// Power 8: Configure Data Stream Control Register.
+void VM_Version::config_dscr() {
+  assert(has_lqarx(), "Only execute on Power 8 or later!");
+
+  // 7 InstWords for each call (function descriptor + blr instruction).
+  const int code_size = (2+2*7)*BytesPerInstWord;
+
+  // Allocate space for the code.
+  ResourceMark rm;
+  CodeBuffer cb("config_dscr", code_size, 0);
+  MacroAssembler* a = new MacroAssembler(&cb);
+
+  // Emit code.
+  uint64_t (*get_dscr)() = (uint64_t(*)())(void *)a->function_entry();
+  uint32_t *code = (uint32_t *)a->pc();
+  a->mfdscr(R3);
+  a->blr();
+
+  void (*set_dscr)(long) = (void(*)(long))(void *)a->function_entry();
+  a->mtdscr(R3);
+  a->blr();
+
+  uint32_t *code_end = (uint32_t *)a->pc();
+  a->flush();
+
+  // Print the detection code.
+  if (PrintAssembly) {
+    ttyLocker ttyl;
+    tty->print_cr("Decoding dscr configuration stub at " INTPTR_FORMAT " before execution:", p2i(code));
+    Disassembler::decode((u_char*)code, (u_char*)code_end, tty);
+  }
+
+  // Apply the configuration if needed.
+  _dscr_val = (*get_dscr)();
+  if (Verbose) {
+    tty->print_cr("dscr value was 0x%lx" , _dscr_val);
+  }
+  bool change_requested = false;
+  if (DSCR_PPC64 != (uintx)-1) {
+    _dscr_val = DSCR_PPC64;
+    change_requested = true;
+  }
+  if (DSCR_DPFD_PPC64 <= 7) {
+    uint64_t mask = 0x7;
+    if ((_dscr_val & mask) != DSCR_DPFD_PPC64) {
+      _dscr_val = (_dscr_val & ~mask) | (DSCR_DPFD_PPC64);
+      change_requested = true;
+    }
+  }
+  if (DSCR_URG_PPC64 <= 7) {
+    uint64_t mask = 0x7 << 6;
+    if ((_dscr_val & mask) != DSCR_DPFD_PPC64 << 6) {
+      _dscr_val = (_dscr_val & ~mask) | (DSCR_URG_PPC64 << 6);
+      change_requested = true;
+    }
+  }
+  if (change_requested) {
+    (*set_dscr)(_dscr_val);
+    if (Verbose) {
+      tty->print_cr("dscr was set to 0x%lx" , (*get_dscr)());
+    }
+  }
+}

 static int saved_features = 0;


--- a/src/cpu/ppc/vm/vm_version_ppc.hpp
+++ b/src/cpu/ppc/vm/vm_version_ppc.hpp
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2012, 2014 SAP AG. All rights reserved.
+ * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, 2018 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -42,8 +42,11 @@ protected:
    fcfids,
    vand,
    dcba,
+    lqarx,
    vcipher,
    vpmsumb,
+    mfdscr,
+    vsx,
    num_features // last entry to count features
  };
  enum Feature_Flag_Set {
@@ -58,8 +61,11 @@ protected:
    fcfids_m              = (1 << fcfids ),
    vand_m                = (1 << vand   ),
    dcba_m                = (1 << dcba   ),
+    lqarx_m               = (1 << lqarx  ),
    vcipher_m             = (1 << vcipher),
    vpmsumb_m             = (1 << vpmsumb),
+    mfdscr_m              = (1 << mfdscr ),
+    vsx_m                 = (1 << vsx    ),
    all_features_m        = -1
  };
  static int  _features;
@@ -69,6 +75,7 @@ protected:

  static void print_features();
  static void determine_features(); // also measures cache line size
+  static void config_dscr(); // Power 8: Configure Data Stream Control Register.
  static void determine_section_size();
  static void power6_micro_bench();
 public:
@@ -87,8 +94,11 @@ public:
  static bool has_fcfids()  { return (_features & fcfids_m) != 0; }
  static bool has_vand()    { return (_features & vand_m) != 0; }
  static bool has_dcba()    { return (_features & dcba_m) != 0; }
+  static bool has_lqarx()   { return (_features & lqarx_m) != 0; }
  static bool has_vcipher() { return (_features & vcipher_m) != 0; }
  static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
+  static bool has_mfdscr()  { return (_features & mfdscr_m) != 0; }
+  static bool has_vsx()     { return (_features & vsx_m) != 0; }

  static const char* cpu_features() { return _features_str; }

@@ -97,6 +107,9 @@ public:
  // Assembler testing
  static void allow_all();
  static void revert();
+
+  // POWER 8: DSCR current value.
+  static uint64_t _dscr_val;
 };

 #endif // CPU_PPC_VM_VM_VERSION_PPC_HPP