From 95c29028471f2feac09f081da6f8af063d2ed5af Mon Sep 17 00:00:00 2001 From: gromero Date: Wed, 24 Apr 2019 11:48:37 -0400 Subject: [PATCH] 8154156: PPC64: improve array copy stubs by using vector instructions Reviewed-by: goetz, mdoerr Contributed-by: Kazunori Ogata --- src/cpu/ppc/vm/assembler_ppc.hpp | 27 +++++- src/cpu/ppc/vm/assembler_ppc.inline.hpp | 12 ++- src/cpu/ppc/vm/globals_ppc.hpp | 17 +++- src/cpu/ppc/vm/register_ppc.cpp | 15 +++- src/cpu/ppc/vm/register_ppc.hpp | 104 +++++++++++++++++++++++- src/cpu/ppc/vm/stubGenerator_ppc.cpp | 80 +++++++++++++----- src/cpu/ppc/vm/vm_version_ppc.cpp | 104 +++++++++++++++++++++--- src/cpu/ppc/vm/vm_version_ppc.hpp | 17 +++- 8 files changed, 335 insertions(+), 41 deletions(-) diff --git a/src/cpu/ppc/vm/assembler_ppc.hpp b/src/cpu/ppc/vm/assembler_ppc.hpp index ddd07518a..1c742edae 100644 --- a/src/cpu/ppc/vm/assembler_ppc.hpp +++ b/src/cpu/ppc/vm/assembler_ppc.hpp @@ -469,6 +469,8 @@ class Assembler : public AbstractAssembler { LVSR_OPCODE = (31u << OPCODE_SHIFT | 38u << 1), // Vector-Scalar (VSX) instruction support. + LXVD2X_OPCODE = (31u << OPCODE_SHIFT | 844u << 1), + STXVD2X_OPCODE = (31u << OPCODE_SHIFT | 972u << 1), MTVSRD_OPCODE = (31u << OPCODE_SHIFT | 179u << 1), MFVSRD_OPCODE = (31u << OPCODE_SHIFT | 51u << 1), @@ -670,8 +672,10 @@ class Assembler : public AbstractAssembler { // Atomics. LWARX_OPCODE = (31u << OPCODE_SHIFT | 20u << 1), LDARX_OPCODE = (31u << OPCODE_SHIFT | 84u << 1), + LQARX_OPCODE = (31u << OPCODE_SHIFT | 276u << 1), STWCX_OPCODE = (31u << OPCODE_SHIFT | 150u << 1), - STDCX_OPCODE = (31u << OPCODE_SHIFT | 214u << 1) + STDCX_OPCODE = (31u << OPCODE_SHIFT | 214u << 1), + STQCX_OPCODE = (31u << OPCODE_SHIFT | 182u << 1) }; @@ -1052,6 +1056,19 @@ class Assembler : public AbstractAssembler { static int vrs( VectorRegister r) { return vrs(r->encoding());} static int vrt( VectorRegister r) { return vrt(r->encoding());} + // Support Vector-Scalar (VSX) instructions. + static int vsra( int x) { return opp_u_field(x, 15, 11); } + static int vsrb( int x) { return opp_u_field(x, 20, 16); } + static int vsrc( int x) { return opp_u_field(x, 25, 21); } + static int vsrs( int x) { return opp_u_field(x, 10, 6); } + static int vsrt( int x) { return opp_u_field(x, 10, 6); } + + static int vsra( VectorSRegister r) { return vsra(r->encoding());} + static int vsrb( VectorSRegister r) { return vsrb(r->encoding());} + static int vsrc( VectorSRegister r) { return vsrc(r->encoding());} + static int vsrs( VectorSRegister r) { return vsrs(r->encoding());} + static int vsrt( VectorSRegister r) { return vsrt(r->encoding());} + static int vsplt_uim( int x) { return opp_u_field(x, 15, 12); } // for vsplt* instructions static int vsplti_sim(int x) { return opp_u_field(x, 15, 11); } // for vsplti* instructions static int vsldoi_shb(int x) { return opp_u_field(x, 25, 22); } // for vsldoi instruction @@ -1663,11 +1680,14 @@ class Assembler : public AbstractAssembler { // atomics inline void lwarx_unchecked(Register d, Register a, Register b, int eh1 = 0); inline void ldarx_unchecked(Register d, Register a, Register b, int eh1 = 0); + inline void lqarx_unchecked(Register d, Register a, Register b, int eh1 = 0); inline bool lxarx_hint_exclusive_access(); inline void lwarx( Register d, Register a, Register b, bool hint_exclusive_access = false); inline void ldarx( Register d, Register a, Register b, bool hint_exclusive_access = false); + inline void lqarx( Register d, Register a, Register b, bool hint_exclusive_access = false); inline void stwcx_( Register s, Register a, Register b); inline void stdcx_( Register s, Register a, Register b); + inline void stqcx_( Register s, Register a, Register b); // Instructions for adjusting thread priority for simultaneous // multithreading (SMT) on Power5. @@ -1943,6 +1963,8 @@ class Assembler : public AbstractAssembler { inline void mfvscr( VectorRegister d); // Vector-Scalar (VSX) instructions. + inline void lxvd2x( VectorSRegister d, Register a, Register b); + inline void stxvd2x( VectorSRegister d, Register a, Register b); inline void mtvrd( VectorRegister d, Register a); inline void mfvrd( Register a, VectorRegister d); @@ -2022,10 +2044,13 @@ class Assembler : public AbstractAssembler { // Atomics: use ra0mem to disallow R0 as base. inline void lwarx_unchecked(Register d, Register b, int eh1); inline void ldarx_unchecked(Register d, Register b, int eh1); + inline void lqarx_unchecked(Register d, Register b, int eh1); inline void lwarx( Register d, Register b, bool hint_exclusive_access); inline void ldarx( Register d, Register b, bool hint_exclusive_access); + inline void lqarx( Register d, Register b, bool hint_exclusive_access); inline void stwcx_(Register s, Register b); inline void stdcx_(Register s, Register b); + inline void stqcx_(Register s, Register b); inline void lfs( FloatRegister d, int si16); inline void lfsx( FloatRegister d, Register b); inline void lfd( FloatRegister d, int si16); diff --git a/src/cpu/ppc/vm/assembler_ppc.inline.hpp b/src/cpu/ppc/vm/assembler_ppc.inline.hpp index 3c66cbb54..c86e01627 100644 --- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp +++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018, SAP SE. All rights reserved. + * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2019, SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -504,11 +504,14 @@ inline void Assembler::elemental_membar(int e) { assert(0 < e && e < 16, "invali // Use ra0mem to disallow R0 as base. inline void Assembler::lwarx_unchecked(Register d, Register a, Register b, int eh1) { emit_int32( LWARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); } inline void Assembler::ldarx_unchecked(Register d, Register a, Register b, int eh1) { emit_int32( LDARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); } +inline void Assembler::lqarx_unchecked(Register d, Register a, Register b, int eh1) { emit_int32( LQARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); } inline bool Assembler::lxarx_hint_exclusive_access() { return VM_Version::has_lxarxeh(); } inline void Assembler::lwarx( Register d, Register a, Register b, bool hint_exclusive_access) { lwarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } inline void Assembler::ldarx( Register d, Register a, Register b, bool hint_exclusive_access) { ldarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } +inline void Assembler::lqarx( Register d, Register a, Register b, bool hint_exclusive_access) { lqarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } inline void Assembler::stwcx_(Register s, Register a, Register b) { emit_int32( STWCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); } inline void Assembler::stdcx_(Register s, Register a, Register b) { emit_int32( STDCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); } +inline void Assembler::stqcx_(Register s, Register a, Register b) { emit_int32( STQCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); } // Instructions for adjusting thread priority // for simultaneous multithreading (SMT) on POWER5. @@ -624,6 +627,8 @@ inline void Assembler::lvsl( VectorRegister d, Register s1, Register s2) { emit inline void Assembler::lvsr( VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); } // Vector-Scalar (VSX) instructions. +inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); } +inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); } inline void Assembler::mtvrd( VectorRegister d, Register a) { emit_int32( MTVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec). inline void Assembler::mfvrd( Register a, VectorRegister d) { emit_int32( MFVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec). @@ -833,10 +838,13 @@ inline void Assembler::dcbtstct(Register s2, int ct) { emit_int32( DCBTST_OPCOD // ra0 version inline void Assembler::lwarx_unchecked(Register d, Register b, int eh1) { emit_int32( LWARX_OPCODE | rt(d) | rb(b) | eh(eh1)); } inline void Assembler::ldarx_unchecked(Register d, Register b, int eh1) { emit_int32( LDARX_OPCODE | rt(d) | rb(b) | eh(eh1)); } +inline void Assembler::lqarx_unchecked(Register d, Register b, int eh1) { emit_int32( LQARX_OPCODE | rt(d) | rb(b) | eh(eh1)); } inline void Assembler::lwarx( Register d, Register b, bool hint_exclusive_access){ lwarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } inline void Assembler::ldarx( Register d, Register b, bool hint_exclusive_access){ ldarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } +inline void Assembler::lqarx( Register d, Register b, bool hint_exclusive_access){ lqarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } inline void Assembler::stwcx_(Register s, Register b) { emit_int32( STWCX_OPCODE | rs(s) | rb(b) | rc(1)); } inline void Assembler::stdcx_(Register s, Register b) { emit_int32( STDCX_OPCODE | rs(s) | rb(b) | rc(1)); } +inline void Assembler::stqcx_(Register s, Register b) { emit_int32( STQCX_OPCODE | rs(s) | rb(b) | rc(1)); } // ra0 version inline void Assembler::lfs( FloatRegister d, int si16) { emit_int32( LFS_OPCODE | frt(d) | simm(si16,16)); } diff --git a/src/cpu/ppc/vm/globals_ppc.hpp b/src/cpu/ppc/vm/globals_ppc.hpp index 386bae680..28ca2c0c8 100644 --- a/src/cpu/ppc/vm/globals_ppc.hpp +++ b/src/cpu/ppc/vm/globals_ppc.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2013 SAP AG. All rights reserved. + * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2018 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -81,6 +81,19 @@ define_pd_global(uintx, TypeProfileLevel, 0); product(bool, ReoptimizeCallSequences, true, \ "Reoptimize code-sequences of calls at runtime.") \ \ + /* Power 8: Configure Data Stream Control Register. */ \ + product(uint64_t,DSCR_PPC64, (uintx)-1, \ + "Power8 or later: Specify encoded value for Data Stream Control " \ + "Register") \ + product(uint64_t,DSCR_DPFD_PPC64, 8, \ + "Power8 or later: DPFD (default prefetch depth) value of the " \ + "Data Stream Control Register." \ + " 0: hardware default, 1: none, 2-7: min-max, 8: don't touch") \ + product(uint64_t,DSCR_URG_PPC64, 8, \ + "Power8 or later: URG (depth attainment urgency) value of the " \ + "Data Stream Control Register." \ + " 0: hardware default, 1: none, 2-7: min-max, 8: don't touch") \ + \ product(bool, UseLoadInstructionsForStackBangingPPC64, false, \ "Use load instructions for stack banging.") \ \ diff --git a/src/cpu/ppc/vm/register_ppc.cpp b/src/cpu/ppc/vm/register_ppc.cpp index edff7485a..60ba9ed91 100644 --- a/src/cpu/ppc/vm/register_ppc.cpp +++ b/src/cpu/ppc/vm/register_ppc.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2013 SAP AG. All rights reserved. + * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2018 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -75,3 +75,14 @@ const char* VectorRegisterImpl::name() const { }; return is_valid() ? names[encoding()] : "vnoreg"; } + +const char* VectorSRegisterImpl::name() const { + const char* names[number_of_registers] = { + "VSR0", "VSR1", "VSR2", "VSR3", "VSR4", "VSR5", "VSR6", "VSR7", + "VSR8", "VSR9", "VSR10", "VSR11", "VSR12", "VSR13", "VSR14", "VSR15", + "VSR16", "VSR17", "VSR18", "VSR19", "VSR20", "VSR21", "VSR22", "VSR23", + "VSR24", "VSR25", "VSR26", "VSR27", "VSR28", "VSR29", "VSR30", "VSR31" + }; + return is_valid() ? names[encoding()] : "vsnoreg"; +} + diff --git a/src/cpu/ppc/vm/register_ppc.hpp b/src/cpu/ppc/vm/register_ppc.hpp index 107c5bab8..9fa72a32b 100644 --- a/src/cpu/ppc/vm/register_ppc.hpp +++ b/src/cpu/ppc/vm/register_ppc.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2018 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -492,6 +492,106 @@ CONSTANT_REGISTER_DECLARATION(VectorRegister, VR31, (31)); #endif // DONT_USE_REGISTER_DEFINES +// Use VectorSRegister as a shortcut. +class VectorSRegisterImpl; +typedef VectorSRegisterImpl* VectorSRegister; + +inline VectorSRegister as_VectorSRegister(int encoding) { + return (VectorSRegister)(intptr_t)encoding; +} + +// The implementation of Vector-Scalar (VSX) registers on POWER architecture. +class VectorSRegisterImpl: public AbstractRegisterImpl { + public: + enum { + number_of_registers = 32 + }; + + // construction + inline friend VectorSRegister as_VectorSRegister(int encoding); + + // accessors + int encoding() const { assert(is_valid(), "invalid register"); return value(); } + + // testers + bool is_valid() const { return 0 <= value() && value() < number_of_registers; } + + const char* name() const; +}; + +// The Vector-Scalar (VSX) registers of the POWER architecture. + +CONSTANT_REGISTER_DECLARATION(VectorSRegister, vsnoreg, (-1)); + +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR0, ( 0)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR1, ( 1)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR2, ( 2)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR3, ( 3)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR4, ( 4)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR5, ( 5)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR6, ( 6)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR7, ( 7)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR8, ( 8)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR9, ( 9)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR10, (10)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR11, (11)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR12, (12)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR13, (13)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR14, (14)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR15, (15)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR16, (16)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR17, (17)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR18, (18)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR19, (19)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR20, (20)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR21, (21)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR22, (22)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR23, (23)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR24, (24)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR25, (25)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR26, (26)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR27, (27)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR28, (28)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR29, (29)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR30, (30)); +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR31, (31)); + +#ifndef DONT_USE_REGISTER_DEFINES +#define vsnoregi ((VectorSRegister)(vsnoreg_VectorSRegisterEnumValue)) +#define VSR0 ((VectorSRegister)( VSR0_VectorSRegisterEnumValue)) +#define VSR1 ((VectorSRegister)( VSR1_VectorSRegisterEnumValue)) +#define VSR2 ((VectorSRegister)( VSR2_VectorSRegisterEnumValue)) +#define VSR3 ((VectorSRegister)( VSR3_VectorSRegisterEnumValue)) +#define VSR4 ((VectorSRegister)( VSR4_VectorSRegisterEnumValue)) +#define VSR5 ((VectorSRegister)( VSR5_VectorSRegisterEnumValue)) +#define VSR6 ((VectorSRegister)( VSR6_VectorSRegisterEnumValue)) +#define VSR7 ((VectorSRegister)( VSR7_VectorSRegisterEnumValue)) +#define VSR8 ((VectorSRegister)( VSR8_VectorSRegisterEnumValue)) +#define VSR9 ((VectorSRegister)( VSR9_VectorSRegisterEnumValue)) +#define VSR10 ((VectorSRegister)( VSR10_VectorSRegisterEnumValue)) +#define VSR11 ((VectorSRegister)( VSR11_VectorSRegisterEnumValue)) +#define VSR12 ((VectorSRegister)( VSR12_VectorSRegisterEnumValue)) +#define VSR13 ((VectorSRegister)( VSR13_VectorSRegisterEnumValue)) +#define VSR14 ((VectorSRegister)( VSR14_VectorSRegisterEnumValue)) +#define VSR15 ((VectorSRegister)( VSR15_VectorSRegisterEnumValue)) +#define VSR16 ((VectorSRegister)( VSR16_VectorSRegisterEnumValue)) +#define VSR17 ((VectorSRegister)( VSR17_VectorSRegisterEnumValue)) +#define VSR18 ((VectorSRegister)( VSR18_VectorSRegisterEnumValue)) +#define VSR19 ((VectorSRegister)( VSR19_VectorSRegisterEnumValue)) +#define VSR20 ((VectorSRegister)( VSR20_VectorSRegisterEnumValue)) +#define VSR21 ((VectorSRegister)( VSR21_VectorSRegisterEnumValue)) +#define VSR22 ((VectorSRegister)( VSR22_VectorSRegisterEnumValue)) +#define VSR23 ((VectorSRegister)( VSR23_VectorSRegisterEnumValue)) +#define VSR24 ((VectorSRegister)( VSR24_VectorSRegisterEnumValue)) +#define VSR25 ((VectorSRegister)( VSR25_VectorSRegisterEnumValue)) +#define VSR26 ((VectorSRegister)( VSR26_VectorSRegisterEnumValue)) +#define VSR27 ((VectorSRegister)( VSR27_VectorSRegisterEnumValue)) +#define VSR28 ((VectorSRegister)( VSR28_VectorSRegisterEnumValue)) +#define VSR29 ((VectorSRegister)( VSR29_VectorSRegisterEnumValue)) +#define VSR30 ((VectorSRegister)( VSR30_VectorSRegisterEnumValue)) +#define VSR31 ((VectorSRegister)( VSR31_VectorSRegisterEnumValue)) +#endif // DONT_USE_REGISTER_DEFINES + // Maximum number of incoming arguments that can be passed in i registers. const int PPC_ARGS_IN_REGS_NUM = 8; diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp index 1b7fb419a..3d302c4b0 100644 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018, SAP SE. All rights reserved. + * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2019, SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -1352,9 +1352,13 @@ class StubGenerator: public StubCodeGenerator { Register tmp3 = R8_ARG6; Register tmp4 = R9_ARG7; + VectorSRegister tmp_vsr1 = VSR1; + VectorSRegister tmp_vsr2 = VSR2; + address start = __ function_entry(); - Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8; + Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9; + // don't try anything fancy if arrays don't have many elements __ li(tmp3, 0); __ cmpwi(CCR0, R5_ARG3, 9); @@ -1412,22 +1416,60 @@ class StubGenerator: public StubCodeGenerator { __ andi_(R5_ARG3, R5_ARG3, 15); __ mtctr(tmp1); - __ bind(l_8); - // Use unrolled version for mass copying (copy 16 elements a time). - // Load feeding store gets zero latency on Power6, however not on Power5. - // Therefore, the following sequence is made for the good of both. - __ ld(tmp1, 0, R3_ARG1); - __ ld(tmp2, 8, R3_ARG1); - __ ld(tmp3, 16, R3_ARG1); - __ ld(tmp4, 24, R3_ARG1); - __ std(tmp1, 0, R4_ARG2); - __ std(tmp2, 8, R4_ARG2); - __ std(tmp3, 16, R4_ARG2); - __ std(tmp4, 24, R4_ARG2); - __ addi(R3_ARG1, R3_ARG1, 32); - __ addi(R4_ARG2, R4_ARG2, 32); - __ bdnz(l_8); - } + if (!VM_Version::has_vsx()) { + + __ bind(l_8); + // Use unrolled version for mass copying (copy 16 elements a time). + // Load feeding store gets zero latency on Power6, however not on Power5. + // Therefore, the following sequence is made for the good of both. + __ ld(tmp1, 0, R3_ARG1); + __ ld(tmp2, 8, R3_ARG1); + __ ld(tmp3, 16, R3_ARG1); + __ ld(tmp4, 24, R3_ARG1); + __ std(tmp1, 0, R4_ARG2); + __ std(tmp2, 8, R4_ARG2); + __ std(tmp3, 16, R4_ARG2); + __ std(tmp4, 24, R4_ARG2); + __ addi(R3_ARG1, R3_ARG1, 32); + __ addi(R4_ARG2, R4_ARG2, 32); + __ bdnz(l_8); + + } else { // Processor supports VSX, so use it to mass copy. + + // Prefetch src data into L2 cache. + __ dcbt(R3_ARG1, 0); + + // If supported set DSCR pre-fetch to deepest. + if (VM_Version::has_mfdscr()) { + __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7); + __ mtdscr(tmp2); + } + __ li(tmp1, 16); + + // Backbranch target aligned to 32-byte. It's not aligned 16-byte + // as loop contains < 8 instructions that fit inside a single + // i-cache sector. + __ align(32); + + __ bind(l_9); + // Use loop with VSX load/store instructions to + // copy 16 elements a time. + __ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load from src. + __ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst. + __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16. + __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16. + __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32. + __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32. + __ bdnz(l_9); // Dec CTR and loop if not zero. + + // Restore DSCR pre-fetch value. + if (VM_Version::has_mfdscr()) { + __ load_const_optimized(tmp2, VM_Version::_dscr_val); + __ mtdscr(tmp2); + } + + } + } // FasterArrayCopy __ bind(l_6); // copy 2 elements at a time diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp index eb47bb2a6..5620895c0 100644 --- a/src/cpu/ppc/vm/vm_version_ppc.cpp +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2018 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -45,7 +45,7 @@ int VM_Version::_features = VM_Version::unknown_m; int VM_Version::_measured_cache_line_size = 128; // default value const char* VM_Version::_features_str = ""; bool VM_Version::_is_determine_features_test_running = false; - +uint64_t VM_Version::_dscr_val = 0; #define MSG(flag) \ if (flag && !FLAG_IS_DEFAULT(flag)) \ @@ -60,7 +60,9 @@ void VM_Version::initialize() { // If PowerArchitecturePPC64 hasn't been specified explicitly determine from features. if (FLAG_IS_DEFAULT(PowerArchitecturePPC64)) { - if (VM_Version::has_popcntw()) { + if (VM_Version::has_lqarx()) { + FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 8); + } else if (VM_Version::has_popcntw()) { FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 7); } else if (VM_Version::has_cmpb()) { FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 6); @@ -71,8 +73,14 @@ void VM_Version::initialize() { } } guarantee(PowerArchitecturePPC64 == 0 || PowerArchitecturePPC64 == 5 || - PowerArchitecturePPC64 == 6 || PowerArchitecturePPC64 == 7, - "PowerArchitecturePPC64 should be 0, 5, 6 or 7"); + PowerArchitecturePPC64 == 6 || PowerArchitecturePPC64 == 7 || + PowerArchitecturePPC64 == 8, + "PowerArchitecturePPC64 should be 0, 5, 6, 7, or 8"); + + // Power 8: Configure Data Stream Control Register. + if (PowerArchitecturePPC64 >= 8) { + config_dscr(); + } if (!UseSIGTRAP) { MSG(TrapBasedICMissChecks); @@ -102,7 +110,7 @@ void VM_Version::initialize() { // Create and print feature-string. char buf[(num_features+1) * 16]; // Max 16 chars per feature. jio_snprintf(buf, sizeof(buf), - "ppc64%s%s%s%s%s%s%s%s%s%s", + "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s", (has_fsqrt() ? " fsqrt" : ""), (has_isel() ? " isel" : ""), (has_lxarxeh() ? " lxarxeh" : ""), @@ -112,12 +120,17 @@ void VM_Version::initialize() { (has_popcntw() ? " popcntw" : ""), (has_fcfids() ? " fcfids" : ""), (has_vand() ? " vand" : ""), + (has_lqarx() ? " lqarx" : ""), (has_vcipher() ? " aes" : ""), - (has_vpmsumb() ? " vpmsumb" : "") + (has_vpmsumb() ? " vpmsumb" : ""), + (has_mfdscr() ? " mfdscr" : ""), + (has_vsx() ? " vsx" : "") // Make sure number of %s matches num_features! ); _features_str = strdup(buf); - NOT_PRODUCT(if (Verbose) print_features();); + if (Verbose) { + print_features(); + } // PPC64 supports 8-byte compare-exchange operations (see // Atomic::cmpxchg and StubGenerator::generate_atomic_cmpxchg_ptr) @@ -485,8 +498,11 @@ void VM_Version::determine_features() { a->popcntw(R7, R5); // code[7] -> popcntw a->fcfids(F3, F4); // code[8] -> fcfids a->vand(VR0, VR0, VR0); // code[9] -> vand - a->vcipher(VR0, VR1, VR2); // code[10] -> vcipher - a->vpmsumb(VR0, VR1, VR2); // code[11] -> vpmsumb + a->lqarx_unchecked(R7, R3_ARG1, R4_ARG2, 1); // code[10] -> lqarx_m + a->vcipher(VR0, VR1, VR2); // code[11] -> vcipher + a->vpmsumb(VR0, VR1, VR2); // code[12] -> vpmsumb + a->mfdscr(R0); // code[13] -> mfdscr + a->lxvd2x(VSR0, 0, R3_ARG1); // code[14] -> vsx a->blr(); // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it. @@ -530,8 +546,11 @@ void VM_Version::determine_features() { if (code[feature_cntr++]) features |= popcntw_m; if (code[feature_cntr++]) features |= fcfids_m; if (code[feature_cntr++]) features |= vand_m; + if (code[feature_cntr++]) features |= lqarx_m; if (code[feature_cntr++]) features |= vcipher_m; if (code[feature_cntr++]) features |= vpmsumb_m; + if (code[feature_cntr++]) features |= mfdscr_m; + if (code[feature_cntr++]) features |= vsx_m; // Print the detection code. if (PrintAssembly) { @@ -543,6 +562,69 @@ void VM_Version::determine_features() { _features = features; } +// Power 8: Configure Data Stream Control Register. +void VM_Version::config_dscr() { + assert(has_lqarx(), "Only execute on Power 8 or later!"); + + // 7 InstWords for each call (function descriptor + blr instruction). + const int code_size = (2+2*7)*BytesPerInstWord; + + // Allocate space for the code. + ResourceMark rm; + CodeBuffer cb("config_dscr", code_size, 0); + MacroAssembler* a = new MacroAssembler(&cb); + + // Emit code. + uint64_t (*get_dscr)() = (uint64_t(*)())(void *)a->function_entry(); + uint32_t *code = (uint32_t *)a->pc(); + a->mfdscr(R3); + a->blr(); + + void (*set_dscr)(long) = (void(*)(long))(void *)a->function_entry(); + a->mtdscr(R3); + a->blr(); + + uint32_t *code_end = (uint32_t *)a->pc(); + a->flush(); + + // Print the detection code. + if (PrintAssembly) { + ttyLocker ttyl; + tty->print_cr("Decoding dscr configuration stub at " INTPTR_FORMAT " before execution:", p2i(code)); + Disassembler::decode((u_char*)code, (u_char*)code_end, tty); + } + + // Apply the configuration if needed. + _dscr_val = (*get_dscr)(); + if (Verbose) { + tty->print_cr("dscr value was 0x%lx" , _dscr_val); + } + bool change_requested = false; + if (DSCR_PPC64 != (uintx)-1) { + _dscr_val = DSCR_PPC64; + change_requested = true; + } + if (DSCR_DPFD_PPC64 <= 7) { + uint64_t mask = 0x7; + if ((_dscr_val & mask) != DSCR_DPFD_PPC64) { + _dscr_val = (_dscr_val & ~mask) | (DSCR_DPFD_PPC64); + change_requested = true; + } + } + if (DSCR_URG_PPC64 <= 7) { + uint64_t mask = 0x7 << 6; + if ((_dscr_val & mask) != DSCR_DPFD_PPC64 << 6) { + _dscr_val = (_dscr_val & ~mask) | (DSCR_URG_PPC64 << 6); + change_requested = true; + } + } + if (change_requested) { + (*set_dscr)(_dscr_val); + if (Verbose) { + tty->print_cr("dscr was set to 0x%lx" , (*get_dscr)()); + } + } +} static int saved_features = 0; diff --git a/src/cpu/ppc/vm/vm_version_ppc.hpp b/src/cpu/ppc/vm/vm_version_ppc.hpp index 55442f720..6245e1c64 100644 --- a/src/cpu/ppc/vm/vm_version_ppc.hpp +++ b/src/cpu/ppc/vm/vm_version_ppc.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2018 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -42,8 +42,11 @@ protected: fcfids, vand, dcba, + lqarx, vcipher, vpmsumb, + mfdscr, + vsx, num_features // last entry to count features }; enum Feature_Flag_Set { @@ -58,8 +61,11 @@ protected: fcfids_m = (1 << fcfids ), vand_m = (1 << vand ), dcba_m = (1 << dcba ), + lqarx_m = (1 << lqarx ), vcipher_m = (1 << vcipher), vpmsumb_m = (1 << vpmsumb), + mfdscr_m = (1 << mfdscr ), + vsx_m = (1 << vsx ), all_features_m = -1 }; static int _features; @@ -69,6 +75,7 @@ protected: static void print_features(); static void determine_features(); // also measures cache line size + static void config_dscr(); // Power 8: Configure Data Stream Control Register. static void determine_section_size(); static void power6_micro_bench(); public: @@ -87,8 +94,11 @@ public: static bool has_fcfids() { return (_features & fcfids_m) != 0; } static bool has_vand() { return (_features & vand_m) != 0; } static bool has_dcba() { return (_features & dcba_m) != 0; } + static bool has_lqarx() { return (_features & lqarx_m) != 0; } static bool has_vcipher() { return (_features & vcipher_m) != 0; } static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; } + static bool has_mfdscr() { return (_features & mfdscr_m) != 0; } + static bool has_vsx() { return (_features & vsx_m) != 0; } static const char* cpu_features() { return _features_str; } @@ -97,6 +107,9 @@ public: // Assembler testing static void allow_all(); static void revert(); + + // POWER 8: DSCR current value. + static uint64_t _dscr_val; }; #endif // CPU_PPC_VM_VM_VERSION_PPC_HPP -- GitLab