提交 28b2e111 编写于 作者: K kevinw

Merge

......@@ -992,3 +992,4 @@ d63e731a896c711b8e0c30ac9ae2d900d4e024f6 jdk8u152-b03
0bd600d6d77b5b41780074bcbfa133032dadf657 jdk8u152-b04
68758c5ab0c1ef01e89bea8a9b799714831a177f jdk8u152-b05
7b96cfeed22242bb68a387d1680e602e37e48050 jdk8u162-b00
92693f9dd704467ddd5fbae5a5908c1713a08ee0 jdk8u162-b01
......@@ -1180,6 +1180,8 @@ class Assembler : public AbstractAssembler {
// Multiply instruction emitters. Each takes a destination register d and
// two source registers a and b; the bodies live in the inline definitions.
// The variants whose name ends in '_' are encoded there with rc(1), the
// plain variants with rc(0).
inline void mullw_( Register d, Register a, Register b);
inline void mulhw( Register d, Register a, Register b);
inline void mulhw_( Register d, Register a, Register b);
inline void mulhwu( Register d, Register a, Register b);
inline void mulhwu_(Register d, Register a, Register b);
inline void mulhd( Register d, Register a, Register b);
inline void mulhd_( Register d, Register a, Register b);
inline void mulhdu( Register d, Register a, Register b);
......
......@@ -109,6 +109,8 @@ inline void Assembler::mullw( Register d, Register a, Register b) { emit_int32(
inline void Assembler::mullw_( Register d, Register a, Register b) { emit_int32(MULLW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); }
inline void Assembler::mulhw( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
inline void Assembler::mulhw_( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
inline void Assembler::mulhwu( Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
inline void Assembler::mulhwu_(Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
inline void Assembler::mulhd( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
inline void Assembler::mulhd_( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
inline void Assembler::mulhdu( Register d, Register a, Register b) { emit_int32(MULHDU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
......
......@@ -45,4 +45,10 @@ void Compile::pd_compiler2_init() {
FLAG_SET_ERGO(bool, InsertEndGroupPPC64, true);
}
}
if (OptimizeFill) {
warning("OptimizeFill is not supported on this CPU.");
FLAG_SET_DEFAULT(OptimizeFill, false);
}
}
//
// Copyright (c) 2011, 2014, Oracle and/or its affiliates. All rights reserved.
// Copyright 2012, 2014 SAP AG. All rights reserved.
// Copyright (c) 2012, 2017 SAP SE. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
......@@ -8610,6 +8610,44 @@ instruct shrP_convP2X_reg_imm6(iRegLdst dst, iRegP_N2P src1, uimmI6 src2) %{
ins_pipe(pipe_class_default);
%}
// Bitfield Extract: URShiftI + AndI
// Matches (src1 >>> src2) & src3 and encodes it as one EXTRDI. The field
// width is derived from the mask constant and limited so that shift plus
// width never exceeds 32 bits.
instruct andI_urShiftI_regI_immI_immIpow2minus1(iRegIdst dst, iRegIsrc src1, immI src2, immIpow2minus1 src3) %{
  match(Set dst (AndI (URShiftI src1 src2) src3));

  format %{ "EXTRDI $dst, $src1, shift=$src2, mask=$src3 \t// int bitfield extract" %}
  size(4);
  ins_encode %{
    // TODO: PPC port $archOpcode(ppc64Opcode_rldicl);
    const int shift_amount = ($src2$$constant) & 0x1f;
    // Number of bits left in the 32-bit value once it has been shifted.
    const int bits_available = 32 - shift_amount;
    int field_width = log2_long(((jlong) $src3$$constant) + 1);
    if (field_width > bits_available) {
      // Narrow the field so that no rotated-in bits are extracted.
      field_width = bits_available;
    }
    __ extrdi($dst$$Register, $src1$$Register, field_width, 64 - (shift_amount + field_width));
  %}
  ins_pipe(pipe_class_default);
%}
// Bitfield Extract: URShiftL + AndL
// Matches (src1 >>> src2) & src3 on long values and encodes it as one
// EXTRDI. The field width is derived from the mask constant and limited so
// that shift plus width never exceeds 64 bits.
instruct andL_urShiftL_regL_immI_immLpow2minus1(iRegLdst dst, iRegLsrc src1, immI src2, immLpow2minus1 src3) %{
  match(Set dst (AndL (URShiftL src1 src2) src3));

  format %{ "EXTRDI $dst, $src1, shift=$src2, mask=$src3 \t// long bitfield extract" %}
  size(4);
  ins_encode %{
    // TODO: PPC port $archOpcode(ppc64Opcode_rldicl);
    const int shift_amount = ($src2$$constant) & 0x3f;
    // Number of bits left in the 64-bit value once it has been shifted.
    const int bits_available = 64 - shift_amount;
    int field_width = log2_long(((jlong) $src3$$constant) + 1);
    if (field_width > bits_available) {
      // Narrow the field so that no rotated-in bits are extracted.
      field_width = bits_available;
    }
    __ extrdi($dst$$Register, $src1$$Register, field_width, 64 - (shift_amount + field_width));
  %}
  ins_pipe(pipe_class_default);
%}
instruct sxtI_reg(iRegIdst dst, iRegIsrc src) %{
match(Set dst (ConvL2I (ConvI2L src)));
......@@ -8889,6 +8927,19 @@ instruct andI_reg_reg(iRegIdst dst, iRegIsrc src1, iRegIsrc src2) %{
ins_pipe(pipe_class_default);
%}
// Left shifted Immediate And
// Matches an AndI with an immIhi16 constant and encodes it as ANDIS, passing
// bits 16..31 of the constant as the 16-bit immediate. CR0 is declared as
// killed.
instruct andI_reg_immIhi16(iRegIdst dst, iRegIsrc src1, immIhi16 src2, flagsRegCR0 cr0) %{
  match(Set dst (AndI src1 src2));
  effect(KILL cr0);

  format %{ "ANDIS $dst, $src1, $src2.hi" %}
  size(4);
  ins_encode %{
    // TODO: PPC port $archOpcode(ppc64Opcode_andis_);
    // Upper halfword of the constant, moved down into the low 16 bits.
    const int upper_halfword = (int)((unsigned short)(($src2$$constant & 0xFFFF0000) >> 16));
    __ andis_($dst$$Register, $src1$$Register, upper_halfword);
  %}
  ins_pipe(pipe_class_default);
%}
// Immediate And
instruct andI_reg_uimm16(iRegIdst dst, iRegIsrc src1, uimmI16 src2, flagsRegCR0 cr0) %{
match(Set dst (AndI src1 src2));
......@@ -10571,6 +10622,17 @@ instruct cmpP_reg_reg(flagsReg crx, iRegP_N2P src1, iRegP_N2P src2) %{
ins_pipe(pipe_class_compare);
%}
// Compares a pointer register against an immP_0or1 constant with a single
// CMPLDI, using the low 16 bits of the constant as the immediate.
instruct cmpP_reg_null(flagsReg crx, iRegP_N2P src1, immP_0or1 src2) %{
  match(Set crx (CmpP src1 src2));

  format %{ "CMPLDI $crx, $src1, $src2 \t// ptr" %}
  size(4);
  ins_encode %{
    // TODO: PPC port $archOpcode(ppc64Opcode_cmpl);
    const int low_halfword = (int)((short)($src2$$constant & 0xFFFF));
    __ cmpldi($crx$$CondRegister, $src1$$Register, low_halfword);
  %}
  ins_pipe(pipe_class_compare);
%}
// Used in postalloc expand.
instruct cmpP_reg_imm16(flagsReg crx, iRegPsrc src1, immL16 src2) %{
// This match rule prevents reordering of node before a safepoint.
......
......@@ -42,6 +42,8 @@
#include "opto/runtime.hpp"
#endif
#include <alloca.h>
#define __ masm->
#ifdef PRODUCT
......@@ -3268,3 +3270,245 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize,
oop_maps, true);
}
//------------------------------Montgomery multiplication------------------------
//
// Subtract 0:b from carry:a. Return carry.
//
// a and b are arrays of len 64-bit words. The difference is written back
// into a one word at a time, and the borrow travels from word to word in
// the carry bit (CA). After the last word the result is carry + CA - 1.
// NOTE(review): the loop count is loaded into CTR and only tested after the
// loop body has run, so len is presumably expected to be >= 1 -- confirm
// against callers.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
// Byte offset into a[] and b[]; the asm advances it by 8 per iteration.
long i = 0;
unsigned long tmp, tmp2;
__asm__ __volatile__ (
"subfc %[tmp], %[tmp], %[tmp] \n" // pre-set CA
"mtctr %[len] \n"
"0: \n"
"ldx %[tmp], %[i], %[a] \n"
"ldx %[tmp2], %[i], %[b] \n"
"subfe %[tmp], %[tmp2], %[tmp] \n" // subtract extended
"stdx %[tmp], %[i], %[a] \n"
"addi %[i], %[i], 8 \n"
"bdnz 0b \n"
"addme %[tmp], %[carry] \n" // carry + CA - 1
// i is read and written; tmp and tmp2 are early-clobber scratch outputs.
: [i]"+b"(i), [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2)
: [a]"r"(a), [b]"r"(b), [carry]"r"(carry), [len]"r"(len)
// The asm uses CTR as loop counter, changes the carry bit and stores to a[].
: "ctr", "xer", "memory"
);
return tmp;
}
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// The low product word (mulld) is added to T0, the high product word
// (mulhdu) plus the carry out of T0 is added to T1, and the carry out of
// T1 is added to T2. T0, T1 and T2 are updated in place.
inline void MACC(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
unsigned long hi, lo;
__asm__ __volatile__ (
"mulld %[lo], %[A], %[B] \n"
"mulhdu %[hi], %[A], %[B] \n"
"addc %[T0], %[T0], %[lo] \n"
"adde %[T1], %[T1], %[hi] \n"
"addze %[T2], %[T2] \n"
// hi and lo are early-clobber scratch outputs; T0..T2 are read-write.
: [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
: [A]"r"(A), [B]"r"(B)
// The add-with-carry chain changes the carry bit.
: "xer"
);
}
// As above, but add twice the double-length result into the
// accumulator.
//
// The product A*B is computed once (lo, hi) and the same
// addc/adde/addze sequence is then executed twice, so carries into T1
// and T2 are propagated for each of the two additions.
inline void MACC2(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
unsigned long hi, lo;
__asm__ __volatile__ (
"mulld %[lo], %[A], %[B] \n"
"mulhdu %[hi], %[A], %[B] \n"
// First addition of hi:lo.
"addc %[T0], %[T0], %[lo] \n"
"adde %[T1], %[T1], %[hi] \n"
"addze %[T2], %[T2] \n"
// Second addition of the same hi:lo.
"addc %[T0], %[T0], %[lo] \n"
"adde %[T1], %[T1], %[hi] \n"
"addze %[T2], %[T2] \n"
// hi and lo are early-clobber scratch outputs; T0..T2 are read-write.
: [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
: [A]"r"(A), [B]"r"(B)
// The add-with-carry chains change the carry bit.
: "xer"
);
}
// Fast Montgomery multiplication. The derivation of the algorithm is
// in "A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237".
//
// a, b, n and m each hold len 64-bit words; the result is written to m.
// inv must satisfy inv * n[0] == -1 (checked by the first assert).
static void
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
                    unsigned long m[], unsigned long inv, int len) {
  // Triple-precision accumulator, t0 being the least significant word.
  unsigned long t0 = 0;
  unsigned long t1 = 0;
  unsigned long t2 = 0;

  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");

  // Columns 0 .. len-1: each column also determines one word of m.
  for (int col = 0; col < len; col++) {
    for (int k = 0; k < col; k++) {
      MACC(a[k], b[col-k], t0, t1, t2);
      MACC(m[k], n[col-k], t0, t1, t2);
    }
    MACC(a[col], b[0], t0, t1, t2);
    m[col] = t0 * inv;
    MACC(m[col], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery multiply");

    // Shift the accumulator down by one word.
    t0 = t1;
    t1 = t2;
    t2 = 0;
  }

  // Columns len .. 2*len-1: the low accumulator word becomes m[col-len].
  for (int out = 0; out < len; out++) {
    const int col = out + len;
    for (int k = out + 1; k < len; k++) {
      MACC(a[k], b[col-k], t0, t1, t2);
      MACC(m[k], n[col-k], t0, t1, t2);
    }
    m[out] = t0;

    t0 = t1;
    t1 = t2;
    t2 = 0;
  }

  // Subtract n from m until no carry word is left.
  while (t0 != 0) {
    t0 = sub(m, n, t0, len);
  }
}
// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
//
// a, n and m each hold len 64-bit words; the result is written to m.
// inv must satisfy inv * n[0] == -1 (checked by the first assert).
// Every cross product a[j]*a[i-j] with j < i-j is added twice via MACC2;
// for even columns the single a[j]*a[j] term is added once via MACC.
static void
montgomery_square(unsigned long a[], unsigned long n[],
                  unsigned long m[], unsigned long inv, int len) {
  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  // The message names this routine so that a failure here is not
  // mistaken for one in montgomery_multiply.
  assert(inv * n[0] == -1UL, "broken inverse in Montgomery square");

  // Columns 0 .. len-1: each column also determines one word of m.
  for (i = 0; i < len; i++) {
    int j;
    int end = (i+1)/2;
    for (j = 0; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      // Even column: j == i-j here, so this is the one square term.
      MACC(a[j], a[j], t0, t1, t2);
    }
    // Remaining m*n products of this column.
    for (; j < i; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0, "broken Montgomery square");

    // Shift the accumulator down by one word.
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Columns len .. 2*len-1: the low accumulator word becomes m[i-len].
  for (i = len; i < 2*len; i++) {
    int start = i-len+1;
    int end = start + (len - start)/2;
    int j;
    for (j = start; j < end; j++) {
      MACC2(a[j], a[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    if ((i & 1) == 0) {
      MACC(a[j], a[j], t0, t1, t2);
    }
    for (; j < len; j++) {
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  // Subtract n from m until no carry word is left.
  while (t0) {
    t0 = sub(m, n, t0, len);
  }
}
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
// Doesn't seem to be relevant for Power8 so we use the same value.
// SharedRuntime::montgomery_square compares this value against the array
// length in jints, not against the number of 64-bit longwords.
#define MONTGOMERY_SQUARING_THRESHOLD 64
// Copies len longwords from s to d in reverse order: the first source
// longword ends up in the last destination slot. When VM_LITTLE_ENDIAN is
// defined, the two 32-bit halves of every longword are exchanged as well.
// Nothing is copied for len <= 0.
static void reverse_words(unsigned long *s, unsigned long *d, int len) {
  // Fill the destination from its end towards its start.
  unsigned long *out = d + len;
  for (int k = 0; k < len; ++k) {
    unsigned long word = s[k];
    // Swap words in a longword on little endian machines.
#ifdef VM_LITTLE_ENDIAN
    word = (word << 32) | (word >> 32);
#endif
    *--out = word;
  }
}
// Runtime entry for Montgomery multiplication on jint arrays.
// The three inputs are copied into stack scratch space as reversed
// longword arrays, multiplied there, and the result is copied back into
// m_ints the same way. len is the array length in jints and must be even.
void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
                                        jint len, jlong inv,
                                        jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  int longwords = len/2;
  assert(longwords > 0, "unsupported");

  // Keep the stack usage bounded: four scratch arrays are needed, and
  // 512 jints (a 16384-bit integer) already add up to 8k bytes.
  int total_allocation = longwords * sizeof (unsigned long) * 4;
  guarantee(total_allocation <= 8192, "must be");
  unsigned long *const stack_buf = (unsigned long *)alloca(total_allocation);

  // Carve the buffer into four consecutive arrays of longwords entries.
  unsigned long *const a_words = stack_buf;
  unsigned long *const b_words = a_words + longwords;
  unsigned long *const n_words = b_words + longwords;
  unsigned long *const m_words = n_words + longwords;

  reverse_words((unsigned long *)a_ints, a_words, longwords);
  reverse_words((unsigned long *)b_ints, b_words, longwords);
  reverse_words((unsigned long *)n_ints, n_words, longwords);

  ::montgomery_multiply(a_words, b_words, n_words, m_words, (unsigned long)inv, longwords);

  reverse_words(m_words, (unsigned long *)m_ints, longwords);
}
// Runtime entry for Montgomery squaring on jint arrays.
// The inputs are copied into stack scratch space as reversed longword
// arrays and the result is copied back into m_ints the same way. Below
// MONTGOMERY_SQUARING_THRESHOLD jints the generic multiply is used with
// a as both factors. len is the array length in jints and must be even.
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
                                      jint len, jlong inv,
                                      jint *m_ints) {
  assert(len % 2 == 0, "array length in montgomery_square must be even");
  int longwords = len/2;
  assert(longwords > 0, "unsupported");

  // Keep the stack usage bounded: three scratch arrays are needed, and
  // 512 jints (a 16384-bit integer) add up to 6k bytes.
  int total_allocation = longwords * sizeof (unsigned long) * 3;
  guarantee(total_allocation <= 8192, "must be");
  unsigned long *const stack_buf = (unsigned long *)alloca(total_allocation);

  // Carve the buffer into three consecutive arrays of longwords entries.
  unsigned long *const a_words = stack_buf;
  unsigned long *const n_words = a_words + longwords;
  unsigned long *const m_words = n_words + longwords;

  reverse_words((unsigned long *)a_ints, a_words, longwords);
  reverse_words((unsigned long *)n_ints, n_words, longwords);

  if (len < MONTGOMERY_SQUARING_THRESHOLD) {
    ::montgomery_multiply(a_words, a_words, n_words, m_words, (unsigned long)inv, longwords);
  } else {
    ::montgomery_square(a_words, n_words, m_words, (unsigned long)inv, longwords);
  }

  reverse_words(m_words, (unsigned long *)m_ints, longwords);
}
......@@ -2524,6 +2524,14 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
}
if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
}
if (UseMontgomerySquareIntrinsic) {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
}
}
public:
......
......@@ -265,7 +265,7 @@ void TemplateInterpreterGenerator::generate_counter_incr(Label* overflow, Label*
__ cmpdi(CCR0, Rmdo, 0);
__ beq(CCR0, no_mdo);
// Increment backedge counter in the MDO.
// Increment invocation counter in the MDO.
const int mdo_bc_offs = in_bytes(MethodData::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
__ lwz(Rscratch2, mdo_bc_offs, Rmdo);
__ addi(Rscratch2, Rscratch2, increment);
......@@ -277,12 +277,12 @@ void TemplateInterpreterGenerator::generate_counter_incr(Label* overflow, Label*
}
// Increment counter in MethodCounters*.
const int mo_bc_offs = in_bytes(MethodCounters::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
const int mo_ic_offs = in_bytes(MethodCounters::invocation_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
__ bind(no_mdo);
__ get_method_counters(R19_method, R3_counters, done);
__ lwz(Rscratch2, mo_bc_offs, R3_counters);
__ lwz(Rscratch2, mo_ic_offs, R3_counters);
__ addi(Rscratch2, Rscratch2, increment);
__ stw(Rscratch2, mo_bc_offs, R3_counters);
__ stw(Rscratch2, mo_ic_offs, R3_counters);
__ load_const_optimized(Rscratch1, mask, R0);
__ and_(Rscratch1, Rscratch2, Rscratch1);
__ beq(CCR0, *overflow);
......
......@@ -201,6 +201,12 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
UseMontgomeryMultiplyIntrinsic = true;
}
if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
UseMontgomerySquareIntrinsic = true;
}
}
void VM_Version::print_features() {
......
......@@ -36,7 +36,7 @@ LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION HS_VER
FILEVERSION JDK_VER
PRODUCTVERSION JDK_VER
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
......@@ -56,7 +56,7 @@ BEGIN
BEGIN
VALUE "CompanyName", XSTR(HS_COMPANY) "\0"
VALUE "FileDescription", XSTR(HS_FILEDESC) "\0"
VALUE "FileVersion", XSTR(HS_DOTVER) "\0"
VALUE "FileVersion", XSTR(JDK_DOTVER) "\0"
VALUE "Full Version", XSTR(HS_BUILD_ID) "\0"
VALUE "InternalName", XSTR(HS_INTERNAL_NAME) "\0"
VALUE "LegalCopyright", XSTR(HS_COPYRIGHT) "\0"
......
......@@ -6068,11 +6068,21 @@ bool LibraryCallKit::inline_montgomeryMultiply() {
Node* n_start = array_element_address(n, intcon(0), n_elem);
Node* m_start = array_element_address(m, intcon(0), m_elem);
Node* call = make_runtime_call(RC_LEAF,
Node* call = NULL;
if (CCallingConventionRequiresIntsAsLongs) {
Node* len_I2L = ConvI2L(len);
call = make_runtime_call(RC_LEAF,
OptoRuntime::montgomeryMultiply_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
a_start, b_start, n_start, len_I2L XTOP, inv,
top(), m_start);
} else {
call = make_runtime_call(RC_LEAF,
OptoRuntime::montgomeryMultiply_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
a_start, b_start, n_start, len, inv, top(),
m_start);
}
set_result(m);
}
......@@ -6122,11 +6132,22 @@ bool LibraryCallKit::inline_montgomerySquare() {
Node* n_start = array_element_address(n, intcon(0), n_elem);
Node* m_start = array_element_address(m, intcon(0), m_elem);
Node* call = make_runtime_call(RC_LEAF,
Node* call = NULL;
if (CCallingConventionRequiresIntsAsLongs) {
Node* len_I2L = ConvI2L(len);
call = make_runtime_call(RC_LEAF,
OptoRuntime::montgomerySquare_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
a_start, n_start, len_I2L XTOP, inv, top(),
m_start);
} else {
call = make_runtime_call(RC_LEAF,
OptoRuntime::montgomerySquare_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
a_start, n_start, len, inv, top(),
m_start);
}
set_result(m);
}
......
......@@ -1003,12 +1003,20 @@ const TypeFunc* OptoRuntime::montgomeryMultiply_Type() {
// create input type (domain)
int num_args = 7;
int argcnt = num_args;
if (CCallingConventionRequiresIntsAsLongs) {
argcnt++; // additional placeholder
}
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypePtr::NOTNULL; // a
fields[argp++] = TypePtr::NOTNULL; // b
fields[argp++] = TypePtr::NOTNULL; // n
if (CCallingConventionRequiresIntsAsLongs) {
fields[argp++] = TypeLong::LONG; // len
fields[argp++] = TypeLong::HALF; // placeholder
} else {
fields[argp++] = TypeInt::INT; // len
}
fields[argp++] = TypeLong::LONG; // inv
fields[argp++] = Type::HALF;
fields[argp++] = TypePtr::NOTNULL; // result
......@@ -1027,11 +1035,19 @@ const TypeFunc* OptoRuntime::montgomerySquare_Type() {
// create input type (domain)
int num_args = 6;
int argcnt = num_args;
if (CCallingConventionRequiresIntsAsLongs) {
argcnt++; // additional placeholder
}
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypePtr::NOTNULL; // a
fields[argp++] = TypePtr::NOTNULL; // n
if (CCallingConventionRequiresIntsAsLongs) {
fields[argp++] = TypeLong::LONG; // len
fields[argp++] = TypeLong::HALF; // placeholder
} else {
fields[argp++] = TypeInt::INT; // len
}
fields[argp++] = TypeLong::LONG; // inv
fields[argp++] = Type::HALF;
fields[argp++] = TypePtr::NOTNULL; // result
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册