提交 510019d2 编写于 作者: A amurillo

Merge

...@@ -641,3 +641,4 @@ ced08ed4924fc6581626c7ce2d769fc18d7b23e0 jdk8u60-b13 ...@@ -641,3 +641,4 @@ ced08ed4924fc6581626c7ce2d769fc18d7b23e0 jdk8u60-b13
c9f8b7319d0a5ab07310cf53507642a8fd91589b jdk8u60-b14 c9f8b7319d0a5ab07310cf53507642a8fd91589b jdk8u60-b14
4187dc92e90b16b4097627b8af4f5e6e63f3b497 hs25.60-b15 4187dc92e90b16b4097627b8af4f5e6e63f3b497 hs25.60-b15
b99f1bf208f385277b03a985d35b6614b4095f3e jdk8u60-b15 b99f1bf208f385277b03a985d35b6614b4095f3e jdk8u60-b15
f5800068c61d0627c14e99836e9ce5cf0ef00075 hs25.60-b16
...@@ -35,7 +35,7 @@ HOTSPOT_VM_COPYRIGHT=Copyright 2015 ...@@ -35,7 +35,7 @@ HOTSPOT_VM_COPYRIGHT=Copyright 2015
HS_MAJOR_VER=25 HS_MAJOR_VER=25
HS_MINOR_VER=60 HS_MINOR_VER=60
HS_BUILD_NUMBER=15 HS_BUILD_NUMBER=16
JDK_MAJOR_VER=1 JDK_MAJOR_VER=1
JDK_MINOR_VER=8 JDK_MINOR_VER=8
......
...@@ -6690,7 +6690,7 @@ void MacroAssembler::string_compare(Register str1, Register str2, ...@@ -6690,7 +6690,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
subl(cnt2, stride2); subl(cnt2, stride2);
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
// clean upper bits of YMM registers // clean upper bits of YMM registers
vzeroupper(); vpxor(vec1, vec1);
// compare wide vectors tail // compare wide vectors tail
bind(COMPARE_WIDE_TAIL); bind(COMPARE_WIDE_TAIL);
...@@ -6705,7 +6705,7 @@ void MacroAssembler::string_compare(Register str1, Register str2, ...@@ -6705,7 +6705,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
// Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
bind(VECTOR_NOT_EQUAL); bind(VECTOR_NOT_EQUAL);
// clean upper bits of YMM registers // clean upper bits of YMM registers
vzeroupper(); vpxor(vec1, vec1);
lea(str1, Address(str1, result, scale)); lea(str1, Address(str1, result, scale));
lea(str2, Address(str2, result, scale)); lea(str2, Address(str2, result, scale));
jmp(COMPARE_16_CHARS); jmp(COMPARE_16_CHARS);
...@@ -6964,7 +6964,8 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist ...@@ -6964,7 +6964,8 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist
bind(DONE); bind(DONE);
if (UseAVX >= 2) { if (UseAVX >= 2) {
// clean upper bits of YMM registers // clean upper bits of YMM registers
vzeroupper(); vpxor(vec1, vec1);
vpxor(vec2, vec2);
} }
} }
...@@ -7098,7 +7099,8 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned, ...@@ -7098,7 +7099,8 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
BIND(L_check_fill_8_bytes); BIND(L_check_fill_8_bytes);
// clean upper bits of YMM registers // clean upper bits of YMM registers
vzeroupper(); movdl(xtmp, value);
pshufd(xtmp, xtmp, 0);
} else { } else {
// Fill 32-byte chunks // Fill 32-byte chunks
pshufd(xtmp, xtmp, 0); pshufd(xtmp, xtmp, 0);
...@@ -7261,7 +7263,11 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, ...@@ -7261,7 +7263,11 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
bind(L_copy_16_chars_exit); bind(L_copy_16_chars_exit);
if (UseAVX >= 2) { if (UseAVX >= 2) {
// clean upper bits of YMM registers // clean upper bits of YMM registers
vzeroupper(); vpxor(tmp2Reg, tmp2Reg);
vpxor(tmp3Reg, tmp3Reg);
vpxor(tmp4Reg, tmp4Reg);
movdl(tmp1Reg, tmp5);
pshufd(tmp1Reg, tmp1Reg, 0);
} }
subptr(len, 8); subptr(len, 8);
jccb(Assembler::greater, L_copy_8_chars_exit); jccb(Assembler::greater, L_copy_8_chars_exit);
......
...@@ -837,7 +837,8 @@ class StubGenerator: public StubCodeGenerator { ...@@ -837,7 +837,8 @@ class StubGenerator: public StubCodeGenerator {
if (UseUnalignedLoadStores && (UseAVX >= 2)) { if (UseUnalignedLoadStores && (UseAVX >= 2)) {
// clean upper bits of YMM registers // clean upper bits of YMM registers
__ vzeroupper(); __ vpxor(xmm0, xmm0);
__ vpxor(xmm1, xmm1);
} }
__ addl(qword_count, 8); __ addl(qword_count, 8);
__ jccb(Assembler::zero, L_exit); __ jccb(Assembler::zero, L_exit);
......
...@@ -1328,7 +1328,8 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1328,7 +1328,8 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_end); __ BIND(L_end);
if (UseAVX >= 2) { if (UseAVX >= 2) {
// clean upper bits of YMM registers // clean upper bits of YMM registers
__ vzeroupper(); __ vpxor(xmm0, xmm0);
__ vpxor(xmm1, xmm1);
} }
} else { } else {
// Copy 32-bytes per iteration // Copy 32-bytes per iteration
...@@ -1405,7 +1406,8 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1405,7 +1406,8 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_end); __ BIND(L_end);
if (UseAVX >= 2) { if (UseAVX >= 2) {
// clean upper bits of YMM registers // clean upper bits of YMM registers
__ vzeroupper(); __ vpxor(xmm0, xmm0);
__ vpxor(xmm1, xmm1);
} }
} else { } else {
// Copy 32-bytes per iteration // Copy 32-bytes per iteration
......
...@@ -232,6 +232,13 @@ void SuperWord::find_adjacent_refs() { ...@@ -232,6 +232,13 @@ void SuperWord::find_adjacent_refs() {
// if unaligned memory access is not allowed because number of // if unaligned memory access is not allowed because number of
// iterations in pre-loop will be not enough to align it. // iterations in pre-loop will be not enough to align it.
create_pack = false; create_pack = false;
} else {
SWPointer p2(best_align_to_mem_ref, this);
if (align_to_ref_p.invar() != p2.invar()) {
// Do not vectorize memory accesses with different invariants
// if unaligned memory accesses are not allowed.
create_pack = false;
}
} }
} }
} else { } else {
...@@ -445,29 +452,57 @@ bool SuperWord::ref_is_alignable(SWPointer& p) { ...@@ -445,29 +452,57 @@ bool SuperWord::ref_is_alignable(SWPointer& p) {
int preloop_stride = pre_end->stride_con(); int preloop_stride = pre_end->stride_con();
int span = preloop_stride * p.scale_in_bytes(); int span = preloop_stride * p.scale_in_bytes();
int mem_size = p.memory_size();
// Stride one accesses are alignable. int offset = p.offset_in_bytes();
if (ABS(span) == p.memory_size()) // Stride one accesses are alignable if offset is aligned to memory operation size.
// Offset can be unaligned when UseUnalignedAccesses is used.
if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
return true; return true;
}
// If initial offset from start of object is computable, // If the initial offset from start of the object is computable,
// compute alignment within the vector. // check if the pre-loop can align the final offset accordingly.
//
// In other words: Can we find an i such that the offset
// after i pre-loop iterations is aligned to vw?
// (init_offset + pre_loop) % vw == 0 (1)
// where
// pre_loop = i * span
// is the number of bytes added to the offset by i pre-loop iterations.
//
// For this to hold we need pre_loop to increase init_offset by
// pre_loop = vw - (init_offset % vw)
//
// This is only possible if pre_loop is divisible by span because each
// pre-loop iteration increases the initial offset by 'span' bytes:
// (vw - (init_offset % vw)) % span == 0
//
int vw = vector_width_in_bytes(p.mem()); int vw = vector_width_in_bytes(p.mem());
assert(vw > 1, "sanity"); assert(vw > 1, "sanity");
if (vw % span == 0) {
Node* init_nd = pre_end->init_trip(); Node* init_nd = pre_end->init_trip();
if (init_nd->is_Con() && p.invar() == NULL) { if (init_nd->is_Con() && p.invar() == NULL) {
int init = init_nd->bottom_type()->is_int()->get_con(); int init = init_nd->bottom_type()->is_int()->get_con();
int init_offset = init * p.scale_in_bytes() + offset;
int init_offset = init * p.scale_in_bytes() + p.offset_in_bytes();
assert(init_offset >= 0, "positive offset from object start"); assert(init_offset >= 0, "positive offset from object start");
if (vw % span == 0) {
// If vw is a multiple of span, we use formula (1).
if (span > 0) { if (span > 0) {
return (vw - (init_offset % vw)) % span == 0; return (vw - (init_offset % vw)) % span == 0;
} else { } else {
assert(span < 0, "nonzero stride * scale"); assert(span < 0, "nonzero stride * scale");
return (init_offset % vw) % -span == 0; return (init_offset % vw) % -span == 0;
} }
} else if (span % vw == 0) {
// If span is a multiple of vw, we can simplify formula (1) to:
// (init_offset + i * span) % vw == 0
// =>
// (init_offset % vw) + ((i * span) % vw) == 0
// =>
// init_offset % vw == 0
//
// Because we add a multiple of vw to the initial offset, the final
// offset is a multiple of vw if and only if init_offset is a multiple.
//
return (init_offset % vw) == 0;
} }
} }
return false; return false;
...@@ -479,17 +514,23 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) { ...@@ -479,17 +514,23 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
SWPointer align_to_ref_p(mem_ref, this); SWPointer align_to_ref_p(mem_ref, this);
int offset = align_to_ref_p.offset_in_bytes(); int offset = align_to_ref_p.offset_in_bytes();
int scale = align_to_ref_p.scale_in_bytes(); int scale = align_to_ref_p.scale_in_bytes();
int elt_size = align_to_ref_p.memory_size();
int vw = vector_width_in_bytes(mem_ref); int vw = vector_width_in_bytes(mem_ref);
assert(vw > 1, "sanity"); assert(vw > 1, "sanity");
int iv_adjustment;
if (scale != 0) {
int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
// At least one iteration is executed in pre-loop by default. As result // At least one iteration is executed in pre-loop by default. As result
// several iterations are needed to align memory operations in main-loop even // several iterations are needed to align memory operations in main-loop even
// if offset is 0. // if offset is 0.
int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
int elt_size = align_to_ref_p.memory_size();
assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
int iv_adjustment = iv_adjustment_in_bytes/elt_size; iv_adjustment = iv_adjustment_in_bytes/elt_size;
} else {
// This memory op is not dependent on iv (scale == 0)
iv_adjustment = 0;
}
#ifndef PRODUCT #ifndef PRODUCT
if (TraceSuperWord) if (TraceSuperWord)
...@@ -2247,6 +2288,11 @@ SWPointer::SWPointer(MemNode* mem, SuperWord* slp) : ...@@ -2247,6 +2288,11 @@ SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
} }
// Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant) // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
Node* base = adr->in(AddPNode::Base); Node* base = adr->in(AddPNode::Base);
// The base address should be loop invariant
if (!invariant(base)) {
assert(!valid(), "base address is loop variant");
return;
}
//unsafe reference could not be aligned appropriately without runtime checking //unsafe reference could not be aligned appropriately without runtime checking
if (base == NULL || base->bottom_type() == Type::TOP) { if (base == NULL || base->bottom_type() == Type::TOP) {
assert(!valid(), "unsafe access"); assert(!valid(), "unsafe access");
......
...@@ -41,7 +41,7 @@ ...@@ -41,7 +41,7 @@
// Exploiting SuperWord Level Parallelism with // Exploiting SuperWord Level Parallelism with
// Multimedia Instruction Sets // Multimedia Instruction Sets
// by // by
// Samuel Larsen and Saman Amarasighe // Samuel Larsen and Saman Amarasinghe
// MIT Laboratory for Computer Science // MIT Laboratory for Computer Science
// date // date
// May 2000 // May 2000
...@@ -432,7 +432,7 @@ class SWPointer VALUE_OBJ_CLASS_SPEC { ...@@ -432,7 +432,7 @@ class SWPointer VALUE_OBJ_CLASS_SPEC {
Node* _base; // NULL if unsafe nonheap reference Node* _base; // NULL if unsafe nonheap reference
Node* _adr; // address pointer Node* _adr; // address pointer
jint _scale; // multipler for iv (in bytes), 0 if no loop iv jint _scale; // multiplier for iv (in bytes), 0 if no loop iv
jint _offset; // constant offset (in bytes) jint _offset; // constant offset (in bytes)
Node* _invar; // invariant offset (in bytes), NULL if none Node* _invar; // invariant offset (in bytes), NULL if none
bool _negate_invar; // if true then use: (0 - _invar) bool _negate_invar; // if true then use: (0 - _invar)
......
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
import com.oracle.java.testlibrary.*;
import sun.misc.Unsafe;
/**
* @test
* @bug 8078497
* @summary Tests correct alignment of vectors with loop invariant offset.
* @library /testlibrary
* @run main TestVectorizationWithInvariant
*/
// Stress test that repeatedly runs four small copy loops built from
// hand-unrolled Unsafe.putChar/getChar and array accesses. The loops are
// shaped so that C2's superword (auto-vectorization) pass considers them;
// the test has no explicit assertions and passes if main() completes.
// NOTE(review): presumably a miscompiled loop shows up as a VM crash or
// assertion failure rather than a wrong value -- confirm against the bug report.
public class TestVectorizationWithInvariant {
// Unsafe instance obtained from the test library; used for raw byte-offset accesses.
private static Unsafe unsafe;
// Offset of element 0 within a byte[] / char[], as reported by Unsafe.arrayBaseOffset.
private static final long BYTE_ARRAY_OFFSET;
private static final long CHAR_ARRAY_OFFSET;
static {
unsafe = Utils.getUnsafe();
BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class);
CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class);
}
/*
 * Entry point: allocates three 1000-element arrays and invokes every copy
 * variant 20,000 times with the same arguments.
 * NOTE(review): the iteration count is presumably chosen to make the methods
 * hot enough for C2 compilation -- confirm against the VM's compile thresholds.
 */
public static void main(String[] args) throws Exception {
byte[] byte_array1 = new byte[1000];
byte[] byte_array2 = new byte[1000];
char[] char_array = new char[1000];
for (int i = 0; i < 20_000; ++i) {
copyByteToChar(byte_array1, byte_array2, char_array, 1);
copyCharToByte(char_array, byte_array1, 1);
copyCharToByteAligned(char_array, byte_array1);
copyCharToByteUnaligned(char_array, byte_array1);
}
}
/*
 * Copy multiple consecutive chars from a byte array to a given offset in a char array
 * to trigger C2's superword optimization. The offset in the byte array is independent
 * of the loop induction variable and can be set to an arbitrary value. It may then not
 * be possible to both align the LoadUS and the StoreC operations. Therefore, vectorization
 * should only be done in this case if unaligned memory accesses are allowed.
 *
 * src1, src2: byte arrays that are read alternately, one per iteration.
 * dst:        char array written through Unsafe at byte offset 'i'.
 * off:        char index into the source; converted below to a byte offset
 *             (array base offset + off * 2).
 */
public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) {
// From here on 'off' is an absolute byte offset for Unsafe, not an element index.
off = (int) BYTE_ARRAY_OFFSET + (off << 1);
byte[] src = src1;
// 'i' is a byte offset into dst: it starts at the char[] base offset and
// advances by 8 bytes per iteration, while each iteration writes 16 bytes.
for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) {
// Copy 8 chars from src to dst
unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0));
unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2));
unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4));
unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6));
unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8));
unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10));
unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12));
unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14));
// Prevent loop invariant code motion of char read.
// (The source array is toggled between src1 and src2 on every iteration.)
src = (src == src1) ? src2 : src1;
}
}
/*
 * Copy multiple consecutive chars from a char array to a given offset in a byte array
 * to trigger C2's superword optimization. Checks for similar problems as 'copyByteToChar'.
 *
 * src: char array read with ordinary indexed loads src[i .. i+7].
 * dst: byte array written through Unsafe; every iteration stores to the same
 *      16 bytes starting at 'off', so the store address does not depend on 'i'.
 * off: char index into dst; converted below to a byte offset
 *      (array base offset + off * 2).
 */
public static void copyCharToByte(char[] src, byte[] dst, int off) {
// From here on 'off' is an absolute byte offset for Unsafe, not an element index.
off = (int) BYTE_ARRAY_OFFSET + (off << 1);
for (int i = 0; i < 100; i = i + 8) {
// Copy 8 chars from src to dst
unsafe.putChar(dst, off + 0, src[i + 0]);
unsafe.putChar(dst, off + 2, src[i + 1]);
unsafe.putChar(dst, off + 4, src[i + 2]);
unsafe.putChar(dst, off + 6, src[i + 3]);
unsafe.putChar(dst, off + 8, src[i + 4]);
unsafe.putChar(dst, off + 10, src[i + 5]);
unsafe.putChar(dst, off + 12, src[i + 6]);
unsafe.putChar(dst, off + 14, src[i + 7]);
}
}
/*
 * Variant of copyCharToByte with a constant destination array offset.
 * The loop should always be vectorized because both the LoadUS and StoreC
 * operations can be aligned.
 *
 * The destination offset is exactly the byte[] base offset, and the loop
 * starts reading src at index 8 (not 0) with a stride of 8 chars.
 */
public static void copyCharToByteAligned(char[] src, byte[] dst) {
final int off = (int) BYTE_ARRAY_OFFSET;
for (int i = 8; i < 100; i = i + 8) {
// Copy 8 chars from src to dst
unsafe.putChar(dst, off + 0, src[i + 0]);
unsafe.putChar(dst, off + 2, src[i + 1]);
unsafe.putChar(dst, off + 4, src[i + 2]);
unsafe.putChar(dst, off + 6, src[i + 3]);
unsafe.putChar(dst, off + 8, src[i + 4]);
unsafe.putChar(dst, off + 10, src[i + 5]);
unsafe.putChar(dst, off + 12, src[i + 6]);
unsafe.putChar(dst, off + 14, src[i + 7]);
}
}
/*
 * Variant of copyCharToByte with a constant destination array offset. The
 * loop should only be vectorized if unaligned memory operations are allowed
 * because the LoadUS and the StoreC cannot both be aligned.
 *
 * The destination offset is the byte[] base offset plus 2 bytes (one char),
 * and the loop reads src starting at index 0 with a stride of 8 chars.
 */
public static void copyCharToByteUnaligned(char[] src, byte[] dst) {
final int off = (int) BYTE_ARRAY_OFFSET + 2;
for (int i = 0; i < 100; i = i + 8) {
// Copy 8 chars from src to dst
unsafe.putChar(dst, off + 0, src[i + 0]);
unsafe.putChar(dst, off + 2, src[i + 1]);
unsafe.putChar(dst, off + 4, src[i + 2]);
unsafe.putChar(dst, off + 6, src[i + 3]);
unsafe.putChar(dst, off + 8, src[i + 4]);
unsafe.putChar(dst, off + 10, src[i + 5]);
unsafe.putChar(dst, off + 12, src[i + 6]);
unsafe.putChar(dst, off + 14, src[i + 7]);
}
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册