提交 d1681e4e 编写于 作者: K kvn

8005419: Improve intrinsics code performance on x86 by using AVX2

Summary: use 256bit vpxor,vptest instructions in String.compareTo() and equals() intrinsics.
Reviewed-by: twisti
上级 1d1340ad
...@@ -2468,6 +2468,26 @@ void Assembler::ptest(XMMRegister dst, XMMRegister src) { ...@@ -2468,6 +2468,26 @@ void Assembler::ptest(XMMRegister dst, XMMRegister src) {
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
void Assembler::vptest(XMMRegister dst, Address src) {
assert(VM_Version::supports_avx(), "");
InstructionMark im(this);
bool vector256 = true;
assert(dst != xnoreg, "sanity");
int dst_enc = dst->encoding();
// swap src<->dst for encoding
vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
emit_int8(0x17);
emit_operand(dst, src);
}
void Assembler::vptest(XMMRegister dst, XMMRegister src) {
assert(VM_Version::supports_avx(), "");
bool vector256 = true;
int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
emit_int8(0x17);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::punpcklbw(XMMRegister dst, Address src) { void Assembler::punpcklbw(XMMRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
......
...@@ -1444,9 +1444,12 @@ private: ...@@ -1444,9 +1444,12 @@ private:
// Shift Right by bytes Logical DoubleQuadword Immediate // Shift Right by bytes Logical DoubleQuadword Immediate
void psrldq(XMMRegister dst, int shift); void psrldq(XMMRegister dst, int shift);
// Logical Compare Double Quadword // Logical Compare 128bit
void ptest(XMMRegister dst, XMMRegister src); void ptest(XMMRegister dst, XMMRegister src);
void ptest(XMMRegister dst, Address src); void ptest(XMMRegister dst, Address src);
// Logical Compare 256bit
void vptest(XMMRegister dst, XMMRegister src);
void vptest(XMMRegister dst, Address src);
// Interleave Low Bytes // Interleave Low Bytes
void punpcklbw(XMMRegister dst, XMMRegister src); void punpcklbw(XMMRegister dst, XMMRegister src);
......
...@@ -5675,42 +5675,114 @@ void MacroAssembler::string_compare(Register str1, Register str2, ...@@ -5675,42 +5675,114 @@ void MacroAssembler::string_compare(Register str1, Register str2,
testl(cnt2, cnt2); testl(cnt2, cnt2);
jcc(Assembler::zero, LENGTH_DIFF_LABEL); jcc(Assembler::zero, LENGTH_DIFF_LABEL);
// Load first characters // Compare first characters
load_unsigned_short(result, Address(str1, 0)); load_unsigned_short(result, Address(str1, 0));
load_unsigned_short(cnt1, Address(str2, 0)); load_unsigned_short(cnt1, Address(str2, 0));
// Compare first characters
subl(result, cnt1); subl(result, cnt1);
jcc(Assembler::notZero, POP_LABEL); jcc(Assembler::notZero, POP_LABEL);
decrementl(cnt2); cmpl(cnt2, 1);
jcc(Assembler::zero, LENGTH_DIFF_LABEL); jcc(Assembler::equal, LENGTH_DIFF_LABEL);
{ // Check if the strings start at the same location.
// Check after comparing first character to see if strings are equivalent cmpptr(str1, str2);
Label LSkip2; jcc(Assembler::equal, LENGTH_DIFF_LABEL);
// Check if the strings start at same location
cmpptr(str1, str2);
jccb(Assembler::notEqual, LSkip2);
// Check if the length difference is zero (from stack)
cmpl(Address(rsp, 0), 0x0);
jcc(Assembler::equal, LENGTH_DIFF_LABEL);
// Strings might not be equivalent
bind(LSkip2);
}
Address::ScaleFactor scale = Address::times_2; Address::ScaleFactor scale = Address::times_2;
int stride = 8; int stride = 8;
// Advance to next element if (UseAVX >= 2) {
addptr(str1, 16/stride); Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
addptr(str2, 16/stride); Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
Label COMPARE_TAIL_LONG;
int pcmpmask = 0x19;
// Setup to compare 16-chars (32-bytes) vectors,
// start from first character again because it has aligned address.
int stride2 = 16;
int adr_stride = stride << scale;
int adr_stride2 = stride2 << scale;
if (UseSSE42Intrinsics) { assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
// rax and rdx are used by pcmpestri as elements counters
movl(result, cnt2);
andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
jcc(Assembler::zero, COMPARE_TAIL_LONG);
// fast path : compare first 2 8-char vectors.
bind(COMPARE_16_CHARS);
movdqu(vec1, Address(str1, 0));
pcmpestri(vec1, Address(str2, 0), pcmpmask);
jccb(Assembler::below, COMPARE_INDEX_CHAR);
movdqu(vec1, Address(str1, adr_stride));
pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
addl(cnt1, stride);
// Compare the characters at index in cnt1
bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
load_unsigned_short(result, Address(str1, cnt1, scale));
load_unsigned_short(cnt2, Address(str2, cnt1, scale));
subl(result, cnt2);
jmp(POP_LABEL);
// Setup the registers to start vector comparison loop
bind(COMPARE_WIDE_VECTORS);
lea(str1, Address(str1, result, scale));
lea(str2, Address(str2, result, scale));
subl(result, stride2);
subl(cnt2, stride2);
jccb(Assembler::zero, COMPARE_WIDE_TAIL);
negptr(result);
// In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
bind(COMPARE_WIDE_VECTORS_LOOP);
vmovdqu(vec1, Address(str1, result, scale));
vpxor(vec1, Address(str2, result, scale));
vptest(vec1, vec1);
jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
addptr(result, stride2);
subl(cnt2, stride2);
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
// compare wide vectors tail
bind(COMPARE_WIDE_TAIL);
testptr(result, result);
jccb(Assembler::zero, LENGTH_DIFF_LABEL);
movl(result, stride2);
movl(cnt2, result);
negptr(result);
jmpb(COMPARE_WIDE_VECTORS_LOOP);
// Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
bind(VECTOR_NOT_EQUAL);
lea(str1, Address(str1, result, scale));
lea(str2, Address(str2, result, scale));
jmp(COMPARE_16_CHARS);
// Compare tail chars, length between 1 to 15 chars
bind(COMPARE_TAIL_LONG);
movl(cnt2, result);
cmpl(cnt2, stride);
jccb(Assembler::less, COMPARE_SMALL_STR);
movdqu(vec1, Address(str1, 0));
pcmpestri(vec1, Address(str2, 0), pcmpmask);
jcc(Assembler::below, COMPARE_INDEX_CHAR);
subptr(cnt2, stride);
jccb(Assembler::zero, LENGTH_DIFF_LABEL);
lea(str1, Address(str1, result, scale));
lea(str2, Address(str2, result, scale));
negptr(cnt2);
jmpb(WHILE_HEAD_LABEL);
bind(COMPARE_SMALL_STR);
} else if (UseSSE42Intrinsics) {
Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
int pcmpmask = 0x19; int pcmpmask = 0x19;
// Setup to compare 16-byte vectors // Setup to compare 8-char (16-byte) vectors,
// start from first character again because it has aligned address.
movl(result, cnt2); movl(result, cnt2);
andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
jccb(Assembler::zero, COMPARE_TAIL); jccb(Assembler::zero, COMPARE_TAIL);
...@@ -5742,7 +5814,7 @@ void MacroAssembler::string_compare(Register str1, Register str2, ...@@ -5742,7 +5814,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
// compare wide vectors tail // compare wide vectors tail
testl(result, result); testptr(result, result);
jccb(Assembler::zero, LENGTH_DIFF_LABEL); jccb(Assembler::zero, LENGTH_DIFF_LABEL);
movl(cnt2, stride); movl(cnt2, stride);
...@@ -5754,21 +5826,20 @@ void MacroAssembler::string_compare(Register str1, Register str2, ...@@ -5754,21 +5826,20 @@ void MacroAssembler::string_compare(Register str1, Register str2,
// Mismatched characters in the vectors // Mismatched characters in the vectors
bind(VECTOR_NOT_EQUAL); bind(VECTOR_NOT_EQUAL);
addptr(result, cnt1); addptr(cnt1, result);
movptr(cnt2, result); load_unsigned_short(result, Address(str1, cnt1, scale));
load_unsigned_short(result, Address(str1, cnt2, scale)); load_unsigned_short(cnt2, Address(str2, cnt1, scale));
load_unsigned_short(cnt1, Address(str2, cnt2, scale)); subl(result, cnt2);
subl(result, cnt1);
jmpb(POP_LABEL); jmpb(POP_LABEL);
bind(COMPARE_TAIL); // limit is zero bind(COMPARE_TAIL); // limit is zero
movl(cnt2, result); movl(cnt2, result);
// Fallthru to tail compare // Fallthru to tail compare
} }
// Shift str2 and str1 to the end of the arrays, negate min // Shift str2 and str1 to the end of the arrays, negate min
lea(str1, Address(str1, cnt2, scale, 0)); lea(str1, Address(str1, cnt2, scale));
lea(str2, Address(str2, cnt2, scale, 0)); lea(str2, Address(str2, cnt2, scale));
decrementl(cnt2); // first character was compared already
negptr(cnt2); negptr(cnt2);
// Compare the rest of the elements // Compare the rest of the elements
...@@ -5833,7 +5904,44 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist ...@@ -5833,7 +5904,44 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist
shll(limit, 1); // byte count != 0 shll(limit, 1); // byte count != 0
movl(result, limit); // copy movl(result, limit); // copy
if (UseSSE42Intrinsics) { if (UseAVX >= 2) {
// With AVX2, use 32-byte vector compare
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
// Compare 32-byte vectors
andl(result, 0x0000001e); // tail count (in bytes)
andl(limit, 0xffffffe0); // vector count (in bytes)
jccb(Assembler::zero, COMPARE_TAIL);
lea(ary1, Address(ary1, limit, Address::times_1));
lea(ary2, Address(ary2, limit, Address::times_1));
negptr(limit);
bind(COMPARE_WIDE_VECTORS);
vmovdqu(vec1, Address(ary1, limit, Address::times_1));
vmovdqu(vec2, Address(ary2, limit, Address::times_1));
vpxor(vec1, vec2);
vptest(vec1, vec1);
jccb(Assembler::notZero, FALSE_LABEL);
addptr(limit, 32);
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
testl(result, result);
jccb(Assembler::zero, TRUE_LABEL);
vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
vpxor(vec1, vec2);
vptest(vec1, vec1);
jccb(Assembler::notZero, FALSE_LABEL);
jmpb(TRUE_LABEL);
bind(COMPARE_TAIL); // limit is zero
movl(limit, result);
// Fallthru to tail compare
} else if (UseSSE42Intrinsics) {
// With SSE4.2, use double quad vector compare // With SSE4.2, use double quad vector compare
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
......
...@@ -1011,6 +1011,10 @@ public: ...@@ -1011,6 +1011,10 @@ public:
Assembler::vxorpd(dst, nds, src, vector256); Assembler::vxorpd(dst, nds, src, vector256);
} }
// Simple version for AVX2 256bit vectors
void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); }
void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); }
// Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector. // Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector.
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
if (UseAVX > 1) // vinserti128h is available only in AVX2 if (UseAVX > 1) // vinserti128h is available only in AVX2
......
/*
* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 8005419
* @summary Improve intrinsics code performance on x86 by using AVX2
* @run main/othervm -Xbatch -Xmx64m Test8005419
*
*/
public class Test8005419 {
public static int SIZE = 64;
public static void main(String[] args) {
char[] a = new char[SIZE];
char[] b = new char[SIZE];
for (int i = 16; i < SIZE; i++) {
a[i] = (char)i;
b[i] = (char)i;
}
String s1 = new String(a);
String s2 = new String(b);
// Warm up
boolean failed = false;
int result = 0;
for (int i = 0; i < 10000; i++) {
result += test(s1, s2);
}
for (int i = 0; i < 10000; i++) {
result += test(s1, s2);
}
for (int i = 0; i < 10000; i++) {
result += test(s1, s2);
}
if (result != 0) failed = true;
System.out.println("Start testing");
// Compare same string
result = test(s1, s1);
if (result != 0) {
failed = true;
System.out.println("Failed same: result = " + result + ", expected 0");
}
// Compare equal strings
for (int i = 1; i <= SIZE; i++) {
s1 = new String(a, 0, i);
s2 = new String(b, 0, i);
result = test(s1, s2);
if (result != 0) {
failed = true;
System.out.println("Failed equals s1[" + i + "], s2[" + i + "]: result = " + result + ", expected 0");
}
}
// Compare equal strings but different sizes
for (int i = 1; i <= SIZE; i++) {
s1 = new String(a, 0, i);
for (int j = 1; j <= SIZE; j++) {
s2 = new String(b, 0, j);
result = test(s1, s2);
if (result != (i-j)) {
failed = true;
System.out.println("Failed diff size s1[" + i + "], s2[" + j + "]: result = " + result + ", expected " + (i-j));
}
}
}
// Compare strings with one char different and different sizes
for (int i = 1; i <= SIZE; i++) {
s1 = new String(a, 0, i);
for (int j = 0; j < i; j++) {
b[j] -= 3; // change char
s2 = new String(b, 0, i);
result = test(s1, s2);
int chdiff = a[j] - b[j];
if (result != chdiff) {
failed = true;
System.out.println("Failed diff char s1[" + i + "], s2[" + i + "]: result = " + result + ", expected " + chdiff);
}
result = test(s2, s1);
chdiff = b[j] - a[j];
if (result != chdiff) {
failed = true;
System.out.println("Failed diff char s2[" + i + "], s1[" + i + "]: result = " + result + ", expected " + chdiff);
}
b[j] += 3; // restore
}
}
if (failed) {
System.out.println("FAILED");
System.exit(97);
}
System.out.println("PASSED");
}
private static int test(String str1, String str2) {
return str1.compareTo(str2);
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册