Commit 510019d2 authored by amurillo

Merge

......@@ -641,3 +641,4 @@ ced08ed4924fc6581626c7ce2d769fc18d7b23e0 jdk8u60-b13
c9f8b7319d0a5ab07310cf53507642a8fd91589b jdk8u60-b14
4187dc92e90b16b4097627b8af4f5e6e63f3b497 hs25.60-b15
b99f1bf208f385277b03a985d35b6614b4095f3e jdk8u60-b15
f5800068c61d0627c14e99836e9ce5cf0ef00075 hs25.60-b16
......@@ -35,7 +35,7 @@ HOTSPOT_VM_COPYRIGHT=Copyright 2015
HS_MAJOR_VER=25
HS_MINOR_VER=60
HS_BUILD_NUMBER=15
HS_BUILD_NUMBER=16
JDK_MAJOR_VER=1
JDK_MINOR_VER=8
......
......@@ -6690,7 +6690,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
subl(cnt2, stride2);
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
// clean upper bits of YMM registers
vzeroupper();
vpxor(vec1, vec1);
// compare wide vectors tail
bind(COMPARE_WIDE_TAIL);
......@@ -6705,7 +6705,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
// Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
bind(VECTOR_NOT_EQUAL);
// clean upper bits of YMM registers
vzeroupper();
vpxor(vec1, vec1);
lea(str1, Address(str1, result, scale));
lea(str2, Address(str2, result, scale));
jmp(COMPARE_16_CHARS);
......@@ -6964,7 +6964,8 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist
bind(DONE);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
vzeroupper();
vpxor(vec1, vec1);
vpxor(vec2, vec2);
}
}
......@@ -7098,7 +7099,8 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
BIND(L_check_fill_8_bytes);
// clean upper bits of YMM registers
vzeroupper();
movdl(xtmp, value);
pshufd(xtmp, xtmp, 0);
} else {
// Fill 32-byte chunks
pshufd(xtmp, xtmp, 0);
......@@ -7261,7 +7263,11 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
bind(L_copy_16_chars_exit);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
vzeroupper();
vpxor(tmp2Reg, tmp2Reg);
vpxor(tmp3Reg, tmp3Reg);
vpxor(tmp4Reg, tmp4Reg);
movdl(tmp1Reg, tmp5);
pshufd(tmp1Reg, tmp1Reg, 0);
}
subptr(len, 8);
jccb(Assembler::greater, L_copy_8_chars_exit);
......
......@@ -837,7 +837,8 @@ class StubGenerator: public StubCodeGenerator {
if (UseUnalignedLoadStores && (UseAVX >= 2)) {
// clean upper bits of YMM registers
__ vzeroupper();
__ vpxor(xmm0, xmm0);
__ vpxor(xmm1, xmm1);
}
__ addl(qword_count, 8);
__ jccb(Assembler::zero, L_exit);
......
......@@ -1328,7 +1328,8 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_end);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
__ vzeroupper();
__ vpxor(xmm0, xmm0);
__ vpxor(xmm1, xmm1);
}
} else {
// Copy 32-bytes per iteration
......@@ -1405,7 +1406,8 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_end);
if (UseAVX >= 2) {
// clean upper bits of YMM registers
__ vzeroupper();
__ vpxor(xmm0, xmm0);
__ vpxor(xmm1, xmm1);
}
} else {
// Copy 32-bytes per iteration
......
......@@ -232,6 +232,13 @@ void SuperWord::find_adjacent_refs() {
// if unaligned memory access is not allowed because the number of
// iterations in the pre-loop will not be enough to align it.
create_pack = false;
} else {
SWPointer p2(best_align_to_mem_ref, this);
if (align_to_ref_p.invar() != p2.invar()) {
// Do not vectorize memory accesses with different invariants
// if unaligned memory accesses are not allowed.
create_pack = false;
}
}
}
} else {
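(Editor's note on the new else-branch above: the pre-loop advances every reference by the same number of bytes per iteration, so a single pre-loop trip count can change all offsets only by the same amount. If two references have different loop-invariant offsets, at most one of them can be brought to a vector-width boundary unless the invariants are congruent modulo the vector width. A minimal standalone sketch of that arithmetic, not HotSpot code; the values of vw, span and the invariant offsets are illustrative:)

#include <cstdio>

int main() {
  const int vw = 32;      // vector width in bytes (e.g. AVX2)
  const int span = 2;     // bytes added per pre-loop iteration (char access)
  const int invar_a = 0;  // invariant offset of the first reference
  const int invar_b = 6;  // invariant offset of the second reference

  for (int i = 0; i < vw / span; i++) {
    int off_a = (invar_a + i * span) % vw;
    int off_b = (invar_b + i * span) % vw;
    if (off_a == 0 && off_b == 0) {
      printf("both aligned after %d pre-loop iterations\n", i);
      return 0;
    }
  }
  printf("no pre-loop trip count aligns both references\n");
  return 0;
}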
......@@ -445,29 +452,57 @@ bool SuperWord::ref_is_alignable(SWPointer& p) {
int preloop_stride = pre_end->stride_con();
int span = preloop_stride * p.scale_in_bytes();
// Stride one accesses are alignable.
if (ABS(span) == p.memory_size())
int mem_size = p.memory_size();
int offset = p.offset_in_bytes();
// Stride one accesses are alignable if offset is aligned to memory operation size.
// Offset can be unaligned when UseUnalignedAccesses is used.
if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
return true;
// If initial offset from start of object is computable,
// compute alignment within the vector.
}
// If the initial offset from start of the object is computable,
// check if the pre-loop can align the final offset accordingly.
//
// In other words: Can we find an i such that the offset
// after i pre-loop iterations is aligned to vw?
// (init_offset + pre_loop) % vw == 0 (1)
// where
// pre_loop = i * span
// is the number of bytes added to the offset by i pre-loop iterations.
//
// For this to hold we need pre_loop to increase init_offset by
// pre_loop = vw - (init_offset % vw)
//
// This is only possible if pre_loop is divisible by span because each
// pre-loop iteration increases the initial offset by 'span' bytes:
// (vw - (init_offset % vw)) % span == 0
//
int vw = vector_width_in_bytes(p.mem());
assert(vw > 1, "sanity");
if (vw % span == 0) {
Node* init_nd = pre_end->init_trip();
if (init_nd->is_Con() && p.invar() == NULL) {
int init = init_nd->bottom_type()->is_int()->get_con();
int init_offset = init * p.scale_in_bytes() + p.offset_in_bytes();
assert(init_offset >= 0, "positive offset from object start");
Node* init_nd = pre_end->init_trip();
if (init_nd->is_Con() && p.invar() == NULL) {
int init = init_nd->bottom_type()->is_int()->get_con();
int init_offset = init * p.scale_in_bytes() + offset;
assert(init_offset >= 0, "positive offset from object start");
if (vw % span == 0) {
// If vw is a multiple of span, we use formula (1).
if (span > 0) {
return (vw - (init_offset % vw)) % span == 0;
} else {
assert(span < 0, "nonzero stride * scale");
return (init_offset % vw) % -span == 0;
}
} else if (span % vw == 0) {
// If span is a multiple of vw, we can simplify formula (1) to:
// (init_offset + i * span) % vw == 0
// =>
// (init_offset % vw) + ((i * span) % vw) == 0
// =>
// init_offset % vw == 0
//
// Because we add a multiple of vw to the initial offset, the final
// offset is a multiple of vw if and only if init_offset is a multiple.
//
return (init_offset % vw) == 0;
}
}
return false;
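(Editor's note: the two cases in the comment above can be checked in isolation. Below is a minimal standalone sketch, not HotSpot code, that mirrors the formula with the same names vw, span and init_offset; the helper name alignable and all concrete numbers are illustrative, and span is assumed non-zero, i.e. scale != 0.)

#include <cstdio>

// Returns true if some pre-loop iteration count i makes
// (init_offset + i * span) a multiple of vw.
static bool alignable(int init_offset, int span, int vw) {
  if (vw % span == 0) {
    // Formula (1): the pre-loop must add exactly vw - (init_offset % vw)
    // bytes, and it can only add multiples of span.
    if (span > 0) {
      return (vw - (init_offset % vw)) % span == 0;
    }
    return (init_offset % vw) % -span == 0;
  }
  if (span % vw == 0) {
    // Every pre-loop iteration adds a multiple of vw, so the alignment of the
    // final offset is decided by init_offset alone.
    return (init_offset % vw) == 0;
  }
  return false;
}

int main() {
  // 2-byte (char) accesses, 32-byte AVX2 vectors:
  printf("%d\n", alignable(16, 2, 32));  // 1: 8 pre-loop iterations add 16 bytes
  printf("%d\n", alignable(17, 2, 32));  // 0: an odd offset never reaches a 32-byte boundary
  // span already a multiple of vw: only an already-aligned start works
  printf("%d\n", alignable(32, 64, 32)); // 1
  printf("%d\n", alignable(16, 64, 32)); // 0
  return 0;
}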
......@@ -479,17 +514,23 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
SWPointer align_to_ref_p(mem_ref, this);
int offset = align_to_ref_p.offset_in_bytes();
int scale = align_to_ref_p.scale_in_bytes();
int elt_size = align_to_ref_p.memory_size();
int vw = vector_width_in_bytes(mem_ref);
assert(vw > 1, "sanity");
int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
// At least one iteration is executed in the pre-loop by default. As a result,
// several iterations are needed to align memory operations in the main-loop even
// if offset is 0.
int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
int elt_size = align_to_ref_p.memory_size();
assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
int iv_adjustment = iv_adjustment_in_bytes/elt_size;
int iv_adjustment;
if (scale != 0) {
int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
// At least one iteration is executed in the pre-loop by default. As a result,
// several iterations are needed to align memory operations in the main-loop even
// if offset is 0.
int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
iv_adjustment = iv_adjustment_in_bytes/elt_size;
} else {
// This memory op is not dependent on iv (scale == 0)
iv_adjustment = 0;
}
#ifndef PRODUCT
if (TraceSuperWord)
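(Editor's note: the iv_adjustment computed above converts the byte distance to the next vector-width boundary into a number of induction-variable steps, and is now guarded against scale == 0. A standalone sketch, not HotSpot code; the function name and all numbers are illustrative:)

#include <cstdio>

static int iv_adjustment(int scale, int iv_stride, int offset, int elt_size, int vw) {
  if (scale == 0) {
    return 0;                       // access does not depend on the iv
  }
  int stride_sign = (scale * iv_stride) > 0 ? 1 : -1;
  int adj_bytes   = stride_sign * vw - (offset % vw);
  return adj_bytes / elt_size;      // convert bytes to iv steps (elements)
}

int main() {
  // char store, constant offset 16 from object start, 32-byte AVX2 vectors:
  printf("%d\n", iv_adjustment(2, 1, 16, 2, 32));  // (32 - 16) / 2 = 8 elements
  // offset independent of the iv (scale == 0): no adjustment needed
  printf("%d\n", iv_adjustment(0, 1, 16, 2, 32));  // 0
  return 0;
}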
......@@ -2247,6 +2288,11 @@ SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
}
// Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
Node* base = adr->in(AddPNode::Base);
// The base address should be loop invariant
if (!invariant(base)) {
assert(!valid(), "base address is loop variant");
return;
}
//unsafe reference could not be aligned appropriately without runtime checking
if (base == NULL || base->bottom_type() == Type::TOP) {
assert(!valid(), "unsafe access");
......
......@@ -41,7 +41,7 @@
// Exploiting SuperWord Level Parallelism with
// Multimedia Instruction Sets
// by
// Samuel Larsen and Saman Amarasighe
// Samuel Larsen and Saman Amarasinghe
// MIT Laboratory for Computer Science
// date
// May 2000
......@@ -432,7 +432,7 @@ class SWPointer VALUE_OBJ_CLASS_SPEC {
Node* _base; // NULL if unsafe nonheap reference
Node* _adr; // address pointer
jint _scale; // multipler for iv (in bytes), 0 if no loop iv
jint _scale; // multiplier for iv (in bytes), 0 if no loop iv
jint _offset; // constant offset (in bytes)
Node* _invar; // invariant offset (in bytes), NULL if none
bool _negate_invar; // if true then use: (0 - _invar)
......
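(Editor's note: the SWPointer fields above describe an address of the form base + invar + scale*iv + offset. A standalone illustration of that decomposition, not HotSpot code; the struct, addresses and values are made up:)

#include <cstdio>

// Mirrors the SWPointer fields: address = base + invar + scale * iv + offset.
struct PointerSketch {
  long base;          // address of the accessed object (0 for a non-heap access)
  int  scale;         // multiplier for the iv, in bytes (0 if iv-independent)
  int  offset;        // constant byte offset
  long invar;         // loop-invariant byte offset (0 if none)
  bool negate_invar;  // if true, subtract invar instead of adding it

  long address(int iv) const {
    long inv = negate_invar ? -invar : invar;
    return base + inv + (long)scale * iv + offset;
  }
};

int main() {
  // Something like unsafe.putChar(dst, off + 2 * i + 4, ...):
  PointerSketch p = { /*base*/ 0x1000, /*scale*/ 2, /*offset*/ 4,
                      /*invar*/ 16, /*negate_invar*/ false };
  printf("address at iv=0: 0x%lx\n", p.address(0));  // 0x1000 + 16 + 4
  printf("address at iv=8: 0x%lx\n", p.address(8));  // 16 bytes further
  return 0;
}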
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
import com.oracle.java.testlibrary.*;
import sun.misc.Unsafe;
/**
* @test
* @bug 8078497
* @summary Tests correct alignment of vectors with loop invariant offset.
* @library /testlibrary
* @run main TestVectorizationWithInvariant
*/
public class TestVectorizationWithInvariant {
private static Unsafe unsafe;
private static final long BYTE_ARRAY_OFFSET;
private static final long CHAR_ARRAY_OFFSET;
static {
unsafe = Utils.getUnsafe();
BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class);
CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class);
}
public static void main(String[] args) throws Exception {
byte[] byte_array1 = new byte[1000];
byte[] byte_array2 = new byte[1000];
char[] char_array = new char[1000];
for (int i = 0; i < 20_000; ++i) {
copyByteToChar(byte_array1, byte_array2, char_array, 1);
copyCharToByte(char_array, byte_array1, 1);
copyCharToByteAligned(char_array, byte_array1);
copyCharToByteUnaligned(char_array, byte_array1);
}
}
/*
* Copy multiple consecutive chars from a byte array to a given offset in a char array
* to trigger C2's superword optimization. The offset in the byte array is independent
* of the loop induction variable and can be set to an arbitrary value. It may then not
* be possible to both align the LoadUS and the StoreC operations. Therefore, vectorization
* should only be done in this case if unaligned memory accesses are allowed.
*/
public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) {
off = (int) BYTE_ARRAY_OFFSET + (off << 1);
byte[] src = src1;
for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) {
// Copy 8 chars from src to dst
unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0));
unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2));
unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4));
unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6));
unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8));
unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10));
unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12));
unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14));
// Prevent loop invariant code motion of char read.
src = (src == src1) ? src2 : src1;
}
}
/*
* Copy multiple consecutive chars from a char array to a given offset in a byte array
* to trigger C2's superword optimization. Checks for similar problems as 'copyByteToChar'.
*/
public static void copyCharToByte(char[] src, byte[] dst, int off) {
off = (int) BYTE_ARRAY_OFFSET + (off << 1);
for (int i = 0; i < 100; i = i + 8) {
// Copy 8 chars from src to dst
unsafe.putChar(dst, off + 0, src[i + 0]);
unsafe.putChar(dst, off + 2, src[i + 1]);
unsafe.putChar(dst, off + 4, src[i + 2]);
unsafe.putChar(dst, off + 6, src[i + 3]);
unsafe.putChar(dst, off + 8, src[i + 4]);
unsafe.putChar(dst, off + 10, src[i + 5]);
unsafe.putChar(dst, off + 12, src[i + 6]);
unsafe.putChar(dst, off + 14, src[i + 7]);
}
}
/*
* Variant of copyCharToByte with a constant destination array offset.
* The loop should always be vectorized because both the LoadUS and StoreC
* operations can be aligned.
*/
public static void copyCharToByteAligned(char[] src, byte[] dst) {
final int off = (int) BYTE_ARRAY_OFFSET;
for (int i = 8; i < 100; i = i + 8) {
// Copy 8 chars from src to dst
unsafe.putChar(dst, off + 0, src[i + 0]);
unsafe.putChar(dst, off + 2, src[i + 1]);
unsafe.putChar(dst, off + 4, src[i + 2]);
unsafe.putChar(dst, off + 6, src[i + 3]);
unsafe.putChar(dst, off + 8, src[i + 4]);
unsafe.putChar(dst, off + 10, src[i + 5]);
unsafe.putChar(dst, off + 12, src[i + 6]);
unsafe.putChar(dst, off + 14, src[i + 7]);
}
}
/*
* Variant of copyCharToByte with a constant destination array offset. The
* loop should only be vectorized if unaligned memory operations are allowed
* because not both the LoadUS and the StoreC can be aligned.
*/
public static void copyCharToByteUnaligned(char[] src, byte[] dst) {
final int off = (int) BYTE_ARRAY_OFFSET + 2;
for (int i = 0; i < 100; i = i + 8) {
// Copy 8 chars from src to dst
unsafe.putChar(dst, off + 0, src[i + 0]);
unsafe.putChar(dst, off + 2, src[i + 1]);
unsafe.putChar(dst, off + 4, src[i + 2]);
unsafe.putChar(dst, off + 6, src[i + 3]);
unsafe.putChar(dst, off + 8, src[i + 4]);
unsafe.putChar(dst, off + 10, src[i + 5]);
unsafe.putChar(dst, off + 12, src[i + 6]);
unsafe.putChar(dst, off + 14, src[i + 7]);
}
}
}