Add back JIT SIMD instructions (staging commit).

31676583 · dzhwinter · bf2e4cb1 · 31676583 · 31676583 · 31676583
13 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -181,10 +181,10 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
-include(external/xxhash)    # download xxhash

 if (NOT WIN32)
 # there is no official support of snappystream, warpctc, nccl, cupti in windows
+include(external/xxhash)    # download xxhash
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
 include(external/warpctc)   # download, build, install warpctc

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -87,13 +87,14 @@ copy(boost_lib
  DSTS ${dst_dir}
  DEPS boost
 )
-
+if(NOT WIN32)
 set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
 copy(xxhash_lib
  SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
  DSTS ${dst_dir} ${dst_dir}/lib
  DEPS xxhash
 )
+endif(NOT WIN32)

 if(NOT PROTOBUF_FOUND)
    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")

--- a/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc
+++ b/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc
@@ -20,9 +20,6 @@
 #include "paddle/fluid/inference/paddle_inference_api.h"

 namespace paddle {
-// DEFINE_string(dirname, "./lb",
-//               "Directory of the inference model.");
-
 NativeConfig GetConfig() {
  NativeConfig config;


--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -86,7 +86,7 @@ function(op_library TARGET)
    # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
     "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
-      "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+      "fusion_seqconv_eltadd_relu_op" "hash_op")
        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
          return()
        endif()

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -74,8 +74,8 @@ cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
 if (NOT WIN32)
    math_library(matrix_bit_code)
-    cc_library(jit_kernel
+endif (NOT WIN32)
+cc_library(jit_kernel
    SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
    DEPS cpu_info cblas)
-    cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
-endif (NOT WIN32)
+cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -18,10 +18,6 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif

--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -15,13 +15,10 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"

-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {

--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -19,10 +19,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif

-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {

--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -16,9 +16,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif

 namespace paddle {
 namespace operators {
@@ -263,6 +260,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
    }                                                                          \
  }

+#ifndef _WIN32  // AVX/AVX2 CRF decode kernels below are not built on Windows
 #ifdef __AVX__
 INTRIAVX_FLOAT(kEQ8);
 INTRIAVX_FLOAT(kGT8LT16);
@@ -275,6 +273,7 @@ INTRIAVX2_FLOAT(jit::avx2, kGT8LT16);
 INTRIAVX2_FLOAT(jit::avx2, kEQ16);
 INTRIAVX2_FLOAT(jit::avx2, kGT16);
 #endif
+#endif  // WIN32
 #ifdef __AVX512F__
 INTRIAVX2_FLOAT(jit::avx512f, kEQ8);
 INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16);

--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -20,10 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif

-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -66,14 +62,18 @@ namespace detail {

 #ifdef __AVX__

+#if defined(_WIN32)
+#define ALIGN32 __declspec(align(32))
+#else
 #define ALIGN32 __attribute__((aligned(32)))
+#endif  // _WIN32

 #define _PS256_CONST(Name, Val)                                      \
-  static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
+  static const float ALIGN32 _ps256_##Name[8] = {Val, Val, Val, Val, \
                                                 Val, Val, Val, Val}

 #define _PI256_CONST(Name, Val)                                    \
-  static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
+  static const int ALIGN32 _pi256_##Name[8] = {Val, Val, Val, Val, \
                                               Val, Val, Val, Val}

 _PI256_CONST(0x7f, 0x7f);
@@ -98,7 +98,7 @@ typedef union imm_xmm_union {

 #define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
  {                                         \
-    imm_xmm_union u ALIGN32;                \
+    imm_xmm_union ALIGN32 u;                \
    u.imm = imm_;                           \
    xmm0_ = u.xmm[0];                       \
    xmm1_ = u.xmm[1];                       \
@@ -106,7 +106,7 @@ typedef union imm_xmm_union {

 #define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
  {                                         \
-    imm_xmm_union u ALIGN32;                \
+    imm_xmm_union ALIGN32 u;                \
    u.xmm[0] = xmm0_;                       \
    u.xmm[1] = xmm1_;                       \
    imm_ = u.imm;                           \
@@ -508,12 +508,14 @@ class VTanhKernelImpl : public VTanhKernel<T> {
    vaddbias_->Compute(-1.f, y, y);                                           \
  }

+#ifndef _WIN32  // skip the jit::avx instantiations on Windows (MSVC defines _WIN32, not __WIN32)
 #ifdef __AVX__
 INTRI8_FLOAT(jit::avx, detail::ExpAVX);
 INTRI16_FLOAT(jit::avx, detail::ExpAVX);
 INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX);
 INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX);
-#endif
+#endif  // __AVX__
+#endif  // !_WIN32
 #ifdef __AVX2__
 INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
 INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);

--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -18,10 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"

-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {

--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -16,6 +16,18 @@ limitations under the License. */

 #include <stddef.h>

+#ifdef _WIN32
+#if defined(__AVX2__)
+#include <immintrin.h>  //avx2
+#elif defined(__AVX__)
+#include <intrin.h>  //avx
+#endif               // AVX
+#else                // WIN32
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+#endif  // WIN32
+
 namespace paddle {
 namespace platform {


--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -62,7 +62,6 @@ static void *dlopen(const char *filename, int flag) {
  }
  return reinterpret_cast<void *>(hModule);
 }
-
 #endif  // !_WIN32

 static void ExecShellCommand(const std::string &cmd, std::string *message) {