From 316765839de9e63aa65617cf3396f2b2f70b7cc9 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 31 Oct 2018 13:55:42 +0800 Subject: [PATCH] add back jit simd instructions. stage. --- CMakeLists.txt | 2 +- cmake/inference_lib.cmake | 3 ++- .../api/demo_ci/real_data_icnet_tester.cc | 3 --- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/math/CMakeLists.txt | 8 ++++---- paddle/fluid/operators/math/cpu_vec.h | 4 ---- .../math/detail/activation_functions.h | 5 +---- .../fluid/operators/math/jit_kernel_blas.cc | 4 ---- .../operators/math/jit_kernel_crf_decode.cc | 5 ++--- paddle/fluid/operators/math/jit_kernel_exp.cc | 20 ++++++++++--------- paddle/fluid/operators/math/jit_kernel_rnn.cc | 4 ---- paddle/fluid/platform/cpu_info.h | 12 +++++++++++ paddle/fluid/platform/port.h | 1 - 13 files changed, 34 insertions(+), 39 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 968815f9b6..bf8725c41a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,10 +181,10 @@ include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) -include(external/xxhash) # download xxhash if (NOT WIN32) # there is no official support of snappystream, warpctc, nccl, cupti in windows +include(external/xxhash) # download xxhash include(external/snappy) # download snappy include(external/snappystream) # download snappystream include(external/warpctc) # download, build, install warpctc diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index cde339d83f..72ce7070c8 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -87,13 +87,14 @@ copy(boost_lib DSTS ${dst_dir} DEPS boost ) - +if(NOT WIN32) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") copy(xxhash_lib SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib DEPS xxhash ) +endif(NOT WIN32) if(NOT PROTOBUF_FOUND) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") diff --git a/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc b/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc index c7db21d093..5553d37355 100644 --- a/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc +++ b/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc @@ -20,9 +20,6 @@ #include "paddle/fluid/inference/paddle_inference_api.h" namespace paddle { -// DEFINE_string(dirname, "./lb", -// "Directory of the inference model."); - NativeConfig GetConfig() { NativeConfig config; diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 19a8e5f4b3..3721d7da70 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -86,7 +86,7 @@ function(op_library TARGET) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" - "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + "fusion_seqconv_eltadd_relu_op" "hash_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index dcc3520abe..d3e0006d40 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -74,8 +74,8 @@ cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) if (NOT WIN32) math_library(matrix_bit_code) - cc_library(jit_kernel - SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc - DEPS cpu_info cblas) - cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) endif (NOT WIN32) +cc_library(jit_kernel + SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc + DEPS cpu_info cblas) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 0aed253c80..38df5776bf 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -18,10 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" -#ifdef __AVX__ -#include -#endif - #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" #endif diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index b127fbe8c8..24df1f93ed 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -15,13 +15,10 @@ limitations under the License. */ #pragma once #include #include +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index c88b17b012..e23b5008e5 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -19,10 +19,6 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/mklml.h" #endif -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index e481d1921a..6ff35a8835 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -16,9 +16,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" -#ifdef __AVX__ -#include -#endif namespace paddle { namespace operators { @@ -263,6 +260,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { } \ } +#ifndef _WIN32 // commented out crf decoding #ifdef __AVX__ INTRIAVX_FLOAT(kEQ8); INTRIAVX_FLOAT(kGT8LT16); @@ -275,6 +273,7 @@ INTRIAVX2_FLOAT(jit::avx2, kGT8LT16); INTRIAVX2_FLOAT(jit::avx2, kEQ16); INTRIAVX2_FLOAT(jit::avx2, kGT16); #endif +#endif // WIN32 #ifdef __AVX512F__ INTRIAVX2_FLOAT(jit::avx512f, kEQ8); INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16); diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index c4247580f4..d1b04b2b8f 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -20,10 +20,6 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/mklml.h" #endif -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { @@ -66,14 +62,18 @@ namespace detail { #ifdef __AVX__ +#if defined(_WIN32) +#define ALIGN32 __declspec(align(32)) +#else #define ALIGN32 __attribute__((aligned(32))) +#endif // _WIN32 #define _PS256_CONST(Name, Val) \ - static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ + static const float ALIGN32 _ps256_##Name[8] = {Val, Val, Val, Val, \ Val, Val, Val, Val} #define _PI256_CONST(Name, Val) \ - static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ + static const int ALIGN32 _pi256_##Name[8] = {Val, Val, Val, Val, \ Val, Val, Val, Val} _PI256_CONST(0x7f, 0x7f); @@ -98,7 +98,7 @@ typedef union imm_xmm_union { #define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ { \ - imm_xmm_union u ALIGN32; \ + imm_xmm_union ALIGN32 u; \ u.imm = imm_; \ xmm0_ = u.xmm[0]; \ xmm1_ = u.xmm[1]; \ @@ -106,7 +106,7 @@ typedef union imm_xmm_union { #define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ { \ - imm_xmm_union u ALIGN32; \ + imm_xmm_union ALIGN32 u; \ u.xmm[0] = xmm0_; \ u.xmm[1] = xmm1_; \ imm_ = u.imm; \ @@ -508,12 +508,14 @@ class VTanhKernelImpl : public VTanhKernel { vaddbias_->Compute(-1.f, y, y); \ } +#ifndef __WIN32 #ifdef __AVX__ INTRI8_FLOAT(jit::avx, detail::ExpAVX); INTRI16_FLOAT(jit::avx, detail::ExpAVX); INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX); INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX); -#endif +#endif // AVX +#endif // WIN32 #ifdef __AVX2__ INTRI8_FLOAT(jit::avx2, detail::ExpAVX2); INTRI16_FLOAT(jit::avx2, detail::ExpAVX2); diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index fab293f7d0..64b60abe72 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -18,10 +18,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 6810a1651a..bc0204e579 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -16,6 +16,18 @@ limitations under the License. */ #include +#ifdef _WIN32 +#if defined(__AVX2__) +#include //avx2 +#elif defined(__AVX__) +#include //avx +#endif // AVX +#else // WIN32 +#ifdef __AVX__ +#include +#endif +#endif // WIN32 + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index 3dd595aac6..8f1e3bdd31 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -62,7 +62,6 @@ static void *dlopen(const char *filename, int flag) { } return reinterpret_cast(hModule); } - #endif // !_WIN32 static void ExecShellCommand(const std::string &cmd, std::string *message) { -- GitLab