add the jit back

fix compile error on windows

add the jit back
fix compile error on windows
a3e952f4 · peizhilin · 1cc23ef6 · a3e952f4 · a3e952f4 · a3e952f4
9 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,6 +130,11 @@ if (APPLE OR WIN32)
        "Disable MKL for building on mac and windows" FORCE)
 endif()

+if (WIN32)  # Windows builds always get WITH_AVX=OFF; FORCE replaces any value already in the cache
+    set(WITH_AVX OFF CACHE BOOL
+            "Disable AVX when compiling for Windows" FORCE)
+endif()
+
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
  "A path setting third party libraries download & build directories.")


--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,9 +84,8 @@ function(op_library TARGET)
    endif()
    if (WIN32)
    # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
-     "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
-      "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op"
+            "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
          return()
        endif()

--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -70,17 +70,20 @@ int main()
    return 0;
 }" AVX_FOUND)

-# Check AVX 2
-set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
+# The AVX2 probe below is skipped on Windows; it only runs when NOT WIN32
+if(NOT WIN32)
+    # Probe AVX2: build and run a snippet that calls _mm256_abs_epi32 with ${AVX2_FLAG}
+    set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+    set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
        __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
        __m256i result = _mm256_abs_epi32 (a);
        return 0;
-}" AVX2_FOUND)
+    }" AVX2_FOUND)
+endif()

 # Check AVX512F
 set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE)
    add_subdirectory(distributed_ops)
 endif()

-if (NOT WIN32)
-    add_subdirectory(reader)
-endif()
+add_subdirectory(reader)

 if (NOT WIN32)
    add_subdirectory(nccl)
@@ -49,9 +47,10 @@ endif()

 set(COMMON_OP_DEPS "")

-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code sequence2batch gru_compute activation_functions jit_kernel)  # appended unconditionally, i.e. on every platform
 if (NOT WIN32)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)  # dynload_warpctc is only added when not building for Windows
 endif()
 if (WITH_GPU)
  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)

--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -111,7 +111,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
+    Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};

    // softrelu derivative
    pre_out_grad_mat.device(place) =

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
-if (NOT WIN32)
-    add_subdirectory(detail)
-endif(NOT WIN32)
+add_subdirectory(detail)

 function(math_library TARGET)
    # math_library is a function to create math library.
@@ -43,10 +41,8 @@ math_library(depthwise_conv)
 math_library(im2col)
 math_library(sampler)

-if (NOT WIN32) # windows do not support avx functions yet.
-    math_library(gru_compute DEPS activation_functions math_function)
-    math_library(lstm_compute DEPS activation_functions)
-endif (NOT WIN32)
+math_library(gru_compute DEPS activation_functions math_function)
+math_library(lstm_compute DEPS activation_functions)

 cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
@@ -58,9 +54,9 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
-if (NOT WIN32)
-    math_library(matrix_bit_code)
-endif (NOT WIN32)
+
+math_library(matrix_bit_code)
+
 math_library(unpooling)
 math_library(vol2col)

@@ -76,13 +72,12 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-if (NOT WIN32)
-    set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
-    set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
-    if(WITH_XBYAK)
+
+set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)  # sources always compiled into jit_kernel
+set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)  # dependencies always attached to jit_kernel
+if(WITH_XBYAK)  # when enabled, also compile jit_gen.cc/jit_code.cc and depend on xbyak
    list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
    list(APPEND JIT_KERNEL_DEPS xbyak)
-    endif()
-    cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
-    cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
-endif (NOT WIN32)
+endif()
+cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})  # no longer wrapped in a NOT WIN32 guard
+cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)  # test target depends on the jit_kernel library
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -67,7 +67,7 @@ inline constexpr size_t FindLastSet(size_t x) {
             : (std::is_same<size_t, unsigned long>::value  // NOLINT
                    ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
                    : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
-
+}
 #else
 // windows don't have built-in clz, ctz function
 template <typename T>
@@ -92,7 +92,6 @@ inline int clz(const T& value) {

 inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
 #endif  // !_WIN32
-}

 struct SimpleCode {
  SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {}

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -170,12 +170,6 @@ __all__ = [
    'bilinear_tensor_product',
 ]

-# To avoid the api checker complains
-if os.name == 'nt':
-    __all__.remove('dynamic_lstm')
-    __all__.remove('crf_decoding')
-    __all__.remove('roi_pool')
-

 def fc(input,
       size,
@@ -349,10 +343,8 @@ def embedding(input,
    return tmp


-if os.name != 'nt':
-
-    @templatedoc(op_type="lstm")
-    def dynamic_lstm(input,
+@templatedoc(op_type="lstm")
+def dynamic_lstm(input,
                 size,
                 h_0=None,
                 c_0=None,
@@ -969,10 +961,8 @@ def linear_chain_crf(input, label, param_attr=None):
    return log_likelihood


-if os.name != 'nt':
-
-    @templatedoc()
-    def crf_decoding(input, param_attr, label=None):
+@templatedoc()
+def crf_decoding(input, param_attr, label=None):
    """
    ${comment}

@@ -998,11 +988,9 @@ if os.name != 'nt':
        dtype=helper.input_dtype())
    helper.append_op(
        type='crf_decoding',
-            inputs={
-                "Emission": [input],
+        inputs={"Emission": [input],
                "Transition": transition,
-                "Label": label
-            },
+                "Label": label},
        outputs={"ViterbiPath": [viterbi_path]})

    return viterbi_path
@@ -5599,14 +5587,8 @@ def label_smooth(label,
    return smooth_label


-if os.name != 'nt':
-
-    @templatedoc()
-    def roi_pool(input,
-                 rois,
-                 pooled_height=1,
-                 pooled_width=1,
-                 spatial_scale=1.0):
+@templatedoc()
+def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
    """
    ${comment}


--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -100,12 +100,12 @@ Examples:
    >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
 """

-if os.name != 'nt':
-    __all__ += ['cumsum']
+__all__ += ['cumsum']

-    _cum_sum_ = generate_layer_fn('cumsum')
+_cum_sum_ = generate_layer_fn('cumsum')

-    def cumsum(x, axis=None, exclusive=None, reverse=None):
+
+def cumsum(x, axis=None, exclusive=None, reverse=None):
    locals_var = locals().keys()
    kwargs = dict()
    for name in locals_var:
@@ -114,12 +114,13 @@ if os.name != 'nt':
            kwargs[name] = val
    return _cum_sum_(**kwargs)

-    cumsum.__doc__ = _cum_sum_.__doc__ + """
-    Examples:
+
+cumsum.__doc__ = _cum_sum_.__doc__ + """
+Examples:

    >>> data = fluid.layers.data(name="input", shape=[32, 784])
    >>> result = fluid.layers.cumsum(data, axis=0)
-    """
+"""

 __all__ += ['thresholded_relu']