Commit 3c6102a3 authored by J JiabinYang

test=develop

......@@ -25,6 +25,7 @@
| kexinzhao | Ke-Xin Zhao |
| kuke | Yi-Bing Liu |
| lcy-seso | Ying Cao |
| cjld | Dun Liang |
| lipeng-unisound | Peng Li |
| liuyuan | Yuan Liu |
| livc | Zhao Li |
......
......@@ -130,6 +130,21 @@ if (APPLE OR WIN32)
"Disable MKL for building on mac and windows" FORCE)
endif()
if (WIN32)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when compiling for Windows" FORCE)
set(WITH_DSO OFF CACHE STRING
"Disable DSO when compiling for Windows" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when compiling for Windows" FORCE)
set(WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows" FORCE)
set(WITH_C_API OFF CACHE STRING
"Disable C_API when compiling for Windows" FORCE)
set(WITH_FLUID_ONLY ON CACHE STRING
"Enable FLUID_ONLY when compiling for Windows" FORCE)
endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
......@@ -190,11 +205,11 @@ include(external/pybind11) # download pybind11
include(external/cares)
include(external/cub)
include(external/xxhash) # download xxhash
if (NOT WIN32)
# There is no official Windows support for snappystream, warpctc, nccl, or cupti
include(external/snappy) # download snappy
include(external/snappystream) # download snappystream
if (NOT WIN32)
# There is no official Windows support for warpctc, nccl, or cupti
include(external/warpctc) # download, build, install warpctc
include(cupti)
endif (NOT WIN32)
......
......@@ -50,7 +50,11 @@ IF(WITH_TESTING)
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_GMOCK=ON
......
......@@ -24,7 +24,11 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
if (WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
else(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
endif (WIN32)
ExternalProject_Add(
extern_snappy
......@@ -34,8 +38,12 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
......
......@@ -18,36 +18,45 @@ ENDIF()
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
ExternalProject_Add(
extern_snappystream
GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
GIT_TAG "0.2.8"
PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
DEPENDS snappy
)
if(WIN32)
# FIXME: VS2015 comes without VLA support
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
MESSAGE(WARNING "snappystream does not compile on Windows; "
"please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
else(WIN32)
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
ExternalProject_Add(
extern_snappystream
GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
GIT_TAG "0.2.8"
PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
DEPENDS snappy
)
endif(WIN32)
add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
......
......@@ -351,6 +351,9 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
if(WIN32)
target_link_libraries(${TARGET_NAME} shlwapi)
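# shlwapi supplies Windows shell-path helper APIs; presumably it is pulled in
# here by the logging dependencies (glog/gflags) of the test binaries.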
endif(WIN32)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
......
......@@ -84,9 +84,7 @@ function(op_library TARGET)
endif()
if (WIN32)
# Skip ops that are unsupported on Windows (Windows has no nccl or warpctc).
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
......
......@@ -57,43 +57,46 @@ int main()
return 0;
}" SSE3_FOUND)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" AVX_FOUND)
# disable AVX by default on windows
if(NOT WIN32)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" AVX_FOUND)
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a);
return 0;
}" AVX512F_FOUND)
# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a);
return 0;
}" AVX512F_FOUND)
endif(NOT WIN32)
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
......@@ -103,6 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
......
......@@ -3,13 +3,9 @@ add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(string)
add_subdirectory(pybind)
if (NOT WIN32)
add_subdirectory(recordio)
endif(NOT WIN32)
add_subdirectory(pybind)
# NOTE: please add subdirectory inference at last.
add_subdirectory(inference)
add_subdirectory(train)
......@@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
endfunction()
add_subdirectory(ir)
if (NOT WIN32)
add_subdirectory(details)
endif (NOT WIN32)
# ddim lib
proto_library(framework_proto SRCS framework.proto)
......@@ -68,11 +66,7 @@ if(WITH_GPU)
else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif()
if (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
else()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
endif (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
......@@ -122,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
if (NOT WIN32)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
else()
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor)
endif(NOT WIN32)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
......@@ -183,12 +172,10 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph build_strategy
fast_threaded_ssa_graph_executor)
endif() # NOT WIN32
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
......@@ -13,9 +13,9 @@
// limitations under the License.
#pragma once
#include <ThreadPool.h>
#include <string>
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
......
......@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// logging.h and windows.h conflict
#define GLOG_NO_ABBREVIATED_SEVERITIES
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include "paddle/fluid/framework/tensor.h"
#include "unsupported/Eigen/CXX11/Tensor"
......
......@@ -23,11 +23,6 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#if defined(_WIN32)
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#endif
#include "glog/logging.h" // For VLOG()
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/details/op_registry.h"
......
......@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include <gflags/gflags.h>
#include <glog/logging.h>
......
......@@ -20,8 +20,6 @@ limitations under the License. */
#include <tuple>
#include <unordered_map>
#include <vector>
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "glog/logging.h" // For VLOG
#include "paddle/fluid/framework/attribute.h"
......
......@@ -14,12 +14,6 @@ limitations under the License. */
#pragma once
// logging.h and windows.h conflict
#define GLOG_NO_ABBREVIATED_SEVERITIES
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include <glog/logging.h>
#include <map>
#include <memory>
......
......@@ -15,6 +15,12 @@
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#ifdef _WIN32
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
(((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif
namespace paddle {
namespace memory {
namespace allocation {
......
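A minimal usage sketch of the shim above (illustrative only, not part of the patch): POSIX-style call sites compile unchanged on Windows, but the buffer must then be released through the matching deallocator.

void* ptr = nullptr;
if (posix_memalign(&ptr, 64, 1024) == 0) {
// ... use the 64-byte-aligned, 1 KiB buffer ...
#ifdef _WIN32
posix_memalign_free(ptr);  // _aligned_malloc'd memory requires _aligned_free
#else
free(ptr);
#endif
}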
......@@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE)
add_subdirectory(distributed_ops)
endif()
if (NOT WIN32)
add_subdirectory(reader)
endif()
add_subdirectory(reader)
if (NOT WIN32)
add_subdirectory(nccl)
......@@ -41,25 +39,30 @@ endif()
register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS})
# warpctc_cudnn needs cuDNN 7 or above
if (WITH_GPU)
# warpctc_op needs cuDNN 7 or above
if (WITH_GPU AND NOT WIN32)
if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
op_library(conv_fusion_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
# conv_fusion_op needs cuDNN 7 or above
if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(conv_fusion_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
endif()
else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
if (NOT WIN32)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv)
endif()
......
......@@ -22,6 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search);
namespace paddle {
namespace operators {
#if CUDNN_VERSION >= 7001
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
......@@ -178,10 +179,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
};
#endif
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7001
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel<float>,
ops::CUDNNConvFusionOpKernel<double>);
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/group_norm_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
class GroupNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"),
"Output(Y) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Mean"),
"Output(Mean) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Variance"),
"Output(Variance) of GroupNormOp should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto channel_num = x_dim[1];
auto batch_size = x_dim[0];
auto groups = ctx->Attrs().Get<int>("groups");
PADDLE_ENFORCE_LE(
groups, channel_num,
"'groups' must be less equal than the number of channels.");
PADDLE_ENFORCE_GE(groups, 1, "'groups' must be greater equal than 1.");
if (ctx->HasInput("Scale")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], channel_num);
}
if (ctx->HasInput("Bias")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], channel_num);
}
ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
ctx->SetOutputDim("Mean", {batch_size, groups});
ctx->SetOutputDim("Variance", {batch_size, groups});
ctx->ShareLoD("X", "Y");
}
};
class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input tensor.");
AddInput("Scale",
"Scale is a 1-dimensional tensor of size C"
"that is applied to the output.")
.AsDispensable();
AddInput("Bias",
"Bias is a 1-dimensional tensor of size C "
"that is applied to the output")
.AsDispensable();
AddOutput("Y", "Result after normalization.");
AddOutput("Mean", "Mean of each group.").AsIntermediate();
AddOutput("Variance", "Variance of each group.").AsIntermediate();
AddAttr<float>("epsilon",
"Constant for numerical stability [default 1e-5].")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 1.0f,
"'epsilon' should be between 0.0 and 1.0.");
});
AddAttr<int>("groups", "The number of groups that divided from channels.")
.AddCustomChecker([](const int &groups) {
PADDLE_ENFORCE_GT(groups, 0, "'groups' should be greater than zero.");
});
AddComment(R"DOC(
Group Normalization
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
)DOC");
}
};
class GroupNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
// check input
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Mean"),
"Input(Mean) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Variance"),
"Input(Variance) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) of GroupNormOp should not be null.");
// check output
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"),
ctx->GetInputDim("Scale"));
}
if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
return framework::OpKernelType(framework::ToDataType(t->type()),
ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);
REGISTER_OP_CPU_KERNEL(
group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
group_norm_grad,
ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cub/cub.cuh>
#include "paddle/fluid/operators/group_norm_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C,
int imsize, int groups,
int group_size, T* mean, T* var) {
int gid = blockIdx.y;
int cid = blockIdx.x;
int bid = blockIdx.z;
int number = min(group_size, static_cast<int>(C - gid * group_size));
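// The last group may hold fewer channels when C is not divisible by groups.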
int ccid = gid * group_size + cid;
if (ccid >= C) return;
T x_mean = 0, x_var = 0;
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T val = x[(bid * C + ccid) * imsize + imid];
x_mean += val;
x_var += val * val;
}
x_mean /= number * imsize;
x_var /= number * imsize;
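// Block-level reduction: each thread folds its partial sums into shared
// memory, then thread 0 adds the block total into the global mean/var buffers.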
__shared__ T s_mem[2];
if (threadIdx.x == 0) {
s_mem[0] = s_mem[1] = 0;
}
__syncthreads();
paddle::platform::CudaAtomicAdd(&s_mem[0], x_mean);
paddle::platform::CudaAtomicAdd(&s_mem[1], x_var);
__syncthreads();
if (threadIdx.x == 0) {
paddle::platform::CudaAtomicAdd(&mean[bid * groups + gid], s_mem[0]);
paddle::platform::CudaAtomicAdd(&var[bid * groups + gid], s_mem[1]);
}
}
template <typename T>
__global__ void GroupNormForward(const T* x, const T* mean, const T* var,
const T* scale, const T* bias, int N, int C,
int imsize, int groups, int group_size,
T epsilon, T* y, T* real_var) {
int gid = blockIdx.y;
int cid = blockIdx.x;
int bid = blockIdx.z;
int ccid = gid * group_size + cid;
if (ccid >= C) return;
T x_mean = mean[bid * groups + gid];
T x_var = var[bid * groups + gid];
x_var = x_var - x_mean * x_mean;
T var_inv = 1.0 / sqrt(x_var + epsilon);
if (cid == 0 && threadIdx.x == 0) real_var[bid * groups + gid] = x_var;
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T val = x[(bid * C + ccid) * imsize + imid];
val = (val - x_mean) * var_inv;
if (scale) val *= scale[gid * group_size + cid];
if (bias) val += bias[gid * group_size + cid];
y[(bid * C + ccid) * imsize + imid] = val;
}
}
template <typename T>
class GroupNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Output<Tensor>("Y");
auto* mean = ctx.Output<Tensor>("Mean");
auto* var = ctx.Output<Tensor>("Variance");
const auto groups = ctx.Attr<int>("groups");
const auto x_dims = x->dims();
const int group_size = (x_dims[1] - 1) / groups + 1;
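// Integer ceiling division: group_size = ceil(C / groups).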
y->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
Tensor temp_var;
temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
set_zero(dev_ctx, mean, static_cast<T>(0));
set_zero(dev_ctx, &temp_var, static_cast<T>(0));
auto* x_data = x->data<T>();
auto* y_data = y->data<T>();
auto* mean_data = mean->data<T>();
auto* var_data = var->data<T>();
auto* temp_var_data = temp_var.data<T>();
const T* scale_data = nullptr;
if (scale) scale_data = scale->data<T>();
const T* bias_data = nullptr;
if (bias) bias_data = bias->data<T>();
int imsize = x_dims[2] * x_dims[3];
int block_size = std::min(512, imsize);
dim3 grid(group_size, groups, x_dims[0]);
dim3 threads(block_size, 1, 1);
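// One block per (channel-within-group, group, batch) triple; threads stride
// over the H*W spatial extent.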
GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
x_data, x_dims[0], x_dims[1], imsize, groups, group_size, mean_data,
temp_var_data);
GroupNormForward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
x_data, mean_data, temp_var_data, scale_data, bias_data, x_dims[0],
x_dims[1], imsize, groups, group_size, epsilon, y_data, var_data);
}
};
template <typename T>
__global__ void GroupNormBackwardGetMeanAndVar(
const T* x, const T* mean, const T* var, const T* scale, const T* d_y,
int N, int C, int imsize, int groups, int group_size, T epsilon, T* d_x,
T* d_mean, T* d_var, T* d_scale, T* d_bias) {
int gid = blockIdx.y;
int cid = blockIdx.x;
int bid = blockIdx.z;
int number = min(group_size, static_cast<int>(C - gid * group_size));
int ccid = gid * group_size + cid;
if (ccid >= C) return;
T x_mean = mean[bid * groups + gid];
T x_var = var[bid * groups + gid];
T var_inv = 1.0 / sqrt(x_var + epsilon);
T d_var_inv = 0, d_x_mean = 0;
T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0;
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T tmp = x[(bid * C + ccid) * imsize + imid];
T val = (tmp - x_mean) * var_inv;
T dval = d_y[(bid * C + ccid) * imsize + imid];
if (d_bias) d_bias_data += dval;
if (d_scale) d_scale_data += val * dval;
if (scale) dval = dval * scale[ccid];
d_var_data += (tmp - x_mean) * dval;
T d_tmp = dval * var_inv;
if (d_x) d_x[(bid * C + ccid) * imsize + imid] = d_tmp;
d_mean_data -= d_tmp;
}
__shared__ T s_mem[4];
if (threadIdx.x == 0) {
s_mem[0] = s_mem[1] = 0;
if (d_scale) s_mem[2] = 0;
if (d_bias) s_mem[3] = 0;
}
__syncthreads();
paddle::platform::CudaAtomicAdd(&s_mem[0], d_mean_data);
paddle::platform::CudaAtomicAdd(&s_mem[1], d_var_data);
if (d_scale) paddle::platform::CudaAtomicAdd(&s_mem[2], d_scale_data);
if (d_bias) paddle::platform::CudaAtomicAdd(&s_mem[3], d_bias_data);
__syncthreads();
if (threadIdx.x == 0) {
paddle::platform::CudaAtomicAdd(&d_mean[bid * groups + gid], s_mem[0]);
paddle::platform::CudaAtomicAdd(&d_var[bid * groups + gid], s_mem[1]);
if (d_scale) paddle::platform::CudaAtomicAdd(&d_scale[ccid], s_mem[2]);
if (d_bias) paddle::platform::CudaAtomicAdd(&d_bias[ccid], s_mem[3]);
}
}
template <typename T>
__global__ void GroupNormBackward(const T* x, const T* mean, const T* var,
const T* d_mean, const T* d_var, int N, int C,
int imsize, int groups, int group_size,
T epsilon, T* d_x) {
int gid = blockIdx.y;
int cid = blockIdx.x;
int bid = blockIdx.z;
int number = min(group_size, static_cast<int>(C - gid * group_size));
int ccid = gid * group_size + cid;
if (ccid >= C) return;
T x_mean = mean[bid * groups + gid];
T x_var = var[bid * groups + gid];
T d_x_mean = d_mean[bid * groups + gid];
T d_var_inv = d_var[bid * groups + gid];
T d_x_var =
-1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
d_x_mean -= 2 * d_x_var * x_mean;
d_x_var /= number * imsize;
d_x_mean /= number * imsize;
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T tmp = x[(bid * C + ccid) * imsize + imid];
if (d_x)
d_x[(bid * C + ccid) * imsize + imid] += d_x_mean + tmp * 2 * d_x_var;
}
}
template <typename T>
class GroupNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto* x = ctx.Input<Tensor>("X");
auto* mean = ctx.Input<Tensor>("Mean");
auto* var = ctx.Input<Tensor>("Variance");
auto* scale = ctx.Input<Tensor>("Scale");
auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto groups = ctx.Attr<int>("groups");
// init output
auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto& x_dims = x->dims();
const int group_size = (x_dims[1] - 1) / groups + 1;
T* d_x_data = nullptr;
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
d_x_data = d_x->data<T>();
}
math::SetConstant<platform::CUDADeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
Tensor temp_var;
temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
set_zero(dev_ctx, &temp_var, static_cast<T>(0));
T* temp_var_data = temp_var.data<T>();
Tensor temp_mean;
temp_mean.mutable_data<T>(var->dims(), ctx.GetPlace());
set_zero(dev_ctx, &temp_mean, static_cast<T>(0));
T* temp_mean_data = temp_mean.data<T>();
auto* x_data = x->data<T>();
auto* y_data = d_y->data<T>();
auto* mean_data = mean->data<T>();
auto* var_data = var->data<T>();
T* d_scale_data = nullptr;
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_scale, static_cast<T>(0));
d_scale_data = d_scale->data<T>();
}
T* d_bias_data = nullptr;
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_bias, static_cast<T>(0));
d_bias_data = d_bias->data<T>();
}
const T* scale_data = nullptr;
if (scale) scale_data = scale->data<T>();
int imsize = x_dims[2] * x_dims[3];
int block_size = std::min(512, imsize);
dim3 grid(group_size, groups, x_dims[0]);
dim3 threads(block_size, 1, 1);
GroupNormBackwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
x_data, mean_data, var_data, scale_data, y_data, x_dims[0], x_dims[1],
imsize, groups, group_size, epsilon, d_x_data, temp_mean_data,
temp_var_data, d_scale_data, d_bias_data);
GroupNormBackward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
x_data, mean_data, var_data, temp_mean_data, temp_var_data, x_dims[0],
x_dims[1], imsize, groups, group_size, epsilon, d_x_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
group_norm,
ops::GroupNormKernel<paddle::platform::CUDADeviceContext, float>,
ops::GroupNormKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
group_norm_grad,
ops::GroupNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::GroupNormGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename DeviceContext, typename T>
class GroupNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Output<Tensor>("Y");
auto* mean = ctx.Output<Tensor>("Mean");
auto* var = ctx.Output<Tensor>("Variance");
const auto groups = ctx.Attr<int>("groups");
const auto x_dims = x->dims();
const int group_size = (x_dims[1] - 1) / groups + 1;
y->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto* x_data = x->data<T>();
auto* y_data = y->data<T>();
auto* mean_data = mean->data<T>();
auto* var_data = var->data<T>();
const T* scale_data = nullptr;
if (scale) scale_data = scale->data<T>();
const T* bias_data = nullptr;
if (bias) bias_data = bias->data<T>();
int imsize = x_dims[2] * x_dims[3];
auto* iter_x_data = x_data;
auto* iter_y_data = y_data;
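// Two pointer walks per (batch, group): the first pass accumulates the mean
// and the raw second moment, the second (restarting from tmp) writes the
// normalized output.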
for (int bid = 0; bid < x_dims[0]; bid++)
for (int gid = 0; gid < groups; gid++) {
T x_mean = 0, x_var = 0;
int number = std::min(group_size,
static_cast<int>(x_dims[1] - gid * group_size));
auto* tmp = iter_x_data;
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize; imid++, iter_x_data++) {
x_mean += iter_x_data[0];
x_var += iter_x_data[0] * iter_x_data[0];
}
}
x_mean /= number * imsize;
x_var /= number * imsize;
x_var = x_var - x_mean * x_mean;
T var_inv = 1.0 / sqrt(x_var + epsilon);
mean_data[bid * groups + gid] = x_mean;
var_data[bid * groups + gid] = x_var;
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize; imid++, tmp++, iter_y_data++) {
T val = (tmp[0] - x_mean) * var_inv;
if (scale_data) val *= scale_data[gid * group_size + cid];
if (bias_data) val += bias_data[gid * group_size + cid];
iter_y_data[0] = val;
}
}
}
}
};
template <typename DeviceContext, typename T>
class GroupNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto* x = ctx.Input<Tensor>("X");
auto* mean = ctx.Input<Tensor>("Mean");
auto* var = ctx.Input<Tensor>("Variance");
auto* scale = ctx.Input<Tensor>("Scale");
auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto groups = ctx.Attr<int>("groups");
// init output
auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto& x_dims = x->dims();
const int group_size = (x_dims[1] - 1) / groups + 1;
// TODO(liangdun): need to check d_x is null
math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
T* d_x_data = nullptr;
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_x, static_cast<T>(0));
d_x_data = d_x->data<T>();
}
auto* x_data = x->data<T>();
auto* y_data = d_y->data<T>();
auto* mean_data = mean->data<T>();
auto* var_data = var->data<T>();
T* d_scale_data = nullptr;
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_scale, static_cast<T>(0));
d_scale_data = d_scale->data<T>();
}
T* d_bias_data = nullptr;
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_bias, static_cast<T>(0));
d_bias_data = d_bias->data<T>();
}
const T* scale_data = nullptr;
if (scale) scale_data = scale->data<T>();
int imsize = x_dims[2] * x_dims[3];
auto* iter_x_data = x_data;
auto* iter_d_x_data = d_x_data;
auto* iter_y_data = y_data;
for (int bid = 0; bid < x_dims[0]; bid++)
for (int gid = 0; gid < groups; gid++) {
T x_mean = mean_data[bid * groups + gid];
T x_var = var_data[bid * groups + gid];
T var_inv = 1.0 / sqrt(x_var + epsilon);
int number = std::min(group_size,
static_cast<int>(x_dims[1] - gid * group_size));
auto* tmp = iter_x_data;
auto* tmp2 = iter_d_x_data;
T d_var_inv = 0, d_x_mean = 0;
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize;
imid++, tmp++, iter_y_data++, iter_d_x_data++) {
T val = (tmp[0] - x_mean) * var_inv;
T dval = iter_y_data[0];
if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
if (d_scale_data)
d_scale_data[gid * group_size + cid] += val * dval;
if (scale_data) dval = scale_data[gid * group_size + cid] * dval;
d_var_inv += (tmp[0] - x_mean) * dval;
T d_tmp = dval * var_inv;
if (d_x_data) iter_d_x_data[0] += d_tmp;
d_x_mean -= d_tmp;
}
}
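// Backprop through var_inv = (var + eps)^(-1/2):
// d(var_inv)/d(var) = -1/2 * (var + eps)^(-3/2).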
T d_x_var =
-1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
d_x_mean -= 2 * d_x_var * x_mean;
d_x_var /= number * imsize;
d_x_mean /= number * imsize;
iter_d_x_data = tmp2;
if (d_x_data) {
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize;
imid++, iter_x_data++, iter_d_x_data++) {
iter_d_x_data[0] += d_x_mean;
iter_d_x_data[0] += iter_x_data[0] * 2 * d_x_var;
}
}
}
}
}
};
} // namespace operators
} // namespace paddle
if (NOT WIN32)
add_subdirectory(detail)
endif(NOT WIN32)
add_subdirectory(detail)
function(math_library TARGET)
# math_library is a function to create math library.
......@@ -43,10 +41,8 @@ math_library(depthwise_conv)
math_library(im2col)
math_library(sampler)
if (NOT WIN32) # Windows does not support AVX functions yet.
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
endif (NOT WIN32)
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
math_library(math_function DEPS blas)
......@@ -58,9 +54,9 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
if (NOT WIN32)
math_library(matrix_bit_code)
endif (NOT WIN32)
math_library(matrix_bit_code)
math_library(unpooling)
math_library(vol2col)
......@@ -76,13 +72,12 @@ if(WITH_GPU)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
if (NOT WIN32)
set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc)
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
if(WITH_XBYAK)
list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
list(APPEND JIT_KERNEL_DEPS xbyak)
endif()
cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
endif (NOT WIN32)
set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc)
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
if(WITH_XBYAK)
list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
list(APPEND JIT_KERNEL_DEPS xbyak)
endif()
cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <math.h>
#include <string>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h"
......
......@@ -69,7 +69,7 @@ inline constexpr size_t FindLastSet(size_t x) {
: (std::is_same<size_t, unsigned long>::value // NOLINT
? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
: (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
}
#else
// Windows has no built-in clz/ctz functions
template <typename T>
......@@ -94,7 +94,6 @@ inline int clz(const T& value) {
inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
#endif // !_WIN32
}
// A Code interface that allows multiple code implementations
class Code {
public:
......
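For the record, FindLastSet(x) above returns the 1-based index of the highest set bit (0 for x == 0); e.g. FindLastSet(6) == 3 since 6 is 0b110. A portable sketch pinning down that contract (hypothetical helper, equivalent to both the builtin and the Windows fallback paths):

inline size_t FindLastSetPortable(size_t x) {
size_t pos = 0;
while (x) { ++pos; x >>= 1; }  // count positions up to the highest set bit
return pos;                    // 6 (0b110) -> 3, 0 -> 0
}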
......@@ -74,7 +74,7 @@ class CreatePyReaderOpMaker : public FileReaderMakerBase {
"Name of the `LoDTensorBlockingQueueHolder` variable");
AddComment(R"DOC(
Create PyReader to support LoDTensor data feeding on the Python side.
)DOC");
}
};
......
......@@ -35,10 +35,10 @@ class ROIAlignOp : public framework::OperatorWithKernel {
"The format of input tensor is NCHW.");
PADDLE_ENFORCE(rois_dims.size() == 2,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2], ].");
"given as [[x1, y1, x2, y2], ...].");
PADDLE_ENFORCE(rois_dims[1] == 4,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2], ].");
"given as [[x1, y1, x2, y2], ...].");
int pooled_height = ctx->Attrs().Get<int>("pooled_height");
int pooled_width = ctx->Attrs().Get<int>("pooled_width");
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
......@@ -103,7 +103,7 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
"(LoDTensor), "
"ROIs (Regions of Interest) to pool over. "
"should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2], ]. "
"given as [[x1, y1, x2, y2], ...]. "
"(x1, y1) is the top left coordinates, and "
"(x2, y2) is the bottom right coordinates.");
AddOutput("Out",
......
......@@ -40,10 +40,10 @@ class ROIPoolOp : public framework::OperatorWithKernel {
"The format of input tensor is NCHW.");
PADDLE_ENFORCE(rois_dims.size() == 2,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2], ].");
"given as [[x1, y1, x2, y2], ...].");
PADDLE_ENFORCE(rois_dims[1] == kROISize,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2], ].");
"given as [[x1, y1, x2, y2], ...].");
int pooled_height = ctx->Attrs().Get<int>("pooled_height");
int pooled_width = ctx->Attrs().Get<int>("pooled_width");
......@@ -110,7 +110,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(LoDTensor), "
"ROIs (Regions of Interest) to pool over. "
"should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2], ]. "
"given as [[x1, y1, x2, y2], ...]. "
"Where batch_id is the id of the data, "
"(x1, y1) is the top left coordinates, and "
"(x2, y2) is the bottom right coordinates.");
......
......@@ -86,7 +86,7 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
.GreaterThan(1);
AddComment(R"DOC(
reorg operator used in YOLOv2.
The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize,
Reshape Input(X) into the shape according to Attr(blocksize). The
data in Input(X) are unchanged.
......
if (NOT WIN32)
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
......@@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
add_dependencies(profiler_py_proto profiler_py_proto_init)
if (NOT WIN32)
add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
else(NOT WIN32)
string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMAND copy /Y *.py ${proto_dstpath}
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32)
if(WITH_GPU)
......@@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if (NOT WIN32)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
endif(NOT WIN32)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
......
......@@ -29,6 +29,13 @@ namespace platform {
void SetNumThreads(int num_threads) {
#ifdef PADDLE_USE_OPENBLAS
// Windows has no support for multi-threaded OpenBLAS,
// please refer to: https://github.com/PaddlePaddle/Paddle/issues/7234
#ifdef _WIN32
if (num_threads > 1) {
num_threads = 1;
}
#endif
int real_num_threads = num_threads > 1 ? num_threads : 1;
openblas_set_num_threads(real_num_threads);
#elif defined(PADDLE_WITH_MKLML)
......
......@@ -13,17 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if !defined(_WIN32)
#include <sys/time.h>
#else
#include <windows.h>
#endif // !_WIN32
#include <time.h>
#include <chrono> // NOLINT
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace paddle {
......@@ -32,15 +26,11 @@ namespace platform {
///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////
#if !defined(_WIN32)
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
#else
inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
#endif // !_WIN32
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
......
......@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include <glog/logging.h>
#include <cudnn.h>
......
......@@ -18,12 +18,6 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#endif
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
......@@ -127,14 +121,14 @@ struct EOFException : public std::exception {
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// There is no equivalent intrinsic in MSVC.
#define UNLIKELY(condition) (condition == 0)
#define UNLIKELY(condition) (condition)
#endif
#if !defined(_WIN32)
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else
// There is no equivalent intrinsic in MSVC.
#define LIKELY(condition) (condition != 0)
#define LIKELY(condition) (condition)
#endif
template <typename... Args>
......@@ -248,7 +242,6 @@ inline void throw_on_error(T e) {
throw_on_error(e, "");
}
#if !defined(_WIN32)
#define PADDLE_THROW(...) \
do { \
throw ::paddle::platform::EnforceNotMet( \
......@@ -272,17 +265,6 @@ inline void throw_on_error(T e) {
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32
// Disable enforce on Windows, due to the variadic macro exception error
#define PADDLE_THROW(x) \
do { \
throw std::make_exception_ptr( \
std::runtime_error("Windows disable the enforce.")); \
} while (false)
#define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
......@@ -302,20 +284,6 @@ inline void throw_on_error(T e) {
* extra messages is also supported, for example:
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
*/
#if !defined(_WIN32)
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
do { \
if (UNLIKELY(nullptr == (__VAL))) { \
......@@ -335,27 +303,19 @@ inline void throw_on_error(T e) {
paddle::string::Sprintf("" __VA_ARGS__)); \
} \
} while (0)
#else
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \
if (!((__VAL0)__CMP(__VAL1))) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed."); \
} \
} while (0)
#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \
do { \
if (nullptr == (__VAL1)) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed"); \
} \
} while (0)
#endif // !_WIN32
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
} // namespace platform
} // namespace paddle
......@@ -117,13 +117,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
places.emplace_back(platform::CPUPlace());
platform::DeviceContextPool::Init(places);
// Windows has no support for multi-threaded OpenBLAS
#ifdef _WIN32
if (FLAGS_paddle_num_threads > 1) {
FLAGS_paddle_num_threads = 1;
}
#endif
#ifndef PADDLE_WITH_MKLDNN
platform::SetNumThreads(FLAGS_paddle_num_threads);
#endif
......
......@@ -16,9 +16,6 @@ limitations under the License. */
#include <string>
#include <vector>
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "gflags/gflags.h"
#include "glog/logging.h"
......
......@@ -17,6 +17,7 @@
#include <cstdio>
#include <stdexcept>
#include <time.h>
#include <memory>
#include <string>
......@@ -27,8 +28,13 @@
#include <dlfcn.h> // dladdr
#include <execinfo.h> // backtrace
#include <sys/stat.h>
#include <sys/time.h>
#include <algorithm> // std::accumulate
#else
#define NOMINMAX // msvc max/min macro conflict with std::min/max
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include <io.h> // _popen, _pclose
#include <stdio.h>
#include <windows.h>
......@@ -57,6 +63,25 @@ static void *dlopen(const char *filename, int flag) {
return reinterpret_cast<void *>(hModule);
}
static int gettimeofday(struct timeval *tp, void *tzp) {
time_t clock;
struct tm tm;
SYSTEMTIME wtm;
GetLocalTime(&wtm);
tm.tm_year = wtm.wYear - 1900;
tm.tm_mon = wtm.wMonth - 1;
tm.tm_mday = wtm.wDay;
tm.tm_hour = wtm.wHour;
tm.tm_min = wtm.wMinute;
tm.tm_sec = wtm.wSecond;
tm.tm_isdst = -1;
clock = mktime(&tm);
tp->tv_sec = clock;
tp->tv_usec = wtm.wMilliseconds * 1000;
return (0);
}
#endif // !_WIN32
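With this shim, timeval-based timing code compiles on Windows, at millisecond granularity since tv_usec is derived from wMilliseconds. A sketch of how PosixInNsec from device_tracer.h maps onto it (assuming only the shim above):

static uint64_t NowInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);  // libc on POSIX, the shim above on Windows
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}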
static void ExecShellCommand(const std::string &cmd, std::string *message) {
......@@ -132,10 +157,12 @@ static void MkDir(const char *path) {
}
}
#else
CreateDirectory(path, NULL);
auto errorno = GetLastError();
if (errorno != ERROR_ALREADY_EXISTS) {
throw std::runtime_error(path_error);
BOOL return_value = CreateDirectory(path, NULL);
if (!return_value) {
auto errorno = GetLastError();
if (errorno != ERROR_ALREADY_EXISTS) {
throw std::runtime_error(path_error);
}
}
#endif // !_WIN32
}
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/port.h"
#include <sys/time.h>
#include <algorithm>
#include <iomanip>
#include <limits>
......
......@@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
#if !defined(_WIN32)
struct RecordEvent {
// dev_ctx can be set to nullptr if device is cpu.
RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
......@@ -106,15 +105,6 @@ struct RecordBlock {
std::string name_;
uint64_t start_ns_;
};
#else
// Windows does not support the profiler for now.
struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
};
struct RecordBlock {
explicit RecordBlock(int block_id) {}
};
#endif
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
......@@ -45,16 +45,15 @@ class StreamCallbackManager {
inline void AddCallback(Callback &&callback) const {
auto *stream_callback_context =
new StreamCallbackContext(this, std::forward<Callback>(callback));
PADDLE_ENFORCE(
#if CUDA_VERSION >= 10000
cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc,
stream_callback_context)
PADDLE_ENFORCE(cudaLaunchHostFunc(stream_,
StreamCallbackManager::StreamCallbackFunc,
stream_callback_context)); // NOLINT
#else
cudaStreamAddCallback(stream_,
StreamCallbackManager::StreamCallbackFunc,
stream_callback_context, 0)
PADDLE_ENFORCE(cudaStreamAddCallback(
stream_, StreamCallbackManager::StreamCallbackFunc,
stream_callback_context, 0)); // NOLINT
#endif
); // NOLINT
}
void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
......
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
if(NOT WIN32)
list(APPEND PYBIND_DEPS parallel_executor profiler)
list(APPEND PYBIND_SRCS recordio.cc)
endif(NOT WIN32)
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
if(WITH_PYTHON)
if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED
......
......@@ -21,13 +21,6 @@ limitations under the License. */
#include <utility>
#include <vector>
#if defined(_WIN32)
#define NOMINMAX
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#include <Windows.h>
#endif
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h"
......@@ -36,9 +29,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#ifndef _WIN32
#include "paddle/fluid/framework/parallel_executor.h"
#endif
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h"
......@@ -46,6 +37,7 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
......@@ -95,6 +87,9 @@ bool IsCompiledWithDIST() {
}
PYBIND11_PLUGIN(core) {
// Not used, just make sure cpu_info.cc is linked.
paddle::platform::CpuTotalPhysicalMemory();
paddle::memory::allocation::UseAllocatorStrategyGFlag();
py::module m("core", "C++ core of PaddlePaddle");
......@@ -359,22 +354,16 @@ All parameter, weight, gradient are variables in Paddle.
return self.GetMutable<platform::Communicator>();
},
py::return_value_policy::reference)
#endif
#ifndef _WIN32
.def("get_reader",
[](Variable &self) -> framework::ReaderHolder * {
PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
return self.GetMutable<framework::ReaderHolder>();
},
py::return_value_policy::reference)
#endif
; // NOLINT
py::return_value_policy::reference);
#if !defined(_WIN32)
py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("reset", &framework::ReaderHolder::ResetAll);
#endif
using LoDTensorBlockingQueue =
::paddle::operators::reader::LoDTensorBlockingQueue;
......@@ -643,7 +632,6 @@ All parameter, weight, gradient are variables in Paddle.
#endif
#endif
#ifndef _WIN32
py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
.value("kDisabled", platform::ProfilerState::kDisabled)
.value("kCPU", platform::ProfilerState::kCPU)
......@@ -664,7 +652,6 @@ All parameter, weight, gradient are variables in Paddle.
m.def("disable_profiler", platform::DisableProfiler);
m.def("is_profiler_enabled", platform::IsProfileEnabled);
m.def("reset_profiler", platform::ResetProfiler);
#endif
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init())
......@@ -693,7 +680,6 @@ All parameter, weight, gradient are variables in Paddle.
.def("remove_pass",
[](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
#ifndef _WIN32
// -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
......@@ -921,7 +907,6 @@ All parameter, weight, gradient are variables in Paddle.
});
BindRecordIOWriter(&m);
#endif
return m.ptr();
}
} // namespace pybind
......
......@@ -115,9 +115,8 @@ def __bootstrap__():
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
"dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb',
'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir'
"dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy',
'reader_queue_speed_test_mode', 'print_sub_graph_dir'
]
if os.name != 'nt':
read_env_flags.append('warpctc_dir')
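These flags are consumed from `FLAGS_<name>` environment variables when the module is imported, so they must be set beforehand. A minimal sketch, assuming the usual `FLAGS_*` environment-variable convention (the flag value here is illustrative only):

import os
# Must be set before `import paddle.fluid`; __bootstrap__ reads it at import time.
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
import paddle.fluid as fluid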
......
......@@ -15,15 +15,13 @@
from __future__ import print_function
import contextlib
import os
from .. import core
from .. import executor
from .. import framework
from .. import io
if os.name != 'nt':
from .. import parallel_executor
from .. import parallel_executor
from .. import unique_name
from .trainer import check_and_get_place
......
......@@ -28,8 +28,7 @@ from .. import framework
from .. import io
# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
from .. import optimizer as opt_module
if os.name != 'nt':
from .. import parallel_executor
from .. import parallel_executor
from ..transpiler import distribute_transpiler
__all__ = [
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import hdfs_utils
from .hdfs_utils import *
__all__ = hdfs_utils.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HDFS Utils"""
import os
import subprocess
import multiprocessing
from datetime import datetime
import re
import copy
import errno
import logging
__all__ = ["HDFSClient", "multi_download"]
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
_logger = logging.getLogger("hdfs_utils")
_logger.setLevel(logging.INFO)
class HDFSClient(object):
def __init__(self, hadoop_home, configs):
self.pre_commands = []
hadoop_bin = '%s/bin/hadoop' % hadoop_home
self.pre_commands.append(hadoop_bin)
dfs = 'fs'
self.pre_commands.append(dfs)
for k, v in configs.items():  # items() works on both Python 2 and 3
config_command = '-D%s=%s' % (k, v)
self.pre_commands.append(config_command)
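For illustration, the command prefix assembled above turns every later call into a plain `hadoop fs` invocation. A minimal sketch with a hypothetical Hadoop home and namenode address:

client = HDFSClient('/usr/local/hadoop', {'fs.default.name': 'hdfs://namenode:54310'})
print(client.pre_commands)
# ['/usr/local/hadoop/bin/hadoop', 'fs', '-Dfs.default.name=hdfs://namenode:54310']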
def __run_hdfs_cmd(self, commands, retry_times=5):
whole_commands = copy.deepcopy(self.pre_commands)
whole_commands.extend(commands)
print('Running system command: {0}'.format(' '.join(whole_commands)))
ret_code = 0
ret_out = None
ret_err = None
for x in range(retry_times + 1):
proc = subprocess.Popen(
whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(output, errors) = proc.communicate()
ret_code, ret_out, ret_err = proc.returncode, output, errors
if ret_code:
_logger.warning(
    'Attempt %d: error running command: %s. Return code: %d, Error: %s'
    % (x, ' '.join(whole_commands), proc.returncode, errors))
else:
break
return ret_code, ret_out, ret_err
def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
"""
upload the local file to hdfs
args:
local_file_path: the local file path
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
return:
True or False
"""
assert hdfs_path is not None
assert local_path is not None and os.path.exists(local_path)
if os.path.isdir(local_path):
_logger.warning(
    "The local path: {} is a directory; directory upload is not "
    "supported yet, returning".format(local_path))
return False
base = os.path.basename(local_path)
if not self.is_exist(hdfs_path):
self.makedirs(hdfs_path)
else:
if self.is_exist(os.path.join(hdfs_path, base)):
if overwrite:
_logger.info(
    "The HDFS path: {} exists and overwrite is True, deleting it".
    format(hdfs_path))
self.delete(hdfs_path)
else:
_logger.error(
    "The HDFS path: {} exists and overwrite is False, returning".
    format(hdfs_path))
return False
put_commands = ["-put", local_path, hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(put_commands,
retry_times)
if returncode:
_logger.error("Put local path: {} to HDFS path: {} failed".format(
local_path, hdfs_path))
return False
else:
_logger.info("Put local path: {} to HDFS path: {} successfully".
format(local_path, hdfs_path))
return True
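A minimal usage sketch for the method above (paths are hypothetical; assumes the `client` built earlier and a reachable cluster):

ok = client.upload('/user/demo/data', './train.txt', overwrite=True, retry_times=3)
print('upload succeeded:', ok)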
def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
"""
download from hdfs
args:
local_file_path: the local file path
remote_file_path: remote dir on hdfs
return:
True or False
"""
_logger.info('Downloading %r to %r.', hdfs_path, local_path)
if not self.is_exist(hdfs_path):
print("HDFS path: {} do not exist".format(hdfs_path))
return False
if self.is_dir(hdfs_path):
    _logger.error(
        "The HDFS path: {} is a directory; directory download is not "
        "supported yet, returning".format(hdfs_path))
    return False
if os.path.exists(local_path):
base = os.path.basename(hdfs_path)
local_file = os.path.join(local_path, base)
if os.path.exists(local_file):
if overwrite:
os.remove(local_file)
else:
_logger.error(
    "The local path: {} exists and overwrite is False, returning".
    format(local_file))
return False
self.make_local_dirs(local_path)
download_commands = ["-get", hdfs_path, local_path]
returncode, output, errors = self.__run_hdfs_cmd(download_commands)
if returncode:
_logger.error("Get local path: {} from HDFS path: {} failed".format(
local_path, hdfs_path))
return False
else:
_logger.info("Get local path: {} from HDFS path: {} successfully".
format(local_path, hdfs_path))
return True
def is_exist(self, hdfs_path=None):
"""
whether the remote hdfs path exists?
args:
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
fs_name: The default values are the same as in the job configuration
fs_ugi: The default values are the same as in the job configuration
return:
True or False
"""
exist_cmd = ['-test', '-e', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(
exist_cmd, retry_times=1)
if returncode:
_logger.error("HDFS is_exist HDFS path: {} failed".format(
hdfs_path))
return False
else:
_logger.info("HDFS is_exist HDFS path: {} successfully".format(
hdfs_path))
return True
def is_dir(self, hdfs_path=None):
"""
whether the remote hdfs path exists?
args:
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
fs_name: The default values are the same as in the job configuration
fs_ugi: The default values are the same as in the job configuration
return:
True or False
"""
if not self.is_exist(hdfs_path):
return False
dir_cmd = ['-test', '-d', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
if returncode:
_logger.error("HDFS path: {} failed is not a directory".format(
hdfs_path))
return False
else:
_logger.info("HDFS path: {} successfully is a directory".format(
hdfs_path))
return True
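A short sketch combining the two checks above to guard a single-file download (hypothetical paths):

if client.is_exist('/user/demo/model.tar') and not client.is_dir('/user/demo/model.tar'):
    client.download('/user/demo/model.tar', './local_models')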
def delete(self, hdfs_path):
"""Remove a file or directory from HDFS.
:param hdfs_path: HDFS path.
:param recursive: Recursively delete files and directories. By default,
this method will raise an :class:`HdfsError` if trying to delete a
non-empty directory.
This function returns `True` if the deletion was successful and `False` if
no file or directory previously existed at `hdfs_path`.
"""
_logger.info('Deleting %r.', hdfs_path)
if not self.is_exist(hdfs_path):
_logger.warn("HDFS path: {} do not exist".format(hdfs_path))
return True
if self.is_dir(hdfs_path):
del_cmd = ['-rmr', hdfs_path]
else:
del_cmd = ['-rm', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0)
if returncode:
_logger.error("HDFS path: {} delete files failure".format(
hdfs_path))
return False
else:
_logger.info("HDFS path: {} delete files successfully".format(
hdfs_path))
return True
def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
"""Move a file or folder.
:param hdfs_src_path: Source path.
:param hdfs_dst_path: Destination path. If the path already exists and is
a directory, the source will be moved into it. If the path exists and is
a file, or if a parent destination directory is missing, this method will
raise an :class:`HdfsError`.
"""
assert hdfs_src_path is not None
assert hdfs_dst_path is not None
if not self.is_exist(hdfs_src_path):
    _logger.error("HDFS source path does not exist: {}".format(
        hdfs_src_path))
    return False
if self.is_exist(hdfs_dst_path) and not overwrite:
    _logger.error("HDFS destination path exists: {} and overwrite=False".
                  format(hdfs_dst_path))
    return False
rename_command = ['-mv', hdfs_src_path, hdfs_dst_path]
returncode, output, errors = self.__run_hdfs_cmd(
rename_command, retry_times=1)
if returncode:
_logger.error("HDFS rename path: {} to {} failed".format(
hdfs_src_path, hdfs_dst_path))
return False
else:
_logger.info("HDFS rename path: {} to {} successfully".format(
hdfs_src_path, hdfs_dst_path))
return True
@staticmethod
def make_local_dirs(local_path):
try:
os.makedirs(local_path)
except OSError as e:
if e.errno != errno.EEXIST:
raise
def makedirs(self, hdfs_path):
"""Create a remote directory, recursively if necessary.
:param hdfs_path: Remote path. Intermediate directories will be created
appropriately.
"""
_logger.info('Creating directories to %r.', hdfs_path)
assert hdfs_path is not None
if self.is_exist(hdfs_path):
return
mkdirs_commands = ['-mkdir', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(
mkdirs_commands, retry_times=1)
if returncode:
_logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
return False
else:
_logger.error("HDFS mkdir path: {} successfully".format(hdfs_path))
return True
def ls(self, hdfs_path):
assert hdfs_path is not None
if not self.is_exist(hdfs_path):
return []
ls_commands = ['-ls', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(
ls_commands, retry_times=1)
if returncode:
_logger.error("HDFS list path: {} failed".format(hdfs_path))
return []
else:
_logger.info("HDFS list path: {} successfully".format(hdfs_path))
ret_lines = []
regex = re.compile(r'\s+')
out_lines = output.strip().split("\n")
for line in out_lines:
re_line = regex.split(line)
if len(re_line) == 8:
ret_lines.append(re_line[7])
return ret_lines
def lsr(self, hdfs_path, only_file=True, sort=True):
def sort_key(v):
    # sort entries by their modification time
    return datetime.strptime(v[1], '%Y-%m-%d %H:%M')
assert hdfs_path is not None
if not self.is_exist(hdfs_path):
return []
ls_commands = ['-lsr', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(
ls_commands, retry_times=1)
if returncode:
_logger.error("HDFS list all files: {} failed".format(hdfs_path))
return []
else:
_logger.info("HDFS list all files: {} successfully".format(
hdfs_path))
lines = []
regex = re.compile(r'\s+')
out_lines = output.strip().split("\n")
for line in out_lines:
re_line = regex.split(line)
if len(re_line) == 8:
if only_file and re_line[0][0] == "d":
continue
else:
lines.append(
(re_line[7], re_line[5] + " " + re_line[6]))
if sort:
    lines.sort(key=sort_key)
ret_lines = [ret[0] for ret in lines]
return ret_lines
def multi_upload(client,
hdfs_path,
local_path,
multi_processes=5,
overwrite=False):
"""
:param overwrite: whether to overwrite existing files on HDFS
:param multi_processes: number of upload processes to run concurrently, default=5
:param client: instance of HDFSClient
:param hdfs_path: path on hdfs
:param local_path: path on local
:return:
"""
def __subprocess_upload(datas):
for data in datas:
re_path = os.path.relpath(os.path.dirname(data), local_path)
hdfs_re_path = os.path.join(hdfs_path, re_path)
client.upload(hdfs_re_path, data, overwrite, retry_times=5)
def get_local_files(path):
rlist = []
if not os.path.isdir(path):
return rlist
for dirname, folder, files in os.walk(path):
for i in files:
t = os.path.join(dirname, i)
rlist.append(t)
return rlist
assert isinstance(client, HDFSClient)
all_files = get_local_files(local_path)
if not all_files:
_logger.info("there are nothing need to upload, exit")
return
_logger.info("Start {} multi process to upload datas".format(
multi_processes))
procs = []
for i in range(multi_processes):
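# round-robin shard: process i takes files i, i+N, i+2N, ... of the list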
process_datas = all_files[i::multi_processes]
p = multiprocessing.Process(
target=__subprocess_upload, args=(process_datas, ))
procs.append(p)
p.start()
# complete the processes
for proc in procs:
proc.join()
_logger.info("Finish {} multi process to upload datas".format(
multi_processes))
def multi_download(client,
hdfs_path,
local_path,
trainer_id,
trainers,
multi_processes=5):
"""
multi_download
:param client: instance of HDFSClient
:param hdfs_path: path on hdfs
:param local_path: path on local
:param trainer_id: current trainer id
:param trainers: all trainers number
:param multi_processes: number of download processes to run concurrently, default=5
:return: None
"""
def __subprocess_download(datas):
for data in datas:
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
local_re_path = os.path.join(local_path, re_path)
client.download(data, local_re_path)
assert isinstance(client, HDFSClient)
client.make_local_dirs(local_path)
_logger.info("Make local dir {} successfully".format(local_path))
all_need_download = client.lsr(hdfs_path, sort=True)
need_download = all_need_download[trainer_id::trainers]
_logger.info("Get {} files From all {} files need to be download from {}".
format(len(need_download), len(all_need_download), hdfs_path))
_logger.info("Start {} multi process to download datas".format(
multi_processes))
procs = []
for i in range(multi_processes):
process_datas = need_download[i::multi_processes]
p = multiprocessing.Process(
target=__subprocess_download, args=(process_datas, ))
procs.append(p)
p.start()
# complete the processes
for proc in procs:
proc.join()
_logger.info("Finish {} multi process to download datas".format(
multi_processes))
local_downloads = []
for data in need_download:
data_name = os.path.basename(data)
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
local_re_path = os.path.join(local_path, re_path, data_name)
local_downloads.append(local_re_path)
return local_downloads
if __name__ == "__main__":
hadoop_home = "/home/client/hadoop-client/hadoop/"
configs = {
"fs.default.name": "hdfs://xxx.hadoop.com:54310",
"hadoop.job.ugi": "hello,hello123"
}
client = HDFSClient(hadoop_home, configs)
client.ls("/user/com/train-25")
files = client.lsr("/user/com/train-25/models")
downloads = multi_download(
client,
"/user/com/train-25/model",
"/home/xx/data1",
1,
5,
multi_processes=5)
multi_upload(client, "/user/com/train-25/model", "/home/xx/data1")
......@@ -347,72 +347,70 @@ def _copy_reader_create_op_(block, op):
return new_op
if os.name != 'nt':
@templatedoc(op_type='create_recordio_file_reader')
def open_recordio_file(filename,
shapes,
lod_levels,
dtypes,
pass_num=1,
for_parallel=True):
"""
${comment}
Args:
filename(${filename_type}): ${filename_comment}.
shapes(list): List of tuples declaring the data shapes.
lod_levels(${lod_levels_type}): ${lod_levels_comment}.
dtypes(list): List of strings declaring the data types.
pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel.
Returns:
${out_comment}.
Examples:
>>> import paddle.fluid as fluid
>>> reader = fluid.layers.io.open_recordio_file(
>>> filename='./data.recordio',
>>> shapes=[(3,224,224), (1)],
>>> lod_levels=[0, 0],
>>> dtypes=['float32', 'int64'])
>>> # Via the reader, we can use 'read_file' layer to get data:
>>> image, label = fluid.layers.io.read_file(reader)
"""
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = []
ranks = []
@templatedoc(op_type='create_recordio_file_reader')
def open_recordio_file(filename,
shapes,
lod_levels,
dtypes,
pass_num=1,
for_parallel=True):
"""
${comment}
for shape in shapes:
shape_concat.extend(shape)
ranks.append(len(shape))
Args:
filename(${filename_type}): ${filename_comment}.
shapes(list): List of tuples declaring the data shapes.
lod_levels(${lod_levels_type}): ${lod_levels_comment}.
dtypes(list): List of strings declaring the data types.
pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel.
var_name = unique_name('open_recordio_file')
Returns:
${out_comment}.
startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name)
startup_blk.append_op(
type='create_recordio_file_reader',
outputs={'Out': [startup_var]},
attrs={
'shape_concat': shape_concat,
'lod_levels': lod_levels,
'filename': filename,
'ranks': ranks
})
Examples:
startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True
main_prog_var = _copy_reader_var_(
default_main_program().current_block(), startup_var)
>>> import paddle.fluid as fluid
>>> reader = fluid.layers.io.open_recordio_file(
>>> filename='./data.recordio',
>>> shapes=[(3,224,224), (1)],
>>> lod_levels=[0, 0],
>>> dtypes=['float32', 'int64'])
>>> # Via the reader, we can use 'read_file' layer to get data:
>>> image, label = fluid.layers.io.read_file(reader)
"""
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = []
ranks = []
if pass_num > 1:
main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
for shape in shapes:
shape_concat.extend(shape)
ranks.append(len(shape))
var_name = unique_name('open_recordio_file')
startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name)
startup_blk.append_op(
type='create_recordio_file_reader',
outputs={'Out': [startup_var]},
attrs={
'shape_concat': shape_concat,
'lod_levels': lod_levels,
'filename': filename,
'ranks': ranks
})
return monkey_patch_reader_methods(main_prog_var)
startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var)
if pass_num > 1:
main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
return monkey_patch_reader_methods(main_prog_var)
def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
......
......@@ -85,6 +85,7 @@ __all__ = [
'row_conv',
'multiplex',
'layer_norm',
'group_norm',
'softmax_with_cross_entropy',
'smooth_l1',
'one_hot',
......@@ -343,128 +344,126 @@ def embedding(input,
return tmp
if os.name != 'nt':
@templatedoc(op_type="lstm")
def dynamic_lstm(input,
size,
h_0=None,
c_0=None,
param_attr=None,
bias_attr=None,
use_peepholes=True,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
dtype='float32',
name=None):
"""
${comment}
@templatedoc(op_type="lstm")
def dynamic_lstm(input,
size,
h_0=None,
c_0=None,
param_attr=None,
bias_attr=None,
use_peepholes=True,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
dtype='float32',
name=None):
"""
${comment}
Args:
input (Variable): ${input_comment}
size (int): 4 * hidden size.
h_0(Variable): The initial hidden state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the hidden size.
c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size. `h_0` and `c_0` can be NULL but only at the same time.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights.
- Weights = {:math:`W_{ch}, W_{ih}, \
W_{fh}, W_{oh}`}
- The shape is (D x 4D), where D is the hidden
size.
If it is set to None or one attribute of ParamAttr,
dynamic_lstm will create ParamAttr as param_attr.
If the Initializer of the param_attr is not set, the
parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|None): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden
bias weights and peephole connections weights if
setting `use_peepholes` to `True`.
1. `use_peepholes = False`
- Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is (1 x 4D).
2. `use_peepholes = True`
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}.
- The shape is (1 x 7D).
If it is set to None or one attribute of ParamAttr,
dynamic_lstm will create ParamAttr as bias_attr.
If the Initializer of the bias_attr is not set,
the bias is initialized zero. Default: None.
use_peepholes (bool): ${use_peepholes_comment}
is_reverse (bool): ${is_reverse_comment}
gate_activation (str): ${gate_activation_comment}
cell_activation (str): ${cell_activation_comment}
candidate_activation (str): ${candidate_activation_comment}
dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
tuple: The hidden state, and cell state of LSTM. The shape of both \
is (T x D), and lod is the same with the `input`.
Examples:
.. code-block:: python
hidden_dim = 512
forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
"""
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstm."
helper = LayerHelper('lstm', **locals())
size = size // 4
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
bias_size = [1, 7 * size]
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
Args:
input (Variable): ${input_comment}
size (int): 4 * hidden size.
h_0(Variable): The initial hidden state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the hidden size.
c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size. `h_0` and `c_0` can be NULL but only at the same time.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights.
hidden = helper.create_variable_for_type_inference(dtype)
cell = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, size), \
'The shape of h0 should be (batch_size, %d)' % size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
- Weights = {:math:`W_{ch}, W_{ih}, \
W_{fh}, W_{oh}`}
- The shape is (D x 4D), where D is the hidden
size.
helper.append_op(
type='lstm',
inputs=inputs,
outputs={
'Hidden': hidden,
'Cell': cell,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
'candidate_activation': candidate_activation
})
return hidden, cell
If it is set to None or one attribute of ParamAttr,
dynamic_lstm will create ParamAttr as param_attr.
If the Initializer of the param_attr is not set, the
parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|None): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden
bias weights and peephole connections weights if
setting `use_peepholes` to `True`.
1. `use_peepholes = False`
- Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is (1 x 4D).
2. `use_peepholes = True`
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}.
- The shape is (1 x 7D).
If it is set to None or one attribute of ParamAttr,
dynamic_lstm will create ParamAttr as bias_attr.
If the Initializer of the bias_attr is not set,
the bias is initialized zero. Default: None.
use_peepholes (bool): ${use_peepholes_comment}
is_reverse (bool): ${is_reverse_comment}
gate_activation (str): ${gate_activation_comment}
cell_activation (str): ${cell_activation_comment}
candidate_activation (str): ${candidate_activation_comment}
dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
tuple: The hidden state, and cell state of LSTM. The shape of both \
is (T x D), and lod is the same with the `input`.
Examples:
.. code-block:: python
hidden_dim = 512
forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
"""
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstm."
helper = LayerHelper('lstm', **locals())
size = size // 4
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
bias_size = [1, 7 * size]
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
hidden = helper.create_variable_for_type_inference(dtype)
cell = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, size), \
'The shape of h0 should be (batch_size, %d)' % size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
helper.append_op(
type='lstm',
inputs=inputs,
outputs={
'Hidden': hidden,
'Cell': cell,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
'candidate_activation': candidate_activation
})
return hidden, cell
def dynamic_lstmp(input,
......@@ -963,43 +962,39 @@ def linear_chain_crf(input, label, param_attr=None):
return log_likelihood
if os.name != 'nt':
@templatedoc()
def crf_decoding(input, param_attr, label=None):
"""
${comment}
@templatedoc()
def crf_decoding(input, param_attr, label=None):
"""
${comment}
Args:
input(${emission_type}): ${emission_comment}
Args:
input(${emission_type}): ${emission_comment}
param_attr(ParamAttr): The parameter attribute for training.
param_attr(ParamAttr): The parameter attribute for training.
label(${label_type}): ${label_comment}
label(${label_type}): ${label_comment}
Returns:
Variable: ${viterbi_path_comment}
Returns:
Variable: ${viterbi_path_comment}
Examples:
.. code-block:: python
Examples:
.. code-block:: python
crf_decode = layers.crf_decoding(
input=hidden, param_attr=ParamAttr(name="crfw"))
"""
helper = LayerHelper('crf_decoding', **locals())
transition = helper.get_parameter(param_attr.name)
viterbi_path = helper.create_variable_for_type_inference(
dtype=helper.input_dtype())
helper.append_op(
type='crf_decoding',
inputs={
"Emission": [input],
crf_decode = layers.crf_decoding(
input=hidden, param_attr=ParamAttr(name="crfw"))
"""
helper = LayerHelper('crf_decoding', **locals())
transition = helper.get_parameter(param_attr.name)
viterbi_path = helper.create_variable_for_type_inference(
dtype=helper.input_dtype())
helper.append_op(
type='crf_decoding',
inputs={"Emission": [input],
"Transition": transition,
"Label": label
},
outputs={"ViterbiPath": [viterbi_path]})
"Label": label},
outputs={"ViterbiPath": [viterbi_path]})
return viterbi_path
return viterbi_path
@templatedoc()
......@@ -2553,6 +2548,84 @@ def layer_norm(input,
return helper.append_activation(layer_norm_out)
@templatedoc()
def group_norm(input,
groups,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
act=None,
data_layout='NCHW',
name=None):
"""
**Group Normalization Layer**
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
Args:
input(Variable): The input tensor variable.
groups(int): The number of groups into which the channels are divided.
epsilon(float): The small value added to the variance to prevent
division by zero.
param_attr(ParamAttr|None): The parameter attribute for the learnable
scale :math:`g`. If it is set to False, no scale will be added to the output units.
If it is set to None, the scale is initialized to one. Default: None.
bias_attr(ParamAttr|None): The parameter attribute for the learnable
bias :math:`b`. If it is set to False, no bias will be added to the output units.
If it is set to None, the bias is initialized to zero. Default: None.
act(str): Activation to be applied to the output of group normalization.
data_layout(string|NCHW): Only NCHW is supported.
name (str): The name of this layer. It is optional.
Returns:
Variable: A tensor variable which is the result after applying group normalization on the input.
Examples:
>>> data = fluid.layers.data(name='data', shape=[8, 32, 32],
>>> dtype='float32')
>>> x = fluid.layers.group_norm(input=data, groups=4)
"""
helper = LayerHelper('group_norm', **locals())
dtype = helper.input_dtype()
# create input and parameters
inputs = {'X': input}
input_shape = input.shape
if data_layout != 'NCHW':
raise ValueError("unsupported data layout:" + data_layout)
param_shape = [input_shape[1]]
if param_attr:
scale = helper.create_parameter(
attr=helper.param_attr,
shape=param_shape,
dtype=dtype,
default_initializer=Constant(1.0))
inputs['Scale'] = scale
if bias_attr:
bias = helper.create_parameter(
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
inputs['Bias'] = bias
# create output
mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
group_norm_out = helper.create_tmp_variable(dtype)
helper.append_op(
type="group_norm",
inputs=inputs,
outputs={
"Y": group_norm_out,
"Mean": mean_out,
"Variance": variance_out,
},
attrs={"epsilon": epsilon,
"groups": groups})
return helper.append_activation(group_norm_out)
def conv2d_transpose(input,
num_filters,
output_size=None,
......@@ -5642,48 +5715,42 @@ def label_smooth(label,
return smooth_label
if os.name != 'nt':
@templatedoc()
def roi_pool(input,
rois,
pooled_height=1,
pooled_width=1,
spatial_scale=1.0):
"""
${comment}
Args:
input (Variable): ${x_comment}
rois (Variable): ROIs (Regions of Interest) to pool over.
pooled_height (integer): ${pooled_height_comment} Default: 1
pooled_width (integer): ${pooled_width_comment} Default: 1
spatial_scale (float): ${spatial_scale_comment} Default: 1.0
Returns:
Variable: ${out_comment}.
Examples:
.. code-block:: python
pool_out = fluid.layers.roi_pool(input=x, rois=rois, pooled_height=7, pooled_width=7, spatial_scale=1.0)
"""
helper = LayerHelper('roi_pool', **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
argmaxes = helper.create_variable_for_type_inference(dtype='int32')
helper.append_op(
type="roi_pool",
inputs={"X": input,
"ROIs": rois},
outputs={"Out": pool_out,
"Argmax": argmaxes},
attrs={
"pooled_height": pooled_height,
"pooled_width": pooled_width,
"spatial_scale": spatial_scale
})
return pool_out
@templatedoc()
def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
"""
${comment}
Args:
input (Variable): ${x_comment}
rois (Variable): ROIs (Regions of Interest) to pool over.
pooled_height (integer): ${pooled_height_comment} Default: 1
pooled_width (integer): ${pooled_width_comment} Default: 1
spatial_scale (float): ${spatial_scale_comment} Default: 1.0
Returns:
Variable: ${out_comment}.
Examples:
.. code-block:: python
pool_out = fluid.layers.roi_pool(input=x, rois=rois, pooled_height=7, pooled_width=7, spatial_scale=1.0)
"""
helper = LayerHelper('roi_pool', **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
argmaxes = helper.create_variable_for_type_inference(dtype='int32')
helper.append_op(
type="roi_pool",
inputs={"X": input,
"ROIs": rois},
outputs={"Out": pool_out,
"Argmax": argmaxes},
attrs={
"pooled_height": pooled_height,
"pooled_width": pooled_width,
"spatial_scale": spatial_scale
})
return pool_out
@templatedoc()
......
......@@ -100,26 +100,27 @@ Examples:
>>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
"""
if os.name != 'nt':
__all__ += ['cumsum']
_cum_sum_ = generate_layer_fn('cumsum')
def cumsum(x, axis=None, exclusive=None, reverse=None):
locals_var = locals().keys()
kwargs = dict()
for name in locals_var:
val = locals()[name]
if val is not None:
kwargs[name] = val
return _cum_sum_(**kwargs)
cumsum.__doc__ = _cum_sum_.__doc__ + """
Examples:
>>> data = fluid.layers.data(name="input", shape=[32, 784])
>>> result = fluid.layers.cumsum(data, axis=0)
"""
__all__ += ['cumsum']
_cum_sum_ = generate_layer_fn('cumsum')
def cumsum(x, axis=None, exclusive=None, reverse=None):
locals_var = locals().keys()
kwargs = dict()
for name in locals_var:
val = locals()[name]
if val is not None:
kwargs[name] = val
return _cum_sum_(**kwargs)
cumsum.__doc__ = _cum_sum_.__doc__ + """
Examples:
>>> data = fluid.layers.data(name="input", shape=[32, 784])
>>> result = fluid.layers.cumsum(data, axis=0)
"""
__all__ += ['thresholded_relu']
......
......@@ -23,6 +23,12 @@ if(NOT WITH_DISTRIBUTE)
LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
endif(NOT WITH_DISTRIBUTE)
if (NOT ${WITH_GPU})
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
endif()
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
......@@ -75,10 +81,12 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
list(REMOVE_ITEM TEST_OPS test_dist_transformer)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
list(REMOVE_ITEM TEST_OPS test_interpolate_op)
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
py_test_modules(test_interpolate_op MODULES test_interpolate_op SERIAL)
if(WITH_DISTRIBUTE)
py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
......
......@@ -381,8 +381,8 @@ class OpTest(unittest.TestCase):
outs.sort(key=len)
checker(outs)
def __assert_is_close(self, numeric_grads, analytic_grads, names,
max_relative_error, msg_prefix):
def _assert_is_close(self, numeric_grads, analytic_grads, names,
max_relative_error, msg_prefix):
for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
abs_a = np.abs(a)
......@@ -451,9 +451,9 @@ class OpTest(unittest.TestCase):
analytic_grads = self._get_gradient(inputs_to_check, place,
output_names, no_grad_set)
self.__assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
max_relative_error,
"Gradient Check On %s" % str(place))
self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
max_relative_error,
"Gradient Check On %s" % str(place))
@staticmethod
def _numpy_to_lod_tensor(np_value, lod, place):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from operator import mul
import paddle.fluid.core as core
import paddle.fluid as fluid
from op_test import OpTest
from testsuite import create_op
def group_norm_naive(x, scale, bias, epsilon, groups):
N, C, H, W = x.shape
G = groups
x = x.reshape((N * G, -1))
mean = np.mean(x, axis=1, keepdims=True)
var = np.var(x, axis=1, keepdims=True)
output = (x - mean) / np.sqrt(var + epsilon)
output = output.reshape((N, C, H, W)) * scale.reshape(
(-1, 1, 1)) + bias.reshape((-1, 1, 1))
return output, mean.reshape((N, G)), var.reshape((N, G))
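A quick self-check of the naive reference above (shapes chosen arbitrarily):

x = np.random.rand(2, 4, 3, 3).astype('float32')
scale, bias = np.ones(4, 'float32'), np.zeros(4, 'float32')
y, mean, var = group_norm_naive(x, scale, bias, epsilon=1e-5, groups=2)
assert y.shape == (2, 4, 3, 3) and mean.shape == (2, 2) and var.shape == (2, 2)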
class TestGroupNormOp(OpTest):
def setUp(self):
self.op_type = "group_norm"
self.data_format = "NCHW"
self.dtype = np.float32
self.shape = (2, 4, 3, 3)
self.attrs = {'epsilon': 1e-5, 'groups': 2}
self.compare_between_place = False
self.init_test_case()
input = np.random.random(self.shape).astype(self.dtype)
scale = np.random.random([self.shape[1]]).astype(self.dtype)
bias = np.random.random([self.shape[1]]).astype(self.dtype)
output, mean, var = group_norm_naive(
input, scale, bias, self.attrs['epsilon'], self.attrs['groups'])
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(input),
'Scale': OpTest.np_dtype_to_fluid_dtype(scale),
'Bias': OpTest.np_dtype_to_fluid_dtype(bias)
}
self.outputs = {'Y': output, 'Mean': mean, 'Variance': var}
def test_check_output(self):
atol = 1e-4
place = core.CPUPlace()
self.check_output_with_place(place, atol=atol)
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=atol)
def do_compare_between_place(self):
if not core.is_compiled_with_cuda(): return
place = core.CPUPlace()
place2 = core.CUDAPlace(0)
self.scope = core.Scope()
op_inputs = self.inputs if hasattr(self, "inputs") else dict()
op_outputs = self.outputs if hasattr(self, "outputs") else dict()
op_attrs = self.attrs if hasattr(self, "attrs") else dict()
self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
op_attrs)
inputs_to_check = set(['X', 'Scale', 'Bias'])
output_names = 'Y'
cpu_grads = self._get_gradient(inputs_to_check, place, output_names,
None)
gpu_grads = self._get_gradient(inputs_to_check, place2, output_names,
None)
self._assert_is_close(cpu_grads, gpu_grads, inputs_to_check, 0.005,
"Gradient Check On %s" % str(place))
def test_check_grad(self):
if self.compare_between_place:
self.do_compare_between_place()
return
place = core.CPUPlace()
self.check_grad_with_place(
place, set(['X', 'Scale', 'Bias']), 'Y', max_relative_error=0.01)
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
self.check_grad_with_place(
place,
set(['X', 'Scale', 'Bias']),
'Y',
max_relative_error=0.01)
def init_test_case(self):
pass
class TestGroupNormOp1(TestGroupNormOp):
def init_test_case(self):
self.attrs['groups'] = 1
class TestGroupNormOp2(TestGroupNormOp):
def init_test_case(self):
self.attrs['groups'] = 4
class TestGroupNormOpBigEps1(TestGroupNormOp):
def init_test_case(self):
self.attrs['groups'] = 1
self.attrs['epsilon'] = 0.5
class TestGroupNormOpBigEps2(TestGroupNormOp):
def init_test_case(self):
self.attrs['groups'] = 4
self.attrs['epsilon'] = 0.5
class TestGroupNormOpBigEps3(TestGroupNormOp):
def init_test_case(self):
self.attrs['epsilon'] = 0.5
class TestGroupNormOpLargeData(TestGroupNormOp):
def init_test_case(self):
self.shape = (2, 32, 64, 64)
self.attrs['groups'] = 8
self.compare_between_place = True
if __name__ == '__main__':
unittest.main()
......@@ -36,17 +36,21 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf
tar xzf protobuf-cpp-3.1.0.tar.gz && \
cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \
go get github.com/Masterminds/glide && \
rm -rf /root/requirements.txt
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
......
......@@ -9,12 +9,12 @@ set -ex
# remove others to expedite build and reduce docker image size. The original
# manylinux docker image project builds many python versions.
# NOTE We added back 3.5.1, since auditwheel requires python 3.3+
CPYTHON_VERSIONS="2.7.11 3.5.1"
CPYTHON_VERSIONS="3.7.0 3.6.0 3.5.1 2.7.11"
# openssl version to build, with expected sha256 hash of .tar.gz
# archive
OPENSSL_ROOT=openssl-1.0.2l
OPENSSL_HASH=ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c
OPENSSL_ROOT=openssl-1.1.0i
OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99
EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
......@@ -25,7 +25,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
# Dependencies for compiling Python that we want to remove from
# the final image after compiling Python
PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel"
PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel"
# Libraries that are allowed as part of the manylinux1 profile
MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel"
......@@ -61,7 +61,7 @@ yum -y install bzip2 make git patch unzip bison yasm diffutils \
wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz && tar xzf cmake-3.5.2.tar.gz && \
cd cmake-3.5.2 && ./bootstrap && \
make -j4 && make install && cd .. && rm cmake-3.5.2.tar.gz
make -j8 && make install && cd .. && rm cmake-3.5.2.tar.gz
# Install newest autoconf
......@@ -77,11 +77,13 @@ mkdir -p /opt/python
build_cpythons $CPYTHON_VERSIONS
PY35_BIN=/opt/python/cp35-cp35m/bin
PY36_BIN=/opt/python/cp36-cp36m/bin
PY37_BIN=/opt/python/cp37-cp37m/bin
# NOTE Since our custom manylinux image builds pythons with shared
# libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running
# python.
ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}"
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib"
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib"
# Our openssl doesn't know how to find the system CA trust store
# (https://github.com/pypa/manylinux/issues/53)
......@@ -119,9 +121,8 @@ ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel
# final image
yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
avahi freetype bitstream-vera-fonts \
${PYTHON_COMPILE_DEPS} > /dev/null 2>&1
yum -y install ${MANYLINUX1_DEPS}
yum -y clean all > /dev/null 2>&1
${PYTHON_COMPILE_DEPS} > /dev/null 2>&1 || true
yum -y install ${MANYLINUX1_DEPS} && yum -y clean all > /dev/null 2>&1 || true
yum list installed
# we don't need libpython*.a, and they're many megabytes
find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
......
......@@ -52,9 +52,17 @@ function do_cpython_build {
# NOTE --enable-shared for generating libpython shared library needed for
# linking of some of the nupic.core test executables.
CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
make -j2 > /dev/null
make install > /dev/null
if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then
# NOTE python 3.7 should be installed via make altinstall rather than
# make install, and we should specify the location of ssl
CFLAGS="-Wformat" ./configure --prefix=${prefix} --with-openssl=/usr/local/ssl --enable-shared $unicode_flags > /dev/null
make -j8 > /dev/null
make altinstall > /dev/null
else
CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
make -j8 > /dev/null
make install > /dev/null
fi
popd
echo "ZZZ looking for libpython"
find / -name 'libpython*.so*'
......@@ -64,6 +72,9 @@ function do_cpython_build {
if [ -e ${prefix}/bin/python3 ]; then
ln -s python3 ${prefix}/bin/python
fi
if [ -e ${prefix}/bin/python3.7 ]; then
ln -s python3.7 ${prefix}/bin/python
fi
# NOTE Make libpython shared library visible to python calls below
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
......