Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into revert_vlog

test=develop

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into revert_vlog
test=develop
be04d99f · minqiyang · 53433d7f · 05b7ee7e · be04d99f · be04d99f
56 changed files
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -199,8 +199,11 @@ elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
 endif()
 else(NOT WIN32)
+list(APPEND CUDA_NVCC_FLAGS  "--compiler-options;/bigobj")
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
  list(APPEND CUDA_NVCC_FLAGS  "-g -G")
+  # Define _DEBUG so nvcc-compiled code matches cl's _ITERATOR_DEBUG_LEVEL.
+  list(APPEND CUDA_NVCC_FLAGS  "-D_DEBUG")
 elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
 else()

--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
@@ -26,7 +26,7 @@ ExternalProject_Add(
        extern_pybind
        ${EXTERNAL_PROJECT_LOG_ARGS}
        GIT_REPOSITORY  "https://github.com/pybind/pybind11.git"
-        GIT_TAG         "v2.1.1"
+        GIT_TAG         "v2.2.4"
        PREFIX          ${PYBIND_SOURCE_DIR}
        UPDATE_COMMAND  ""
        CONFIGURE_COMMAND ""

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -349,10 +349,17 @@ function(cc_test TARGET_NAME)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    if(WIN32)
+      list(APPEND win32_deps shlwapi)
+      if("${cc_test_DEPS};" MATCHES "python;")
+        list(REMOVE_ITEM cc_test_DEPS python)
+        list(APPEND win32_deps ${PYTHON_LIBRARIES})
+      endif()
+    endif(WIN32)
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    if(WIN32)
-      target_link_libraries(${TARGET_NAME} shlwapi)
+      target_link_libraries(${TARGET_NAME} ${win32_deps})
    endif(WIN32)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
@@ -683,7 +690,7 @@ function(py_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
+             COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
             FLAGS_cpu_deterministic=true
             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -26,10 +26,10 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara
 paddle.fluid.DistributeTranspilerConfig.__init__ 
 paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
 paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
-paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
-paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
-paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None
-paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None
+paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
+paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
+paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
+paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
 paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -116,14 +116,9 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)

-if (NOT WIN32)
 cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
    shape_inference data_transform lod_tensor profiler transfer_scope_cache)
-else()
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor)
-endif(NOT WIN32)

 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)


--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -23,7 +23,7 @@ namespace paddle {
 namespace framework {
 namespace details {

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places,
@@ -74,7 +74,7 @@ void AllReduceOpHandle::RunImpl() {
    }

    if (platform::is_gpu_place(lod_tensors[0]->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
      int dtype = -1;
      size_t numel = 0;

--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -20,7 +20,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -29,7 +29,7 @@ namespace framework {
 namespace details {

 struct AllReduceOpHandle : public OpHandleBase {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *ctxs);
@@ -49,7 +49,7 @@ struct AllReduceOpHandle : public OpHandleBase {
 private:
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  const platform::NCCLContextMap *nccl_ctxs_;
 #endif
 };

--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -82,7 +82,7 @@ void BroadcastOpHandle::BroadcastOneVar(
      });
    }
  } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    VarHandle *out_handle = nullptr;
    int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
    std::vector<std::function<void()>> broadcast_calls;

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -34,7 +34,7 @@ namespace details {

 struct BroadcastOpHandle : public OpHandleBase {
 public:
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *nccl_ctxs)
@@ -68,7 +68,7 @@ struct BroadcastOpHandle : public OpHandleBase {

  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  const platform::NCCLContextMap *nccl_ctxs_;
 #endif


--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -42,7 +42,7 @@ struct TestBroadcastOpHandle {
  std::vector<std::unique_ptr<ir::Node>> nodes_;
  std::vector<p::Place> place_list_;
  bool use_gpu_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif

@@ -50,7 +50,7 @@ struct TestBroadcastOpHandle {
    for (size_t j = 0; j < ctxs_.size(); ++j) {
      ctxs_[j]->Wait();
    }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    if (nccl_ctxs_) {
      nccl_ctxs_->WaitAll();
    }
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
  void InitCtxOnGpu(bool use_gpu) {
    use_gpu_ = use_gpu;
    if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {
        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -84,7 +84,7 @@ struct TestBroadcastOpHandle {
        place_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      nccl_ctxs_.reset(nullptr);
 #endif
    }
@@ -106,14 +106,14 @@ struct TestBroadcastOpHandle {
    nodes_.emplace_back(
        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
    if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                         place_list_, nccl_ctxs_.get());
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                         place_list_, nccl_ctxs_.get());
 #else

--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -96,7 +96,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &param_names,
    const std::vector<Scope *> &local_scopes,
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
 #else
    const bool use_cuda) const {
@@ -118,7 +118,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
      pass->Erase("local_scopes");
      pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
                                                    &local_scopes);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
      pass->Erase("nccl_ctxs");
      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);

--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -23,7 +23,7 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -98,7 +98,7 @@ struct BuildStrategy {
      const std::string &loss_var_name,
      const std::unordered_set<std::string> &param_names,
      const std::vector<Scope *> &local_scopes,
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
 #else
      const bool use_cuda) const;

--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace framework {
 namespace details {

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 DataBalanceOpHandle::DataBalanceOpHandle(
    ir::Node *node, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,

--- a/paddle/fluid/framework/details/data_balance_op_handle.h
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -29,7 +29,7 @@ namespace details {

 struct DataBalanceOpHandle : public OpHandleBase {
 public:
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                      const std::vector<platform::Place> &places,
                      const platform::NCCLContextMap *ctxs);

--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -25,7 +25,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -35,7 +35,7 @@ namespace details {

 struct FusedBroadcastOpHandle : public BroadcastOpHandle {
 public:
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  FusedBroadcastOpHandle(ir::Node *node,
                         const std::vector<Scope *> local_scopes,
                         const std::vector<platform::Place> &places,

--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -44,14 +44,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
    nodes_.emplace_back(
        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
    if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      op_handle_ = new FusedBroadcastOpHandle(
          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
      PADDLE_THROW("CUDA is not supported.");
 #endif
    } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      op_handle_ = new FusedBroadcastOpHandle(
          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -142,7 +142,7 @@ void MultiDevSSAGraphBuilder::Init() const {
  places_ = Get<const std::vector<platform::Place>>(kPlaces);
  local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
  strategy_ = Get<const BuildStrategy>(kStrategy);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
 #endif

@@ -431,7 +431,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
    }
  }
  bool use_gpu = false;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  use_gpu = nccl_ctxs_ != nullptr;
 #endif

@@ -478,7 +478,7 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {

 void MultiDevSSAGraphBuilder::SetCommunicationContext(
    OpHandleBase *op_handle, const platform::Place &p) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  if (nccl_ctxs_ == nullptr) {
    op_handle->SetDeviceContext(p,
                                platform::DeviceContextPool::Instance().Get(p));
@@ -492,7 +492,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
 void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
                                                const std::string &p_name,
                                                size_t src_dev_id) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  auto *op_handle = new BroadcastOpHandle(
      result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
      local_scopes_, places_, nccl_ctxs_);
@@ -522,7 +522,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
 void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
    ir::Graph *result,
    const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  auto *op_handle = new FusedBroadcastOpHandle(
      result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
      local_scopes_, places_, nccl_ctxs_);
@@ -568,7 +568,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,

 void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
                                                const std::string &og) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
      local_scopes_, places_, nccl_ctxs_));
@@ -597,7 +597,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,

 void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
    ir::Graph *result, const std::vector<std::string> &datas) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
      result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
      local_scopes_, places_, nccl_ctxs_));
@@ -694,7 +694,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
 VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
                                                   const std::string &og,
                                                   int dst_dev_id) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  result->Get<GraphOps>(kGraphOps).emplace_back(new ReduceOpHandle(
      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
      local_scopes_, places_, nccl_ctxs_));

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -40,7 +40,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
                         size_t device_id) const;
  void Init() const;

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif


--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -125,7 +125,7 @@ void ReduceOpHandle::RunImpl() {
        }
      });
    } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      auto pre_in = pre_in_var->Get<framework::LoDTensor>();
      VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
      VariableVisitor::GetMutableTensor(out_var).mutable_data(

--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -23,7 +23,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -35,7 +35,7 @@ struct ReduceOpHandle : public OpHandleBase {
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  const platform::NCCLContextMap *nccl_ctxs_;
  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places,

--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -35,7 +35,7 @@ struct TestReduceOpHandle {
  std::vector<p::Place> gpu_list_;
  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif

@@ -43,7 +43,7 @@ struct TestReduceOpHandle {
    for (size_t j = 0; j < ctxs_.size(); ++j) {
      ctxs_[j]->Wait();
    }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    if (nccl_ctxs_) {
      nccl_ctxs_->WaitAll();
    }
@@ -53,7 +53,7 @@ struct TestReduceOpHandle {
  void InitCtxOnGpu(bool use_gpu) {
    use_gpu_ = use_gpu;
    if (use_gpu) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {
        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -77,7 +77,7 @@ struct TestReduceOpHandle {
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      nccl_ctxs_.reset(nullptr);
 #endif
    }
@@ -99,14 +99,14 @@ struct TestReduceOpHandle {

    nodes.emplace_back(new ir::Node("node"));
    if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                          gpu_list_, nccl_ctxs_.get()));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                          gpu_list_, nccl_ctxs_.get()));
 #else

--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@@ -15,7 +15,10 @@
 #include "paddle/fluid/framework/ir/is_test_pass.h"

 #include <gtest/gtest.h>
-
+#ifdef _WIN32
+#undef FALSE
+#undef TRUE
+#endif
 namespace paddle {
 namespace framework {
 namespace ir {

--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -26,10 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"

-#if !defined(_WIN32)
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
-#endif  // _WIN32

 namespace paddle {
 namespace framework {
@@ -305,7 +303,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
  TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }

-#if !defined(_WIN32)
 void WriteToRecordIO(recordio::Writer *writer,
                     const std::vector<LoDTensor> &tensor,
                     const platform::DeviceContext &dev_ctx) {
@@ -335,19 +332,7 @@ bool ReadFromRecordIO(recordio::Scanner *scanner,

  return true;
 }
-#else
-class Writer {};
-class Scanner {};
-void WriteToRecordIO(recordio::Writer *writer,
-                     const std::vector<LoDTensor> &tensor,
-                     const platform::DeviceContext &dev_ctx) {}
-bool ReadFromRecordIO(recordio::Scanner *scanner,
-                      const platform::DeviceContext &dev_ctx,
-                      std::vector<LoDTensor> *result_ptr) {
-  PADDLE_ENFORCE("windows didn't supported recordio!.");
-  return true;
-}
-#endif  // _WIN32
+
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
    const std::vector<platform::Place> places) const {
  check_memory_size();

--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -274,7 +274,6 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
  EXPECT_EQ(offset_lod, expected);
 }

-#if !defined(_WIN32)
 template <typename T>
 static void TestRecordIO() {
  LoDTensor tensor;
@@ -321,7 +320,6 @@ TEST(LoDTensor, RecordIO) {
  TestRecordIO<float>();
  TestRecordIO<double>();
 }
-#endif  // !defined(_WIN32)

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -149,17 +149,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
  }

-// The profile has a process-wide mutex, results in serious performance issue
-// in concurrency scenerio. Here use an `if` to fix this issue.
-// Please not remove the `if`, ask @Superjomn if there are any concern.
-#ifndef _WIN32
+  // The profiler has a process-wide mutex, which causes serious performance
+  // issues in concurrent scenarios. The `if` below is used to avoid that.
+  // Please do not remove the `if`; ask @Superjomn if there are any concerns.
  if (platform::IsProfileEnabled()) {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    platform::RecordEvent record_event(Type(), pool.Get(place));
    RunImpl(scope, place);
-  } else  // NOLINT
-#endif
-  {
+  } else {
    RunImpl(scope, place);
  }
  VLOG(3) << place << " " << DebugStringEx(&scope);

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -19,6 +19,7 @@
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace inference {
@@ -75,7 +76,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
                     0.000932706};
  const size_t num_elements = outputs.front().data.length() / sizeof(float);
  // The outputs' buffers are in CPU memory.
-  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+  for (size_t i = 0; i < std::min((size_t)5UL, num_elements); i++) {
    LOG(INFO) << "data: "
              << static_cast<float*>(outputs.front().data.data())[i];
    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -56,7 +56,6 @@ bool AnalysisPredictor::Init(
    const std::shared_ptr<framework::Scope> &parent_scope,
    const std::shared_ptr<framework::ProgramDesc> &program) {
  VLOG(3) << "Predictor::init()";
-#if !defined(_WIN32)
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is actived, might affect the performance";
    LOG(INFO) << "You can turn off by set gflags '-profile false'";
@@ -64,7 +63,6 @@ bool AnalysisPredictor::Init(
                                           : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }
-#endif

  // no matter with or without MKLDNN
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
@@ -520,12 +518,10 @@ bool AnalysisPredictor::LoadParameters() {
 }

 AnalysisPredictor::~AnalysisPredictor() {
-#if !defined(_WIN32)
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
-#endif
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -64,7 +64,6 @@ void NativePaddlePredictor::PrepareFeedFetch() {
 bool NativePaddlePredictor::Init(
    std::shared_ptr<framework::Scope> parent_scope) {
  VLOG(3) << "Predictor::init()";
-#if !defined(_WIN32)
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is actived, might affect the performance";
    LOG(INFO) << "You can turn off by set gflags '-profile false'";
@@ -73,7 +72,6 @@ bool NativePaddlePredictor::Init(
                                           : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }
-#endif

  // no matter with or without MKLDNN
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
@@ -121,12 +119,10 @@ bool NativePaddlePredictor::Init(
 }

 NativePaddlePredictor::~NativePaddlePredictor() {
-#if !defined(_WIN32)
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
-#endif
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }

--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -15,10 +15,6 @@
 #pragma once

 #include <glog/logging.h>
-#if !defined(_WIN32)
-#include <sys/time.h>
-#else
-#endif

 #include <algorithm>
 #include <chrono>  // NOLINT
@@ -28,6 +24,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"

 namespace paddle {

--- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <gflags/gflags.h>
-#include <sys/time.h>
 #include <time.h>
 #include <algorithm>
 #include <fstream>

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -178,11 +178,9 @@ void TestOneThreadPrediction(
    warmup_timer.tic();
    predictor->Run(inputs[0], outputs, batch_size);
    PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
-#if !defined(_WIN32)
    if (FLAGS_profile) {
      paddle::platform::ResetProfiler();
    }
-#endif
  }

  LOG(INFO) << "Run " << num_times << " times...";
@@ -232,11 +230,9 @@ void TestMultiThreadPrediction(
        warmup_timer.tic();
        predictor->Run(inputs[0], outputs, batch_size);
        PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
-#if !defined(_WIN32)
        if (FLAGS_profile) {
          paddle::platform::ResetProfiler();
        }
-#endif
      }

      LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";

--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <sys/time.h>
 #include <time.h>
 #include <fstream>
 #include <thread>  // NOLINT

--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -20,6 +20,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"

 DECLARE_bool(use_mkldnn);

--- a/paddle/fluid/operators/beam_search_op_test.cc
+++ b/paddle/fluid/operators/beam_search_op_test.cc
@@ -46,7 +46,7 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) {
  auto* scores_data = scores->mutable_data<float>(place);
  vector<int64_t> _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
  vector<float> _scores(
-      {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1});
+      {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});

  for (int i = 0; i < 12; i++) {
    ids_data[i] = _ids[i];
@@ -80,7 +80,7 @@ TEST(DISABLED_beam_search_op, run) {
  ASSERT_EQ(sids.lod(), sscores.lod());

  vector<int> tids({4, 2, 3, 8});
-  vector<float> tscores({0.5, 0.6, 0.9, 0.7});
+  vector<float> tscores({0.5f, 0.6f, 0.9f, 0.7f});

  for (int i = 0; i < 4; i++) {
    ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);

--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <sys/time.h>
 #include <limits>

 #include "glog/logging.h"  // For VLOG
@@ -20,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/grpc_client.h"
 #include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"

 DECLARE_bool(rpc_disable_reuse_port);

--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
 #endif
-#include <sys/time.h>
 #include <thread>  // NOLINT

 #include "google/protobuf/io/coded_stream.h"
@@ -26,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/grpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"

 namespace paddle {

--- a/paddle/fluid/operators/distributed/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include <sys/time.h>
+
 #include <iostream>
 #include <string>
 #include <vector>
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/platform/port.h"

 #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"

--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -15,12 +15,12 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
 #endif
-#include <sys/time.h>
 #include <thread>  // NOLINT

 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
+#include "paddle/fluid/platform/port.h"

 DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not.");


--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include <sys/time.h>
 #include <iostream>
 #include <string>
 #include <vector>
@@ -24,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/platform/port.h"

 #include "paddle/fluid/operators/distributed/send_recv.pb.h"


--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <sys/time.h>
 #include <cmath>
 #include <cstring>
 #include <random>
@@ -22,6 +21,7 @@ limitations under the License. */
 #include "gtest/gtest.h"

 #include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/platform/port.h"

 inline double GetCurrentUS() {
  struct timeval time;

--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -14,9 +14,9 @@ limitations under the License. */

 #include "paddle/fluid/operators/math/im2col.h"
 #include <gtest/gtest.h>
-#include <sys/time.h>
 #include <vector>
 #include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
+#include "paddle/fluid/platform/port.h"

 template <typename DeviceContext, typename Place>
 void testIm2col() {

--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/jit_kernel.h"
-#include <sys/time.h>
 #include <cmath>    // for exp
 #include <cstring>  // for memcpy
 #include <random>
@@ -22,6 +21,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/platform/port.h"

 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"

--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -62,7 +62,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {

 #define CUDNN_ENFORCE(condition)                                     \
  do {                                                               \
-    cudnnStatus_t status = condition;                                \
+    auto status = condition;                                         \
    if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) {                  \
      PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
    }                                                                \

--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -51,7 +51,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
  struct DynLoad__##__name {                    \
    template <typename... Args>                 \
-    inline cudnnStatus_t operator()(Args... args) { \
+    inline auto operator()(Args... args) {      \
      return ::__name(args...);                 \
    }                                           \
  };                                            \

--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -19,7 +19,16 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"

-DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+// Default for FLAGS_fraction_of_gpu_memory_to_use, kept as a double so the
+// flag default is exactly the literal (a float 0.92f widens to 0.92000001...).
+// Windows uses a lower value: its graphics sub-system can occupy GPU memory.
+#ifndef _WIN32
+static constexpr double fraction_of_gpu_memory_to_use = 0.92;
+#else
+static constexpr double fraction_of_gpu_memory_to_use = 0.5;
+#endif
+
+DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
              "Allocate a trunk of gpu memory that is this fraction of the "
              "total gpu memory size. Future memory usage will be allocated "
              "from the trunk. If the trunk doesn't have enough gpu memory, "

--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -14,11 +14,11 @@

 #pragma once

+#include <ThreadPool.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <functional>
 #include <memory>
-#include "ThreadPool.h"
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {

--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -30,11 +30,12 @@ namespace pybind11 {
 namespace detail {

 // Can be replaced by a generic lambda in C++14
-struct variant_caster_visitor : public boost::static_visitor<handle> {
+struct __attribute__((visibility("hidden"))) paddle_variant_caster_visitor
+    : public boost::static_visitor<handle> {
  return_value_policy policy;
  handle parent;

-  variant_caster_visitor(return_value_policy policy, handle parent)
+  paddle_variant_caster_visitor(return_value_policy policy, handle parent)
      : policy(policy), parent(parent) {}

  template <class T>
@@ -44,10 +45,10 @@ struct variant_caster_visitor : public boost::static_visitor<handle> {
 };

 template <class Variant>
-struct variant_caster;
+struct paddle_variant_caster;

 template <template <class...> class V, class... Ts>
-struct variant_caster<V<Ts...>> {
+struct paddle_variant_caster<V<Ts...>> {
  using Type = V<Ts...>;

  template <typename T>
@@ -90,7 +91,7 @@ struct variant_caster<V<Ts...>> {

  static handle cast(Type const &src, return_value_policy policy,
                     handle parent) {
-    variant_caster_visitor visitor(policy, parent);
+    paddle_variant_caster_visitor visitor(policy, parent);
    return boost::apply_visitor(visitor, src);
  }

@@ -101,7 +102,7 @@ struct variant_caster<V<Ts...>> {
 // Add specialization for concrete variant type
 template <class... Args>
 struct type_caster<boost::variant<Args...>>
-    : variant_caster<boost::variant<Args...>> {};
+    : paddle_variant_caster<boost::variant<Args...>> {};

 }  // namespace detail
 }  // namespace pybind11

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -86,12 +86,12 @@ bool IsCompiledWithDIST() {
 #endif
 }

-PYBIND11_PLUGIN(core) {
+PYBIND11_MODULE(core, m) {
  // Not used, just make sure cpu_info.cc is linked.
  paddle::platform::CpuTotalPhysicalMemory();

  paddle::memory::allocation::UseAllocatorStrategyGFlag();
-  py::module m("core", "C++ core of PaddlePaddle");
+  m.doc() = "C++ core of PaddlePaddle";

  // using framework in this function. Since it is inside a function, it will
  // not cause namespace pollution.
@@ -907,7 +907,6 @@ All parameter, weight, gradient are variables in Paddle.
      });

  BindRecordIOWriter(&m);
-  return m.ptr();
 }
 }  // namespace pybind
 }  // namespace paddle
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
-#include "pybind11/common.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"


--- a/paddle/legacy/cuda/include/hl_warpctc_wrap.h
+++ b/paddle/legacy/cuda/include/hl_warpctc_wrap.h
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#ifndef _WIN32
 #ifndef HL_WARPCTC_WRAP_H_
 #define HL_WARPCTC_WRAP_H_
-
 #include "ctc.h"
 #include "hl_base.h"

@@ -91,3 +91,4 @@ extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
                                          size_t* bytes);

 #endif  // HL_WARPCTC_WRAP_H_
+#endif
--- a/paddle/legacy/cuda/src/hl_cuda_device.cc
+++ b/paddle/legacy/cuda/src/hl_cuda_device.cc
@@ -132,11 +132,16 @@ inline pid_t gettid() {
   uint64_t tid;
   pthread_threadid_np(NULL, &tid);
 #else
+#ifndef _WIN32
 #ifndef __NR_gettid
 #define __NR_gettid 224
 #endif
   pid_t tid = syscall(__NR_gettid);
+#else   // _WIN32
+  // NOTE(review): _getpid() yields the process id, not a per-thread id.
+  pid_t tid = _getpid();
+#endif  // _WIN32
 #endif
   CHECK_NE((int)tid, -1);
   return tid;
 }

--- a/paddle/legacy/utils/ThreadLocal.h
+++ b/paddle/legacy/utils/ThreadLocal.h
@@ -14,10 +14,12 @@ limitations under the License. */

 #pragma once

+#ifndef _WIN32
 #include <pthread.h>
 #include <sys/syscall.h>
-#include <sys/types.h>
 #include <unistd.h>
+#endif
+#include <sys/types.h>
 #include <map>
 #include <mutex>
 #include <random>

--- a/paddle/legacy/utils/Util.h
+++ b/paddle/legacy/utils/Util.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#ifndef _WIN32
 #include <sys/syscall.h>  // for syscall()
+#endif
 #include <sys/types.h>
 #include <algorithm>
 #include <cmath>
@@ -40,6 +42,48 @@ inline int rand_r(unsigned int* seedp) {
 }
 #endif
 
+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#endif
+#include <intrin.h>  // _BitScanReverse
+#include <windows.h>
+
+/**
+ * MSVC stand-ins for GCC's count-leading-zeros builtins.
+ *
+ * _BitScanReverse reports the index of the highest set bit, so the number of
+ * leading zeros is (bit width of T - 1 - index). Only the 32-bit intrinsic is
+ * used, on each half of the value, so 64-bit arguments are handled without
+ * relying on _BitScanReverse64. GCC leaves a zero argument undefined; this
+ * shim returns 0 for it.
+ */
+template <typename T>
+inline int __builtin_clz(const T& value) {
+  const int bits = static_cast<int>(sizeof(T) * 8);
+  const unsigned long long wide = static_cast<unsigned long long>(value);
+  DWORD index = 0;
+  // Scan the upper half first; it only exists for types wider than 32 bits.
+  if (bits > 32 && _BitScanReverse(&index, static_cast<DWORD>(wide >> 32))) {
+    return bits - 33 - static_cast<int>(index);
+  }
+  if (_BitScanReverse(&index, static_cast<DWORD>(wide))) {
+    return bits - 1 - static_cast<int>(index);
+  }
+  return 0;
+}
+
+inline int __builtin_clzl(const unsigned long& value) {
+  return __builtin_clz(value);
+}
+
+inline int __builtin_clzll(const unsigned long long& value) {
+  return __builtin_clz(value);
+}
+
+#define pid_t int
+#endif
+
 /**
 * Loop over the elements in a container
 * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -149,7 +149,7 @@ function cmake_gen() {
            elif [ "$1" == "cp37-cp37m" ]; then
                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
                export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
-                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7
            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
           fi

--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -3,8 +3,10 @@
 if(WITH_TESTING)
  add_library(paddle_test_main STATIC TestMain.cpp)
  add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
+  if(NOT WIN32)
    add_library(paddle_test_util STATIC TestUtil.cpp)
    add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
+  endif(NOT WIN32)
  if(NOT MOBILE_INFERENCE)
    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
  endif()

--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -46,8 +46,9 @@ def _is_numpy_(var):


 def _is_number_(var):
-    return isinstance(var, int) or isinstance(var, float) or (isinstance(
-        var, np.ndarray) and var.shape == (1, ))
+    # np.integer covers all NumPy int widths (e.g. int32), not just np.int64.
+    return isinstance(var, (int, float, np.integer)) or (isinstance(
+        var, np.ndarray) and var.shape == (1, ))


 def _is_number_or_matrix_(var):