cleared. staged

bf2e4cb1 · dzhwinter · ebfe5a02 · bf2e4cb1 · bf2e4cb1 · bf2e4cb1
16 changed files
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -173,6 +173,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 if (NOT WIN32) # windows msvc2015 support c++11 natively. 
 # -std=c++11 -fPIC not recoginize by msvc
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
+# In CUDA 9, suppress the CUDA warnings triggered by Eigen with "-w".
 list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC")
 else(NOT WIN32)
 list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC" "-Xcompiler /w")
@@ -181,7 +182,7 @@ endif(NOT WIN32)
 if(WITH_FAST_MATH)
  # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
  list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
-# in cuda9, suppress cuda warning on eigen 
+endif(WITH_FAST_MATH)

 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")

--- a/cmake/external/threadpool.cmake
+++ b/cmake/external/threadpool.cmake
@@ -3,6 +3,7 @@ INCLUDE(ExternalProject)
 SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
 SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
 INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
+message(STATUS "threadpool include dir: ${THREADPOOL_INCLUDE_DIR}")  # informational log, not a stderr notice

 ExternalProject_Add(
    extern_threadpool

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -143,26 +143,14 @@ set(GPU_COMMON_FLAGS
    -Wno-error=unused-function  # Warnings in Numpy Header.
    -Wno-error=array-bounds # Warnings in Eigen::array
 )
-set(COMMON_FLAGS 
-    -fPIC
-    -fno-omit-frame-pointer)
-set(GPU_COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer)
-
-else(NOT WIN32)
-set(COMMON_FLAGS
-    "/w") #disable all warnings.
-
-set(GPU_COMMON_FLAGS
-    "/w") #disable all warnings
-
-endif(NOT WIN32)
-
 else(NOT WIN32)
 set(COMMON_FLAGS
+    -fPIC
+    -fno-omit-frame-pointer
    "/w") #disable all warnings.
 set(GPU_COMMON_FLAGS
+    -fPIC
+    -fno-omit-frame-pointer
    "/w") #disable all warnings
 endif(NOT WIN32)


--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -48,6 +48,7 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
 }

+#ifndef _WIN32
 template <typename RefCntMap>
 static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
                                GarbageCollector<Tensor>* gc,
@@ -82,6 +83,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
    gc->Add(erase_tensors);
  }
 }
+#endif

 Executor::Executor(const platform::Place& place) : place_(place) {}

@@ -331,97 +333,35 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,

 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
    const ProgramDesc& program, int block_id) {
-  VLOG(3) << "before create prepare" << block_id << " " << program.Size();
  std::unique_ptr<ExecutorPrepareContext> ctx(
      new ExecutorPrepareContext(program, block_id));
-  VLOG(3) << "after create prepare";
-  // PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
-  VLOG(3) << "before create op_desc";
+  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
  auto& block = program.Block(block_id);
-  VLOG(3) << "create before" << ctx->ops_.size() << " "
-          << block.AllOps().size();
  int counter = 0;
  for (auto& op_desc : block.AllOps()) {
    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-    VLOG(3) << "create op "
-            << "index " << ++counter << " type " << op_desc->Type();
  }
-  VLOG(3) << "create finished" << ctx->ops_.size() << " "
-          << block.AllOps().size();
  return ctx;
 }

 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
    const ProgramDesc& program, const std::vector<int>& block_ids) {
-  VLOG(3) << "inside prepare";
  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
-  VLOG(3) << "before go through block_ids";
  for (auto& bid : block_ids) {
-    VLOG(3) << "block id" << bid;
    auto* ctx = new ExecutorPrepareContext(program, bid);
-    // PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
+    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
    auto& block = program.Block(bid);
-    int counter = 0;
-    VLOG(3) << "create before" << ctx->ops_.size() << " "
-            << block.AllOps().size();
    for (auto& op_desc : block.AllOps()) {
      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-      VLOG(3) << "create op "
-              << "index " << ++counter << " type " << op_desc->Type();
    }
-    VLOG(3) << "create finished" << ctx->ops_.size() << " "
-            << block.AllOps().size();
    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
  }
  return result;
 }

-// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx,
-// Scope* local_scope) {
-//     VLOG(3) << "before checking result";
-//   auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
-//   std::vector<std::string> outputs;
-//   auto& block = ctx->prog_.Block(0);
-//   bool found = false;
-//   framework::OpDesc* myop = nullptr;
-//   for(auto& op : block.AllOps()) {
-//     if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() ==
-//     "feed") return;
-//     if (op->Type() == op_type) {
-//         found = true;
-//         myop = op;
-//         break;
-//       }
-//     }
-//   }
-//   if(!found) {
-//     VLOG(3) << "not found op!";
-//     return;
-//   }
-//     auto* op = myop;
-//      VLOG(3) << "start op output" << op->Type();
-//     for(auto var_name: op->OutputArgumentNames()) {
-//       auto* var = local_scope->Var(var_name);
-//       auto* var_desc = block.FindVar(var_name);
-//       if (var_desc->Persistable()) continue;
-//       auto* tensor = var->GetMutable<framework::LoDTensor>();
-//       framework::Tensor check;
-//       VLOG(3) << "before tensor copy";
-//       framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
-//       VLOG(3) << "after tensor copy";
-//       float sum = .0;
-//       for(size_t i=0; i < check.numel(); ++i) {
-//           sum += check.data<float>()[i];
-//       }
-//       VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum "
-//       << sum;
-//   VLOG(3) << "after checking result";
-// }
-
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                  bool create_local_scope, bool create_vars,
                                  bool keep_kids) {
-  VLOG(3) << "RunPreparedContext inside";
  Scope* local_scope = scope;
  if (create_vars) {
    if (create_local_scope) {
@@ -430,6 +370,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
  }

+#ifndef _WIN32
  int64_t max_memory_size = GetEagerDeletionThreshold();
  std::unique_ptr<GarbageCollector<Tensor>> gc;
  // WhileOp would set keep_kids to false
@@ -471,6 +412,16 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  } else {
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }
+#else   // WIN32
+  for (auto& op : ctx->ops_) {
+    op->Run(*local_scope, place_);
+    if (FLAGS_benchmark) {
+      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
+              << memory::memory_usage(place_);
+    }
+  }
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+#endif  // NOT WIN32

  if (local_scope != scope) {
    scope->DeleteScope(local_scope);

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -17,12 +17,14 @@ limitations under the License. */
 #include <map>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
+#ifndef _WIN32
+#include "paddle/fluid/framework/garbage_collector.h"
+#endif

 namespace paddle {
 namespace framework {

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -35,9 +35,10 @@ endif()

 # Create static library
 if (WIN32)
-cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_thirdpa} paddle_fluid_api paddle_inference_api)
+cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_third_partys} paddle_fluid_api paddle_inference_api)
 else(WIND32)
 cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
+endif(WIN32)

 if(NOT APPLE)
  # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -51,6 +51,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)

 cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(helper SRCS helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)

--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -16,7 +16,6 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle_inference_api.h"

 namespace paddle {


--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -260,9 +260,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
  if (config.use_gpu) {
    // 1. GPU memeroy
    PADDLE_ENFORCE_GT(
-       config.fraction_of_gpu_memory, 0.f,
-       "fraction_of_gpu_memory in the config should be set to range (0.,
-       1.]");
+        config.fraction_of_gpu_memory, 0.f,
+        "fraction_of_gpu_memory in the config should be set to range (0.,1.]");
    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
    std::vector<std::string> flags;
    if (config.fraction_of_gpu_memory >= 0.0f ||

--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -31,10 +31,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle_inference_api.h"  // NOLINT

 namespace paddle {


--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -14,8 +14,9 @@

 #pragma once

+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 #include <glog/logging.h>
-
 #include <algorithm>
 #include <chrono>  // NOLINT
 #include <iterator>
@@ -23,9 +24,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/string/printf.h"
-#include "paddle_inference_api.h"
-#include "timer.h"
+#include "paddle_inference_api.h"  //NOLINT

 namespace paddle {
 namespace inference {
@@ -97,7 +96,7 @@ static void TensorAssignData(PaddleTensor *tensor,
 }

 template <typename T>
-static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+static int ZeroCopyTensorAssignData(paddle::ZeroCopyTensor *tensor,
                                    const std::vector<std::vector<T>> &data) {
  int size{0};
  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -284,12 +284,10 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
 op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
-if (NOT WIN32)
-    op_library(lstm_op DEPS sequence2batch lstm_compute)
-    op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
-    op_library(lstmp_op DEPS sequence2batch lstm_compute)
-    op_library(gru_op DEPS sequence2batch gru_compute)
-endif(NOT WIN32)
+op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
+op_library(lstmp_op DEPS sequence2batch lstm_compute)
+op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)

--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
@@ -31,12 +31,12 @@ namespace operators {

 template <typename T>
 __device__ bool GT_E(T a, T b) {
-  return (a > b) || fabs(a - b) < 1e-4;
+  return (a > b) || fabsf(static_cast<float>(a - b)) < 1e-4;
 }

 template <typename T>
 __device__ bool LT_E(T a, T b) {
-  return (a < b) || fabs(a - b) < 1e-4;
+  return (a < b) || fabsf(static_cast<float>(a - b)) < 1e-4;
 }

 template <typename T>

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -57,9 +57,6 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
-if (NOT WIN32)
-    math_library(matrix_bit_code)
-endif (NOT WIN32)
 math_library(unpooling)
 math_library(vol2col)

@@ -75,7 +72,10 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-cc_library(jit_kernel 
-    SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
-    DEPS cpu_info cblas)
-cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
+if (NOT WIN32)
+    math_library(matrix_bit_code)
+    cc_library(jit_kernel
+        SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
+        DEPS cpu_info cblas)
+    cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
+endif (NOT WIN32)
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -235,7 +235,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
            << ", Runtime Version: " << runtime_version_ / 1000 << "."
            << (runtime_version_ % 100) / 10;

+#ifndef _WIN32
  callback_manager_.reset(new StreamCallbackManager(stream_));
+#endif  // NOT WIN32
 }

 CUDADeviceContext::~CUDADeviceContext() {

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -31,7 +31,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/stream_callback_manager.h"
 #endif
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -115,6 +115,7 @@ class CUDADeviceContext : public DeviceContext {
    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
  }

+#ifndef _WIN32
  template <typename Callback>
  void AddStreamCallback(Callback&& callback) const {
    std::lock_guard<std::mutex> guard(callback_mtx_);
@@ -125,6 +126,16 @@ class CUDADeviceContext : public DeviceContext {
    std::lock_guard<std::mutex> guard(callback_mtx_);
    callback_manager_->Wait();
  }
+#else
+  template <typename Callback>
+  void AddStreamCallback(Callback&& callback) const {
+    // Deliberately empty in the _WIN32 build; callback is never invoked.
+  }
+
+  void WaitStreamCallback() const {
+    // Deliberately empty in the _WIN32 build; returns without waiting.
+  }
+#endif

 private:
  CUDAPlace place_;
@@ -143,10 +154,12 @@ class CUDADeviceContext : public DeviceContext {

  mutable std::mutex mtx_;

+#ifndef _WIN32
  // This lock is only used by callback
  // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
  mutable std::mutex callback_mtx_;
  std::unique_ptr<StreamCallbackManager> callback_manager_;
+#endif
 };

 template <>