提交 bf2e4cb1 编写于 作者: D dzhwinter

cleard. staged

上级 ebfe5a02
......@@ -173,6 +173,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
if (NOT WIN32) # windows msvc2015 support c++11 natively.
# -std=c++11 and -fPIC are not recognized by msvc
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
# in cuda9, suppress cuda warning on eigen with "-w"
list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC")
else(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC" "-Xcompiler /w")
......@@ -181,7 +182,7 @@ endif(NOT WIN32)
if(WITH_FAST_MATH)
# Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
# in cuda9, suppress cuda warning on eigen
endif(WITH_FAST_MATH)
# Set --expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
......
......@@ -3,6 +3,7 @@ INCLUDE(ExternalProject)
SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
message("Debug" ${THREADPOOL_INCLUDE_DIR})
ExternalProject_Add(
extern_threadpool
......
......@@ -143,26 +143,14 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array
)
set(COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer)
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer)
else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
"/w") #disable all warnings
endif(NOT WIN32)
else(NOT WIN32)
set(COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
"/w") #disable all warnings
endif(NOT WIN32)
......
......@@ -48,6 +48,7 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext";
}
#ifndef _WIN32
template <typename RefCntMap>
static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
GarbageCollector<Tensor>* gc,
......@@ -82,6 +83,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
gc->Add(erase_tensors);
}
}
#endif
// Constructs an executor for `place`; the place is copied into place_.
Executor::Executor(const platform::Place& place) : place_(place) {}
......@@ -331,97 +333,35 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
// Builds the prepared context for one block of `program`.
//
// The context is allocated first, then block_id is checked against
// program.Size(), and finally one operator is created for every op
// description returned by the block's AllOps(), in that order, and appended
// to the context's ops_ list. Progress is traced at VLOG level 3.
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
    const ProgramDesc& program, int block_id) {
  VLOG(3) << "before create prepare" << block_id << " " << program.Size();
  std::unique_ptr<ExecutorPrepareContext> prepared(
      new ExecutorPrepareContext(program, block_id));
  VLOG(3) << "after create prepare";

  VLOG(3) << "before create op_desc";
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());

  auto& target_block = program.Block(block_id);
  auto& created_ops = prepared->ops_;
  VLOG(3) << "create before" << created_ops.size() << " "
          << target_block.AllOps().size();

  // op_index is only used for the trace message below (1-based).
  int op_index = 0;
  for (auto& desc : target_block.AllOps()) {
    created_ops.push_back(OpRegistry::CreateOp(*desc));
    ++op_index;
    VLOG(3) << "create op "
            << "index " << op_index << " type " << desc->Type();
  }

  VLOG(3) << "create finished" << created_ops.size() << " "
          << target_block.AllOps().size();
  return prepared;
}
std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids) {
VLOG(3) << "inside prepare";
std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
VLOG(3) << "before go through block_ids";
for (auto& bid : block_ids) {
VLOG(3) << "block id" << bid;
auto* ctx = new ExecutorPrepareContext(program, bid);
// PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
auto& block = program.Block(bid);
int counter = 0;
VLOG(3) << "create before" << ctx->ops_.size() << " "
<< block.AllOps().size();
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
VLOG(3) << "create op "
<< "index " << ++counter << " type " << op_desc->Type();
}
VLOG(3) << "create finished" << ctx->ops_.size() << " "
<< block.AllOps().size();
result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
}
return result;
}
// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx,
// Scope* local_scope) {
// VLOG(3) << "before checking result";
// auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
// std::vector<std::string> outputs;
// auto& block = ctx->prog_.Block(0);
// bool found = false;
// framework::OpDesc* myop = nullptr;
// for(auto& op : block.AllOps()) {
// if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() ==
// "feed") return;
// if (op->Type() == op_type) {
// found = true;
// myop = op;
// break;
// }
// }
// }
// if(!found) {
// VLOG(3) << "not found op!";
// return;
// }
// auto* op = myop;
// VLOG(3) << "start op output" << op->Type();
// for(auto var_name: op->OutputArgumentNames()) {
// auto* var = local_scope->Var(var_name);
// auto* var_desc = block.FindVar(var_name);
// if (var_desc->Persistable()) continue;
// auto* tensor = var->GetMutable<framework::LoDTensor>();
// framework::Tensor check;
// VLOG(3) << "before tensor copy";
// framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
// VLOG(3) << "after tensor copy";
// float sum = .0;
// for(size_t i=0; i < check.numel(); ++i) {
// sum += check.data<float>()[i];
// }
// VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum "
// << sum;
// VLOG(3) << "after checking result";
// }
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars,
bool keep_kids) {
VLOG(3) << "RunPreparedContext inside";
Scope* local_scope = scope;
if (create_vars) {
if (create_local_scope) {
......@@ -430,6 +370,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
}
#ifndef _WIN32
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector<Tensor>> gc;
// WhileOp would set keep_kids to false
......@@ -471,6 +412,16 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} else {
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}
#else // WIN32
for (auto& op : ctx->ops_) {
op->Run(*local_scope, place_);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
}
platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif // NOT WIN32
if (local_scope != scope) {
scope->DeleteScope(local_scope);
......
......@@ -17,12 +17,14 @@ limitations under the License. */
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#ifndef _WIN32
#include "paddle/fluid/framework/garbage_collector.h"
#endif
namespace paddle {
namespace framework {
......
......@@ -35,9 +35,10 @@ endif()
# Create static library
if (WIN32)
  # NOTE(review): paddle_fluid is declared twice in this branch, and
  # fluid_thirdpa is not set anywhere in view (it looks like a truncated
  # fluid_third_partys) -- confirm which declaration is intended.
  cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_thirdpa} paddle_fluid_api paddle_inference_api)
  cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_third_partys} paddle_fluid_api paddle_inference_api)
else(WIN32)
  # Non-Windows builds link the static inference APIs and tensor helpers.
  cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
endif(WIN32)
if(NOT APPLE)
# TODO(liuyiqu): Temporarily disable the link flag because it is not supported on Mac.
......
......@@ -51,6 +51,7 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test)
cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
cc_library(helper SRCS helper.cc DEPS reset_tensor_array lod_tensor scope)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
......
......@@ -16,7 +16,6 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle_inference_api.h"
namespace paddle {
......
......@@ -260,9 +260,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0.,
1.]");
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0.,1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
......
......@@ -31,10 +31,10 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle_inference_api.h" // NOLINT
namespace paddle {
......
......@@ -14,8 +14,9 @@
#pragma once
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include <glog/logging.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <iterator>
......@@ -23,9 +24,7 @@
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/string/printf.h"
#include "paddle_inference_api.h"
#include "timer.h"
#include "paddle_inference_api.h" //NOLINT
namespace paddle {
namespace inference {
......@@ -97,7 +96,7 @@ static void TensorAssignData(PaddleTensor *tensor,
}
template <typename T>
static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
static int ZeroCopyTensorAssignData(paddle::ZeroCopyTensor *tensor,
const std::vector<std::vector<T>> &data) {
int size{0};
auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
......
......@@ -284,12 +284,10 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
if (NOT WIN32)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
endif(NOT WIN32)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor)
......
......@@ -31,12 +31,12 @@ namespace operators {
// Tolerant "greater than or equal" for device code: true when a > b, or when
// a and b differ by less than 1e-4 in absolute value.
template <typename T>
__device__ bool GT_E(T a, T b) {
  return (a > b) || fabs(a - b) < 1e-4;
}
// Tolerant "less than or equal" for device code: true when a < b, or when
// a and b differ by less than 1e-4 in absolute value.
template <typename T>
__device__ bool LT_E(T a, T b) {
  return (a < b) || fabs(a - b) < 1e-4;
}
template <typename T>
......
......@@ -57,9 +57,6 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
if (NOT WIN32)
math_library(matrix_bit_code)
endif (NOT WIN32)
math_library(unpooling)
math_library(vol2col)
......@@ -75,7 +72,10 @@ if(WITH_GPU)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
DEPS cpu_info cblas)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
if (NOT WIN32)
math_library(matrix_bit_code)
cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
DEPS cpu_info cblas)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
endif (NOT WIN32)
......@@ -235,7 +235,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
<< ", Runtime Version: " << runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifndef _WIN32
callback_manager_.reset(new StreamCallbackManager(stream_));
#endif // NOT WIN32
}
CUDADeviceContext::~CUDADeviceContext() {
......
......@@ -31,7 +31,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/stream_callback_manager.h"
#endif
#include "unsupported/Eigen/CXX11/Tensor"
......@@ -115,6 +115,7 @@ class CUDADeviceContext : public DeviceContext {
PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
}
#ifndef _WIN32
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
std::lock_guard<std::mutex> guard(callback_mtx_);
......@@ -125,6 +126,16 @@ class CUDADeviceContext : public DeviceContext {
std::lock_guard<std::mutex> guard(callback_mtx_);
callback_manager_->Wait();
}
#else
// Windows (_WIN32) variant of AddStreamCallback: accepts any callable and
// discards it without invoking or storing it. The callback manager member is
// only compiled when _WIN32 is not defined, so there is nothing to register
// the callback with here.
// NOTE(review): callers that depend on the callback eventually running will
// never see it run on this path -- confirm that is acceptable.
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
// Intentionally empty: the callback is dropped.
}
// Windows (_WIN32) variant of WaitStreamCallback: returns immediately, since
// the matching AddStreamCallback in this branch never registers anything.
void WaitStreamCallback() const {
// Intentionally empty: there are no pending callbacks to wait for.
}
#endif
private:
CUDAPlace place_;
......@@ -143,10 +154,12 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex mtx_;
#ifndef _WIN32
// This lock is only used by callback
// If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
mutable std::mutex callback_mtx_;
std::unique_ptr<StreamCallbackManager> callback_manager_;
#endif
};
template <>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册