diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 99bf8ec8dc34d24478ca5dc8abfffb01ddb194e4..564878131c87afdba249a14f82f19adc67e7876c 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -173,6 +173,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) if (NOT WIN32) # windows msvc2015 support c++11 natively. # -std=c++11 -fPIC not recoginize by msvc list(APPEND CUDA_NVCC_FLAGS "-std=c++11") +# in cuda9, suppress cuda warning on eigen with "-w" list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC") else(NOT WIN32) list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC" "-Xcompiler /w") @@ -181,7 +182,7 @@ endif(NOT WIN32) if(WITH_FAST_MATH) # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") -# in cuda9, suppress cuda warning on eigen +endif(WITH_FAST_MATH) # Set :expt-relaxed-constexpr to suppress Eigen warnings list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 0159815fed81bdff6de3e561af569e9edc75f947..21527fe538b15abcf1dd13d11b1f5a3a08729a55 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -3,6 +3,7 @@ INCLUDE(ExternalProject) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool) INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR}) +message("Debug" ${THREADPOOL_INCLUDE_DIR}) ExternalProject_Add( extern_threadpool diff --git a/cmake/flags.cmake b/cmake/flags.cmake index d2f64ef07cce2e362f950fd495e3e0cebef0585d..0476d2f598348adf51bf6c0e988eb4e28a0fcf36 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -143,26 +143,14 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=array-bounds # Warnings in Eigen::array ) -set(COMMON_FLAGS - -fPIC - -fno-omit-frame-pointer) -set(GPU_COMMON_FLAGS - -fPIC - -fno-omit-frame-pointer) - -else(NOT WIN32) -set(COMMON_FLAGS - "/w") #disable all warnings. - -set(GPU_COMMON_FLAGS - "/w") #disable all warnings - -endif(NOT WIN32) - else(NOT WIN32) set(COMMON_FLAGS + -fPIC + -fno-omit-frame-pointer "/w") #disable all warnings. set(GPU_COMMON_FLAGS + -fPIC + -fno-omit-frame-pointer "/w") #disable all warnings endif(NOT WIN32) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 814dec4aa473ee5b731d724ce0619117e8488dad..9ab1d1fa28dbca4a5f2eaa587a153293c00860ac 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -48,6 +48,7 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } +#ifndef _WIN32 template static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, GarbageCollector* gc, @@ -82,6 +83,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, gc->Add(erase_tensors); } } +#endif Executor::Executor(const platform::Place& place) : place_(place) {} @@ -331,97 +333,35 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::unique_ptr Executor::Prepare( const ProgramDesc& program, int block_id) { - VLOG(3) << "before create prepare" << block_id << " " << program.Size(); std::unique_ptr ctx( new ExecutorPrepareContext(program, block_id)); - VLOG(3) << "after create prepare"; - // PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); - VLOG(3) << "before create op_desc"; + PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); - VLOG(3) << "create before" << ctx->ops_.size() << " " - << block.AllOps().size(); int counter = 0; for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); - VLOG(3) << "create op " - << "index " << ++counter << " type " << op_desc->Type(); } - VLOG(3) << "create finished" << ctx->ops_.size() << " " - << block.AllOps().size(); return ctx; } std::vector> Executor::Prepare( const ProgramDesc& program, const std::vector& block_ids) { - VLOG(3) << "inside prepare"; std::vector> result; - VLOG(3) << "before go through block_ids"; for (auto& bid : block_ids) { - VLOG(3) << "block id" << bid; auto* ctx = new ExecutorPrepareContext(program, bid); - // PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); + PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); - int counter = 0; - VLOG(3) << "create before" << ctx->ops_.size() << " " - << block.AllOps().size(); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); - VLOG(3) << "create op " - << "index " << ++counter << " type " << op_desc->Type(); } - VLOG(3) << "create finished" << ctx->ops_.size() << " " - << block.AllOps().size(); result.push_back(std::shared_ptr(ctx)); } return result; } -// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx, -// Scope* local_scope) { -// VLOG(3) << "before checking result"; -// auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_); -// std::vector outputs; -// auto& block = ctx->prog_.Block(0); -// bool found = false; -// framework::OpDesc* myop = nullptr; -// for(auto& op : block.AllOps()) { -// if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == -// "feed") return; -// if (op->Type() == op_type) { -// found = true; -// myop = op; -// break; -// } -// } -// } -// if(!found) { -// VLOG(3) << "not found op!"; -// return; -// } -// auto* op = myop; -// VLOG(3) << "start op output" << op->Type(); -// for(auto var_name: op->OutputArgumentNames()) { -// auto* var = local_scope->Var(var_name); -// auto* var_desc = block.FindVar(var_name); -// if (var_desc->Persistable()) continue; -// auto* tensor = var->GetMutable(); -// framework::Tensor check; -// VLOG(3) << "before tensor copy"; -// framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check); -// VLOG(3) << "after tensor copy"; -// float sum = .0; -// for(size_t i=0; i < check.numel(); ++i) { -// sum += check.data()[i]; -// } -// VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " -// << sum; -// VLOG(3) << "after checking result"; -// } - void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope, bool create_vars, bool keep_kids) { - VLOG(3) << "RunPreparedContext inside"; Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { @@ -430,6 +370,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, CreateVariables(ctx->prog_, local_scope, ctx->block_id_); } +#ifndef _WIN32 int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr> gc; // WhileOp would set keep_kids to false @@ -471,6 +412,16 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } else { platform::DeviceContextPool::Instance().Get(place_)->Wait(); } +#else // WIN32 + for (auto& op : ctx->ops_) { + op->Run(*local_scope, place_); + if (FLAGS_benchmark) { + VLOG(2) << "Memory used after operator " + op->Type() + " running: " + << memory::memory_usage(place_); + } + } + platform::DeviceContextPool::Instance().Get(place_)->Wait(); +#endif // NOT WIN32 if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 36b36d49c2728dbef93042158dffa26d8f56d529..a2a6c6bfb13be8a18e2a65343743f8b3e4518eb1 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -17,12 +17,14 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#ifndef _WIN32 +#include "paddle/fluid/framework/garbage_collector.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index b8311623b0ef071aeebc45a5d37d82fd196733dc..7b2f6e5bc622f7e26a5c130be99f653d73f1c142 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -35,9 +35,10 @@ endif() # Create static library if (WIN32) -cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_thirdpa} paddle_fluid_api paddle_inference_api) +cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_third_partys} paddle_fluid_api paddle_inference_api) else(WIND32) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +endif(WIN32) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index e2027b7cb4d584ffcc48624d2c01e65a61829975..aea75074af2dce776ab343d863652091fd0f7468 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -51,6 +51,7 @@ function(inference_api_test TARGET_NAME) endfunction(inference_api_test) cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) +cc_library(helper SRCS helper.cc DEPS reset_tensor_array lod_tensor scope) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 01ea942d3c8d20180cfc9664b8601ba87a898e86..20fab8078fedf837564496aa296648bf5970a348 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -16,7 +16,6 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle_inference_api.h" namespace paddle { diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index ba9b32de35e10a04216cd5b6aba7ce8e1012e963..eea5689da642ba71c14204e39441295e81fb4e5d 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -260,9 +260,8 @@ std::unique_ptr CreatePaddlePredictor< if (config.use_gpu) { // 1. GPU memeroy PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., - 1.]"); + config.fraction_of_gpu_memory, 0.f, + "fraction_of_gpu_memory in the config should be set to range (0.,1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); std::vector flags; if (config.fraction_of_gpu_memory >= 0.0f || diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 4e4ab47ca9c5e37f2714ebd48d250c23c7e9b117..ed3bdd8de7f59b4c086eef48bf5b51da635ab572 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -31,10 +31,10 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle_inference_api.h" // NOLINT namespace paddle { diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index e7a5109648b6f03e7f3130efa42e491c0fcb8341..a3f3d67deca5d4d72297c5311b9ca201c45c150a 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -14,8 +14,9 @@ #pragma once +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL #include - #include #include // NOLINT #include @@ -23,9 +24,7 @@ #include #include #include -#include "paddle/fluid/string/printf.h" -#include "paddle_inference_api.h" -#include "timer.h" +#include "paddle_inference_api.h" //NOLINT namespace paddle { namespace inference { @@ -97,7 +96,7 @@ static void TensorAssignData(PaddleTensor *tensor, } template -static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, +static int ZeroCopyTensorAssignData(paddle::ZeroCopyTensor *tensor, const std::vector> &data) { int size{0}; auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5c18c46aa6c94be1064fb27a539908f245a4438e..19a8e5f4b3de465670d78f8b6d16753e79f14bad 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -284,12 +284,10 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) -if (NOT WIN32) - op_library(lstm_op DEPS sequence2batch lstm_compute) - op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) - op_library(lstmp_op DEPS sequence2batch lstm_compute) - op_library(gru_op DEPS sequence2batch gru_compute) -endif(NOT WIN32) +op_library(lstm_op DEPS sequence2batch lstm_compute) +op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) +op_library(lstmp_op DEPS sequence2batch lstm_compute) +op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index c82930cc4994c3854e60f40ae9909a90d82cbff6..e70945a2bd1025ba542dbf4da556dc04a3f5b91f 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -31,12 +31,12 @@ namespace operators { template __device__ bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; + return (a > b) || fabsf(static_cast(a - b)) < 1e-4; } template __device__ bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; + return (a < b) || fabsf(static_cast(a - b)) < 1e-4; } template diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 17b675fba8067851f6149edafcc9096690a3fd34..dcc3520abe90b28f93c7133b4c047fade2eeaff0 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -57,9 +57,6 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) -if (NOT WIN32) - math_library(matrix_bit_code) -endif (NOT WIN32) math_library(unpooling) math_library(vol2col) @@ -75,7 +72,10 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -cc_library(jit_kernel - SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc - DEPS cpu_info cblas) -cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +if (NOT WIN32) + math_library(matrix_bit_code) + cc_library(jit_kernel + SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc + DEPS cpu_info cblas) + cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +endif (NOT WIN32) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 7c511e20bae8304ee7720f8217a500ff797bfc28..fc365d0948a11c40e431f7304ffcc6db4688dbe6 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -235,7 +235,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) << ", Runtime Version: " << runtime_version_ / 1000 << "." << (runtime_version_ % 100) / 10; +#ifndef _WIN32 callback_manager_.reset(new StreamCallbackManager(stream_)); +#endif // NOT WIN32 } CUDADeviceContext::~CUDADeviceContext() { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 942e13a724339dc85ed1fc72c11e208ddce36dbb..fcd7529b3112ab78264a10ccc6c08a446c212efc 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -31,7 +31,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/stream_callback_manager.h" #endif #include "unsupported/Eigen/CXX11/Tensor" @@ -115,6 +115,7 @@ class CUDADeviceContext : public DeviceContext { PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } +#ifndef _WIN32 template void AddStreamCallback(Callback&& callback) const { std::lock_guard guard(callback_mtx_); @@ -125,6 +126,16 @@ class CUDADeviceContext : public DeviceContext { std::lock_guard guard(callback_mtx_); callback_manager_->Wait(); } +#else + template + void AddStreamCallback(Callback&& callback) const { + // ugly empty functor. + } + + void WaitStreamCallback() const { + // ugly empty functor. + } +#endif private: CUDAPlace place_; @@ -143,10 +154,12 @@ class CUDADeviceContext : public DeviceContext { mutable std::mutex mtx_; +#ifndef _WIN32 // This lock is only used by callback // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes mutable std::mutex callback_mtx_; std::unique_ptr callback_manager_; +#endif }; template <>