diff --git a/CMakeLists.txt b/CMakeLists.txt index cb646d3ce5d660734a27c0ac9f18ad54cd3e1c1b..c31f51a3f7371bd7b1b0ca3234091a35868806ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,10 +208,10 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream +include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of warpctc, nccl, cupti in windows -include(external/warpctc) # download, build, install warpctc +# there is no official support of nccl, cupti in windows include(cupti) include(external/gzstream) endif (NOT WIN32) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 07e1137e16afc1e4e9ab9640e1ccaea8008a0cd2..7b937c93febdfa1d7d5f4c73fc2a5830322688e5 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" # Used in unit test test_WarpCTCLayer SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) -SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) -IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) +IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32) SET(USE_OMP OFF) ELSE() SET(USE_OMP ON) ENDIF() +IF(WIN32) + SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git") +ELSE() + SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git") +ENDIF() + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git" + GIT_REPOSITORY ${WARPCTC_REPOSITORY} PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} -DWITH_OMP=${USE_OMP} @@ -59,6 +67,18 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) +IF(WIN32) + IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}") + add_custom_command(TARGET extern_warpctc POST_BUILD + COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + ENDIF() + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +ENDIF(WIN32) MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 2ced43f9e6c60da642f7a6252f889d9c9ab9748f..70d159b4f3549662e080794efad8af943ce1f0bc 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,7 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index a5acfd70449e92663cb66ef90a141c087ff6ec88..5fcb17b9f3ac390548aba33db7d0b8350cde7e00 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -16,100 +16,25 @@ limitations under the License. */ #include #include +#include "ngraph/ngraph.hpp" #include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_ops.h" #include "paddle/fluid/platform/enforce.h" - -#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { namespace framework { -static std::shared_ptr GetNode( - const std::shared_ptr& op, const std::string name, - const VariableNameMap& var_map, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto& var_names = var_map.at(name); - PADDLE_ENFORCE_EQ(var_names.size(), 1, - "op %s name %s expects one associated var", op->Type(), - name); - if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { - return (*ngb_node_map)[var_names[0]]; - } else { - return nullptr; - } -} - -static std::shared_ptr GetInputNode( - const std::shared_ptr& op, const std::string name, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - return GetNode(op, name, op->Inputs(), ngb_node_map); -} - -static std::shared_ptr GetOutputNode( - const std::shared_ptr& op, const std::string name, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - return GetNode(op, name, op->Outputs(), ngb_node_map); -} - -static void SetOutputNode( - const std::shared_ptr& op, const std::string name, - std::shared_ptr node, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto& var_names = op->Outputs().at(name); - if (var_names.size() == 1) { - (*ngb_node_map)[var_names[0]] = node; - } else if (var_names.size() == 0) { - (*ngb_node_map)[""] = node; - } else { - PADDLE_THROW("name %s has more than 1 var_names.", name); - } -} - -static bool HasOutput(const std::shared_ptr& op, - const std::string name) { - auto& outputs = op->Outputs(); - if (outputs.find(name) == outputs.end()) return false; - return outputs.at(name).size() > 0; -} - -template -static void BuildBinaryNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto x = GetInputNode(op, "X", ngb_node_map); - auto y = GetInputNode(op, "Y", ngb_node_map); - auto out = std::make_shared(x, y); - SetOutputNode(op, "Out", out, ngb_node_map); -} - -template -static void BuildUnaryNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto input = GetInputNode(op, "X", ngb_node_map); - auto out = std::make_shared(input); - SetOutputNode(op, "Out", out, ngb_node_map); -} - std::map&, std::shared_ptr>>)>> - NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode}, - {"tanh", BuildUnaryNode}}; + NgraphBridge::NG_NODE_MAP = { + {"mul", paddle::operators::ngraphs::BuildMulNode}, + {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, + {"relu", paddle::operators::ngraphs::BuildUnaryNode}, + {"tanh", paddle::operators::ngraphs::BuildUnaryNode}}; void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { auto& op_type = op->Type(); diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index e2cdfc845fe531733acc65746e661bfc2eed4890..23f681ce886fd0d8c113ffe4e80e25e6a803e31b 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -278,7 +278,8 @@ std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - op->RuntimeInferShape(scope_, place_); + RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { auto* var = scope_.FindVar(var_name); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a62afe248baa2ab57edabfaac05bb858a1a1280c..8c83748668e9f91e9c333beb8494c3ef1db875dc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -137,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } } +RuntimeContext::RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, + const Scope& scope) { + for (auto& var_name_item : innames) { + std::vector& input_vars = inputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + input_vars.push_back(scope.FindVar(var_name)); + } + } + for (auto& var_name_item : outnames) { + std::vector& output_vars = outputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + output_vars.push_back(scope.FindVar(var_name)); + } + } +} + void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(4) << place << " " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { @@ -412,11 +429,48 @@ bool ExecutionContext::HasOutput(const std::string& name) const { return var != nullptr; } +const Variable* ExecutionContext::InputVar(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +const Variable* ExecutionContext::LegacyInputVar( + const std::string& name) const { + auto ipt = op_.Input(name); + return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); +} + +Variable* ExecutionContext::OutputVar(const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } +template <> +const Tensor* ExecutionContext::LegacyInput( + const std::string& name) const { + return LegacyInput(name); +} + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const { @@ -441,6 +495,11 @@ Tensor* ExecutionContext::Output(const std::string& name) const { return Output(name); } +template <> +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { + return LegacyOutput(name); +} + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { @@ -477,23 +536,22 @@ bool OpSupportGPU(const std::string& op_type) { class RuntimeInferShapeContext : public InferShapeContext { public: - RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) - : op_(op), scope_(scope) {} + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope, + const RuntimeContext& ctx) + : op_(op), scope_(scope), ctx_(ctx) {} bool HasInput(const std::string& name) const override { // has only one input - const auto& ins = op_.Inputs(); + const auto& ins = ctx_.inputs; auto it = ins.find(name); if (it == ins.end()) { return false; } const auto& in = it->second; - if (in.size() == 0 || in[0] == kEmptyVarName) { - return false; - } + if (in.size() == 0) return false; PADDLE_ENFORCE_EQ(in.size(), 1UL, "Input %s should not have more than one inputs", name); - return scope_.FindVar(in[0]) != nullptr; + return in[0] != nullptr; } bool HasOutput(const std::string& name) const override { @@ -678,6 +736,7 @@ class RuntimeInferShapeContext : public InferShapeContext { private: const OperatorBase& op_; const Scope& scope_; + const RuntimeContext& ctx_; }; static void CheckTensorNANOrInf(const std::string& name, @@ -696,15 +755,15 @@ static void CheckTensorNANOrInf(const std::string& name, } void OperatorWithKernel::RuntimeInferShape(const Scope& scope, - const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); + const platform::Place& place, + const RuntimeContext& ctx) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); this->InferShape(&infer_shape_ctx); } void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); + RuntimeContext ctx(Inputs(), Outputs(), scope); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -718,15 +777,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. - - // for (auto& candidate : kKernelPriority) { - // Do selection - // } - - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + auto expected_kernel_key = this->GetExpectedKernelType( + ExecutionContext(*this, scope, *dev_ctx, ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -748,7 +800,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = - TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx); // exec scope is the scope that kernel actually executed on. const Scope& exec_scope = @@ -758,7 +810,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); + this->InferShape(&infer_shape_ctx); + // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext + // not Scope. Imperative mode only pass inputs and get outputs. + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. @@ -782,6 +838,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } + void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { @@ -797,13 +854,19 @@ void OperatorWithKernel::TransferInplaceVarsBack( } } -Scope* OperatorWithKernel::TryTransferData( +Scope* OperatorWithKernel::PrepareData( const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const { + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const { Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { - for (auto& var_name : var_name_item.second) { + std::vector& input_vars = ctx->inputs[var_name_item.first]; + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); + input_vars[i] = var; + // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { continue; @@ -851,6 +914,7 @@ Scope* OperatorWithKernel::TryTransferData( } auto* trans_var = new_scope->Var(var_name); + input_vars[i] = trans_var; Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bce01d71cf56f25f5556033db94452c2..39190d07b4ccdd5ffd03e2d50bb0e577ac00af75 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -70,6 +70,15 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); class OperatorBase; class ExecutionContext; +class RuntimeContext { + public: + RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, const Scope& scope); + + VariableValueMap inputs; + VariableValueMap outputs; +}; + /** * OperatorBase has the basic elements that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -129,7 +138,8 @@ class OperatorBase { void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } virtual void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const {} + const platform::Place& place, + const RuntimeContext& ctx) const {} protected: std::string type_; @@ -156,8 +166,9 @@ class OperatorBase { class ExecutionContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, - const platform::DeviceContext& device_context) - : op_(op), scope_(scope), device_context_(device_context) {} + const platform::DeviceContext& device_context, + const RuntimeContext& ctx) + : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {} const OperatorBase& op() const { return op_; } @@ -180,15 +191,9 @@ class ExecutionContext { return op_.Outputs(name).size(); } - const Variable* InputVar(const std::string& name) const { - auto ipt = op_.Input(name); - return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - } + const Variable* InputVar(const std::string& name) const; - Variable* OutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); - } + Variable* OutputVar(const std::string& name) const; const std::vector MultiInputVar( const std::string& name) const { @@ -227,6 +232,22 @@ class ExecutionContext { return var == nullptr ? nullptr : var->GetMutable(); } + template + const T* LegacyInput(const std::string& name) const { + auto* var = LegacyInputVar(name); + return var == nullptr ? nullptr : &var->Get(); + } + + template + T* LegacyOutput(const std::string& name) const { + auto var = LegacyOutputVar(name); + return var == nullptr ? nullptr : var->GetMutable(); + } + + const Variable* LegacyInputVar(const std::string& name) const; + + Variable* LegacyOutputVar(const std::string& name) const; + template const std::vector MultiInput(const std::string& name) const { auto names = op_.Inputs(name); @@ -286,11 +307,16 @@ class ExecutionContext { const OperatorBase& op_; const Scope& scope_; const platform::DeviceContext& device_context_; + const RuntimeContext& ctx_; }; template <> const Tensor* ExecutionContext::Input(const std::string& name) const; +template <> +const Tensor* ExecutionContext::LegacyInput( + const std::string& name) const; + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const; @@ -298,6 +324,9 @@ const std::vector ExecutionContext::MultiInput( template <> Tensor* ExecutionContext::Output(const std::string& name) const; +template <> +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const; + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const; @@ -350,8 +379,8 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } - void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const override; + void RuntimeInferShape(const Scope& scope, const platform::Place& place, + const RuntimeContext& ctx) const override; protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; @@ -371,9 +400,10 @@ class OperatorWithKernel : public OperatorBase { * * * transfered_inplace_vars is a output vector. */ - Scope* TryTransferData( - const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const; + Scope* PrepareData(const Scope& scope, + const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const; void TransferInplaceVarsBack(const Scope& scope, const std::vector& inplace_vars, diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 2de6233a9e0d320ec9a06d547db3575eb61925c0..938e2024c3359c2acd65a1aa4af875a8350e4c58 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -28,8 +28,11 @@ class OperatorBase; class OpDesc; class InferShapeContext; class BlockDesc; +class Variable; using VariableNameMap = std::map>; +// TODO(panyx0718): Replace vector with something like gtl::Vector. +using VariableValueMap = std::map>; // The order should be as same as framework.proto using Attribute = diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 16ef5c9524e5d414350bd485d1930b20a139b1fe..01432af0f30ce4b87e0a5faef7a53a14a45b6965 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -65,9 +65,7 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) -endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index ae9765b76138a34935619b662a8ffb7f46c8300c..7f2bde55c98277b9fd4b3374657001c42d673d43 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(dev_place); - framework::ExecutionContext ctx(*this, scope, dev_ctx); + framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); const LoDTensorArray* ids = ctx.Input("Ids"); const LoDTensorArray* scores = ctx.Input("Scores"); diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc index 6fed9ba92c16477c79509a30911920e67b9f9bdd..e4604db3a381616c7420f816f0b49a015c925bd4 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #endif #include +#include #include // NOLINT #include "paddle/fluid/framework/data_type.h" @@ -31,7 +32,12 @@ namespace distributed { class IOBufWriter { public: - static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) { + static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, + const char* v, int64_t vlen) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + iobuf->append(reinterpret_cast(&k), 4); iobuf->append(reinterpret_cast(&vlen), 8); iobuf->append(v, vlen); @@ -87,6 +93,10 @@ class IOBufWriter { int k, const char* v, int64_t vlen, bool in_cuda_pinned, void (*destroy)(void*), void* user_data) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + #ifdef PADDLE_WITH_BRPC_RDMA IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, destroy, user_data); @@ -134,7 +144,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, request->set_type(::sendrecv::NCCL_ID); const ncclUniqueId& uid = var->Get(); // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(iobuf, + IOBufWriter::Append(name, iobuf, sendrecv::VariableMessage::kSerializedFieldNumber, uid.internal, NCCL_UNIQUE_ID_BYTES); return; @@ -149,7 +159,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, // FIXME(gongwb): it seems that can use zero copy. if (var_is_not_stable) { IOBufWriter::Append( - iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, static_cast(payload->ptr()), payload->memory_size()); } else { if (platform::is_gpu_place(ctx.GetPlace())) { @@ -171,10 +181,11 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, + IOBufWriter::Append(name, iobuf, + ::sendrecv::VariableMessage::kRowsFieldNumber, reinterpret_cast(slr->rows().data()), static_cast(rows_memory_size)); } diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 299dfe35438c35ec922dcdc75bf774a00f111bbb..a9dea9cfd2eeaa7e7ed8f052d2f51f5893c1e2e3 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif +#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -102,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload->memory_size()); + if (payload->memory_size() >= std::numeric_limits::max()) { + LOG(FATAL) << "AppendZeroCopy varname:" << name + << ", vlen:" << payload->memory_size(); + } // steal reference of tensor data ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows int num_slices = 2; // only SelectedRows have rows buffer @@ -115,7 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); ProtoEncodeHelper e2(static_cast(buf), 128); + + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); slices[2] = ::grpc::Slice(e2.size()); memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 33eded0e6c0d90dadeeb63594983224d795fa244..6a87178be5daa02444c41a26f6e6c067713dd96f 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include +#include #include #include "paddle/fluid/framework/data_type.h" @@ -23,9 +24,8 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/port.h" - #include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -83,6 +83,11 @@ inline framework::proto::VarType::Type ToVarType( } } +template