Commit 8515ee3a authored by J JiabinYang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/add_prefech_hs

@@ -208,10 +208,10 @@ include(external/xxhash) # download xxhash
include(external/dlpack)
include(external/snappy) # download snappy
include(external/snappystream) # download snappystream
include(external/warpctc) # download, build, install warpctc
if (NOT WIN32)
# there is no official support of warpctc, nccl, cupti in windows
# there is no official support of nccl, cupti in windows
include(external/warpctc) # download, build, install warpctc
include(cupti)
include(external/gzstream)
endif (NOT WIN32)
......
@@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
# Used in unit test test_WarpCTCLayer
SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
CACHE PATH "Warp-ctc Library Directory" FORCE)
SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "Warp-ctc Library" FORCE)
IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32)
SET(USE_OMP OFF)
ELSE()
SET(USE_OMP ON)
ENDIF()
IF(WIN32)
SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git")
ELSE()
SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git")
ENDIF()
ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git"
GIT_REPOSITORY ${WARPCTC_REPOSITORY}
PREFIX ${WARPCTC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU}
-DWITH_OMP=${USE_OMP}
@@ -59,6 +67,18 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)
IF(WIN32)
IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}")
add_custom_command(TARGET extern_warpctc POST_BUILD
COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}
)
ENDIF()
SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "Warp-ctc Library" FORCE)
else(WIN32)
SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "Warp-ctc Library" FORCE)
ENDIF(WIN32)
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
......
@@ -84,7 +84,7 @@ function(op_library TARGET)
endif()
if (WIN32)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
......
@@ -16,100 +16,25 @@ limitations under the License. */
#include <functional>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/ngraph/ngraph_ops.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/ngraph_helper.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
static std::shared_ptr<ngraph::Node> GetNode(
const std::shared_ptr<OperatorBase>& op, const std::string name,
const VariableNameMap& var_map,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = var_map.at(name);
PADDLE_ENFORCE_EQ(var_names.size(), 1,
"op %s name %s expects one associated var", op->Type(),
name);
if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
return (*ngb_node_map)[var_names[0]];
} else {
return nullptr;
}
}
static std::shared_ptr<ngraph::Node> GetInputNode(
const std::shared_ptr<OperatorBase>& op, const std::string name,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, name, op->Inputs(), ngb_node_map);
}
static std::shared_ptr<ngraph::Node> GetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string name,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, name, op->Outputs(), ngb_node_map);
}
static void SetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string name,
std::shared_ptr<ngraph::Node> node,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = op->Outputs().at(name);
if (var_names.size() == 1) {
(*ngb_node_map)[var_names[0]] = node;
} else if (var_names.size() == 0) {
(*ngb_node_map)[""] = node;
} else {
PADDLE_THROW("name %s has more than 1 var_names.", name);
}
}
static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
const std::string name) {
auto& outputs = op->Outputs();
if (outputs.find(name) == outputs.end()) return false;
return outputs.at(name).size() > 0;
}
template <typename T>
static void BuildBinaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = GetInputNode(op, "X", ngb_node_map);
auto y = GetInputNode(op, "Y", ngb_node_map);
auto out = std::make_shared<T>(x, y);
SetOutputNode(op, "Out", out, ngb_node_map);
}
template <typename T>
static void BuildUnaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = GetInputNode(op, "X", ngb_node_map);
auto out = std::make_shared<T>(input);
SetOutputNode(op, "Out", out, ngb_node_map);
}
std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
{"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
NgraphBridge::NG_NODE_MAP = {
{"mul", paddle::operators::ngraphs::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}};
void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type();
......
@@ -278,7 +278,8 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
ngraph::runtime::Backend::create("CPU");
void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
op->RuntimeInferShape(scope_, place_);
RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
op->RuntimeInferShape(scope_, place_, ctx);
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name);
......
@@ -137,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
}
RuntimeContext::RuntimeContext(const VariableNameMap& innames,
const VariableNameMap& outnames,
const Scope& scope) {
for (auto& var_name_item : innames) {
std::vector<Variable*>& input_vars = inputs[var_name_item.first];
for (auto& var_name : var_name_item.second) {
input_vars.push_back(scope.FindVar(var_name));
}
}
for (auto& var_name_item : outnames) {
std::vector<Variable*>& output_vars = outputs[var_name_item.first];
for (auto& var_name : var_name_item.second) {
output_vars.push_back(scope.FindVar(var_name));
}
}
}
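For orientation, the call pattern this RuntimeContext enables looks roughly like the following; this is an editorial sketch distilled from the RunImpl and BeamSearchDecodeOp changes later in this commit, not code taken verbatim from it, and `op`, `scope`, `place`, and `dev_ctx` are assumed to be in scope:
// Sketch only: resolve the name -> Variable* maps once, then reuse them for
// shape inference and kernel execution.
framework::RuntimeContext run_ctx(op.Inputs(), op.Outputs(), scope);
op.RuntimeInferShape(scope, place, run_ctx);
framework::ExecutionContext exec_ctx(op, scope, dev_ctx, run_ctx);
const framework::LoDTensor* x = exec_ctx.Input<framework::LoDTensor>("X");  // served from run_ctx.inputs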
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
@@ -412,11 +429,48 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
return var != nullptr;
}
const Variable* ExecutionContext::InputVar(const std::string& name) const {
auto it = ctx_.inputs.find(name);
if (it == ctx_.inputs.end()) return nullptr;
PADDLE_ENFORCE_LE(it->second.size(), 1UL,
"Operator %s's input %s should contain only one variable.",
op_.Type(), name);
return it->second.empty() ? nullptr : it->second[0];
}
const Variable* ExecutionContext::LegacyInputVar(
const std::string& name) const {
auto ipt = op_.Input(name);
return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
}
Variable* ExecutionContext::OutputVar(const std::string& name) const {
auto it = ctx_.outputs.find(name);
if (it == ctx_.outputs.end()) return nullptr;
PADDLE_ENFORCE_LE(it->second.size(), 1UL,
"Operator %s's output %s should contain only one variable.",
op_.Type(), name);
return it->second.empty() ? nullptr : it->second[0];
}
Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
auto opt = op_.Output(name);
return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
}
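// NOTE (editorial, not part of this commit): InputVar/OutputVar above resolve
// variables through the prebuilt RuntimeContext maps, while the Legacy*
// variants keep the previous behaviour of looking the name up in the Scope.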
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
return Input<LoDTensor>(name);
}
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const {
return LegacyInput<LoDTensor>(name);
}
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const {
@@ -441,6 +495,11 @@ Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
return Output<LoDTensor>(name);
}
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
return LegacyOutput<LoDTensor>(name);
}
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const {
@@ -477,23 +536,22 @@ bool OpSupportGPU(const std::string& op_type) {
class RuntimeInferShapeContext : public InferShapeContext {
public:
RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
: op_(op), scope_(scope) {}
RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
const RuntimeContext& ctx)
: op_(op), scope_(scope), ctx_(ctx) {}
bool HasInput(const std::string& name) const override {
// has only one input
const auto& ins = op_.Inputs();
const auto& ins = ctx_.inputs;
auto it = ins.find(name);
if (it == ins.end()) {
return false;
}
const auto& in = it->second;
if (in.size() == 0 || in[0] == kEmptyVarName) {
return false;
}
if (in.size() == 0) return false;
PADDLE_ENFORCE_EQ(in.size(), 1UL,
"Input %s should not have more than one inputs", name);
return scope_.FindVar(in[0]) != nullptr;
return in[0] != nullptr;
}
bool HasOutput(const std::string& name) const override {
@@ -678,6 +736,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
private:
const OperatorBase& op_;
const Scope& scope_;
const RuntimeContext& ctx_;
};
static void CheckTensorNANOrInf(const std::string& name,
@@ -696,15 +755,15 @@ static void CheckTensorNANOrInf(const std::string& name,
}
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
const platform::Place& place,
const RuntimeContext& ctx) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx);
this->InferShape(&infer_shape_ctx);
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
RuntimeContext ctx(Inputs(), Outputs(), scope);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
@@ -718,15 +777,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelMap& kernels = kernels_iter->second;
// TODO(dzhwinter) : kernel fallback mechanism will be added when all the
// transform functions are ready.
// for (auto& candidate : kKernelPriority) {
// Do selection
// }
auto expected_kernel_key =
this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
auto expected_kernel_key = this->GetExpectedKernelType(
ExecutionContext(*this, scope, *dev_ctx, ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
@@ -748,7 +800,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// do data transformScope &transfer_scope;
std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope =
TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
// exec scope is the scope that kernel actually executed on.
const Scope& exec_scope =
@@ -758,7 +810,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
dev_ctx = pool.Get(expected_kernel_key.place_);
}
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
this->InferShape(&infer_shape_ctx);
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));
if (!transfered_inplace_vars.empty()) {
// there is inplace variable has been transfered.
@@ -782,6 +838,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
}
}
void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const {
@@ -797,13 +854,19 @@ void OperatorWithKernel::TransferInplaceVarsBack(
}
}
Scope* OperatorWithKernel::TryTransferData(
Scope* OperatorWithKernel::PrepareData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const {
std::vector<std::string>* transfered_inplace_vars,
RuntimeContext* ctx) const {
Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) {
std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto& var_name = var_name_item.second[i];
auto* var = scope.FindVar(var_name);
input_vars[i] = var;
// Only tensor can be tranfer to another device.
if (var == nullptr || !VarIsTensor(*var)) {
continue;
@@ -851,6 +914,7 @@ Scope* OperatorWithKernel::TryTransferData(
}
auto* trans_var = new_scope->Var(var_name);
input_vars[i] = trans_var;
Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
......
@@ -70,6 +70,15 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
class OperatorBase;
class ExecutionContext;
class RuntimeContext {
public:
RuntimeContext(const VariableNameMap& innames,
const VariableNameMap& outnames, const Scope& scope);
VariableValueMap inputs;
VariableValueMap outputs;
};
/**
* OperatorBase has the basic elements that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User
@@ -129,7 +138,8 @@ class OperatorBase {
void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
virtual void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {}
const platform::Place& place,
const RuntimeContext& ctx) const {}
protected:
std::string type_;
@@ -156,8 +166,9 @@ class OperatorBase {
class ExecutionContext {
public:
ExecutionContext(const OperatorBase& op, const Scope& scope,
const platform::DeviceContext& device_context)
: op_(op), scope_(scope), device_context_(device_context) {}
const platform::DeviceContext& device_context,
const RuntimeContext& ctx)
: op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {}
const OperatorBase& op() const { return op_; }
@@ -180,15 +191,9 @@ class ExecutionContext {
return op_.Outputs(name).size();
}
const Variable* InputVar(const std::string& name) const {
auto ipt = op_.Input(name);
return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
}
const Variable* InputVar(const std::string& name) const;
Variable* OutputVar(const std::string& name) const {
auto opt = op_.Output(name);
return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
}
Variable* OutputVar(const std::string& name) const;
const std::vector<const Variable*> MultiInputVar(
const std::string& name) const {
@@ -227,6 +232,22 @@ class ExecutionContext {
return var == nullptr ? nullptr : var->GetMutable<T>();
}
template <typename T>
const T* LegacyInput(const std::string& name) const {
auto* var = LegacyInputVar(name);
return var == nullptr ? nullptr : &var->Get<T>();
}
template <typename T>
T* LegacyOutput(const std::string& name) const {
auto var = LegacyOutputVar(name);
return var == nullptr ? nullptr : var->GetMutable<T>();
}
const Variable* LegacyInputVar(const std::string& name) const;
Variable* LegacyOutputVar(const std::string& name) const;
template <typename T>
const std::vector<const T*> MultiInput(const std::string& name) const {
auto names = op_.Inputs(name);
@@ -286,11 +307,16 @@ class ExecutionContext {
const OperatorBase& op_;
const Scope& scope_;
const platform::DeviceContext& device_context_;
const RuntimeContext& ctx_;
};
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const;
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const;
@@ -298,6 +324,9 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const;
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const;
@@ -350,8 +379,8 @@ class OperatorWithKernel : public OperatorBase {
OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
}
void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const override;
void RuntimeInferShape(const Scope& scope, const platform::Place& place,
const RuntimeContext& ctx) const override;
protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
@@ -371,9 +400,10 @@ class OperatorWithKernel : public OperatorBase {
*
* * transfered_inplace_vars is a output vector.
*/
Scope* TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const;
Scope* PrepareData(const Scope& scope,
const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars,
RuntimeContext* ctx) const;
void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars,
......
@@ -28,8 +28,11 @@ class OperatorBase;
class OpDesc;
class InferShapeContext;
class BlockDesc;
class Variable;
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
// TODO(panyx0718): Replace vector with something like gtl::Vector.
using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
// The order should be as same as framework.proto
using Attribute =
......
@@ -64,9 +64,7 @@ endif()
set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
if (NOT WIN32)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
if (WITH_GPU)
......
@@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Get(dev_place);
framework::ExecutionContext ctx(*this, scope, dev_ctx);
framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);
const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
......
@@ -16,6 +16,7 @@ limitations under the License. */
#include <nccl.h>
#endif
#include <sys/time.h>
#include <limits>
#include <thread> // NOLINT
#include "paddle/fluid/framework/data_type.h"
@@ -31,7 +32,12 @@ namespace distributed {
class IOBufWriter {
public:
static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) {
static void Append(const std::string& varname, butil::IOBuf* iobuf, int k,
const char* v, int64_t vlen) {
if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
}
iobuf->append(reinterpret_cast<char*>(&k), 4);
iobuf->append(reinterpret_cast<char*>(&vlen), 8);
iobuf->append(v, vlen);
@@ -87,6 +93,10 @@ class IOBufWriter {
int k, const char* v, int64_t vlen,
bool in_cuda_pinned, void (*destroy)(void*),
void* user_data) {
if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
}
#ifdef PADDLE_WITH_BRPC_RDMA
IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned,
destroy, user_data);
@@ -134,7 +144,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
request->set_type(::sendrecv::NCCL_ID);
const ncclUniqueId& uid = var->Get<ncclUniqueId>();
// TODO(gongwb): use append_zero to avoid data copy.
IOBufWriter::Append(iobuf,
IOBufWriter::Append(name, iobuf,
sendrecv::VariableMessage::kSerializedFieldNumber,
uid.internal, NCCL_UNIQUE_ID_BYTES);
return;
@@ -149,7 +159,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
// FIXME(gongwb): it seems that can use zero copy.
if (var_is_not_stable) {
IOBufWriter::Append(
iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
static_cast<const char*>(payload->ptr()), payload->memory_size());
} else {
if (platform::is_gpu_place(ctx.GetPlace())) {
@@ -171,10 +181,11 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
if (var->IsType<framework::SelectedRows>()) {
auto* slr = var->GetMutable<framework::SelectedRows>();
size_t rows_memory_size =
slr->rows().size() * framework::SizeOfType(typeid(int64_t));
IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber,
PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
IOBufWriter::Append(name, iobuf,
::sendrecv::VariableMessage::kRowsFieldNumber,
reinterpret_cast<const char*>(slr->rows().data()),
static_cast<int64_t>(rows_memory_size));
}
......
@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
#endif
#include <limits>
#include <thread> // NOLINT
#include "google/protobuf/io/coded_stream.h"
@@ -102,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
payload->memory_size());
if (payload->memory_size() >= std::numeric_limits<int>::max()) {
LOG(FATAL) << "AppendZeroCopy varname:" << name
<< ", vlen:" << payload->memory_size();
}
// steal reference of tensor data
::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows
int num_slices = 2; // only SelectedRows have rows buffer
@@ -115,7 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
if (var->IsType<framework::SelectedRows>()) {
auto* slr = var->GetMutable<framework::SelectedRows>();
ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
slices[2] = ::grpc::Slice(e2.size());
memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
......
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <string>
#include <typeindex>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
@@ -23,9 +24,8 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace operators {
@@ -83,6 +83,11 @@ inline framework::proto::VarType::Type ToVarType(
}
}
template <template <typename> class T, typename Elem>
std::string VectorElemName(const T<Elem>& arg) {
return typeid(Elem).name();
}
} // namespace distributed
} // namespace operators
} // namespace paddle
@@ -118,7 +118,7 @@ bool VariableResponse::CopyLodTensorData(
VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
<< ", Buffer Size = " << length;
PADDLE_ENFORCE_EQ(tensor->memory_size(), length);
PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast<unsigned int>(length));
return ReadRaw(input, ctx, tensor->place(), tensor_data, length);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the list of the ngraph operators for Paddle.
*
* ATTENTION: It requires some C++11 features, for lower version C++ or C, we
* might release another API.
*/
#pragma once
#include "ops/binary_unnary_op.h"
#include "ops/mul_op.h"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
template <typename T>
static void BuildBinaryNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
auto out = std::make_shared<T>(x, y);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
template <typename T>
static void BuildUnaryNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto out = std::make_shared<T>(input);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
static void BuildMulNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
auto x_reshape = x;
auto y_reshape = y;
if (x->get_shape().size() > 2) {
auto x_2d = paddle::platform::FlattenTo2d(x->get_shape(), x_num_col_dims);
x_reshape = paddle::platform::NgReshaper(x, x_2d);
}
if (y->get_shape().size() > 2) {
auto y_2d = paddle::platform::FlattenTo2d(y->get_shape(), y_num_col_dims);
y_reshape = paddle::platform::NgReshaper(y, y_2d);
}
std::shared_ptr<ngraph::Node> out =
std::make_shared<ngraph::op::Dot>(x_reshape, y_reshape);
auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
if (dummy_out && dummy_out->get_shape() != out->get_shape()) {
out = paddle::platform::NgReshaper(out, dummy_out->get_shape());
}
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
static void BuildMulGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
bool is_dx = paddle::platform::HasOutput(op, "X@GRAD") ? true : false;
bool is_dy = paddle::platform::HasOutput(op, "Y@GRAD") ? true : false;
auto x_shape = x->get_shape();
auto y_shape = y->get_shape();
auto x_reshape = x;
auto y_reshape = y;
if (x_shape.size() > 2) {
auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_num_col_dims);
x_reshape = paddle::platform::NgReshaper(x, x_2d_shape);
}
if (y_shape.size() > 2) {
auto y_2d_shape = paddle::platform::FlattenTo2d(y_shape, y_num_col_dims);
y_reshape = paddle::platform::NgReshaper(y, y_2d_shape);
}
auto x_reshape_shape = x_reshape->get_shape();
std::reverse(x_reshape_shape.begin(), x_reshape_shape.end());
auto x_transpose = std::make_shared<ngraph::op::Reshape>(
x_reshape, ngraph::AxisVector{1, 0}, x_reshape_shape);
auto y_reshape_shape = y_reshape->get_shape();
std::reverse(y_reshape_shape.begin(), y_reshape_shape.end());
auto y_transpose = std::make_shared<ngraph::op::Reshape>(
y_reshape, ngraph::AxisVector{1, 0}, y_reshape_shape);
if (is_dx) {
if (dout->get_shape().size() > 2) {
auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
}
auto dx = std::make_shared<ngraph::op::Dot>(dout, y_transpose);
if (dx->get_shape() == x_shape) {
paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
} else {
auto dx_reshape = paddle::platform::NgReshaper(dx, x_shape);
paddle::platform::SetOutputNode(op, "X@GRAD", dx_reshape, ngb_node_map);
}
}
if (is_dy) {
if (dout->get_shape().size() > 2) {
auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
}
auto dy = std::make_shared<ngraph::op::Dot>(x_transpose, dout);
if (dy->get_shape() == y_shape) {
paddle::platform::SetOutputNode(op, "Y@GRAD", dy, ngb_node_map);
} else {
auto dy_reshape = paddle::platform::NgReshaper(dy, y_shape);
paddle::platform::SetOutputNode(op, "Y@GRAD", dy_reshape, ngb_node_map);
}
}
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using framework::DataLayout;
template <typename T>
class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
const bool is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE(
is_test == true,
"ConvTransposeMKLDNN works only for inference!. Set is_test = True");
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
int ndims = axis.size();
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
const T* input_data = input->data<T>();
if (ndims == 1) {
output->ShareDataWith(*input);
return;
}
std::vector<int> nchw_axis(ndims, 0);
for (size_t i = 0; i < nchw_axis.size(); ++i) {
nchw_axis[i] = i;
}
std::vector<int> nchw_tz = paddle::framework::vectorize2int(input->dims());
std::string data_format = ctx.Attr<std::string>("data_format");
auto src_md =
input->format() != mkldnn::memory::format::nchw
? platform::MKLDNNMemDesc(nchw_tz, platform::MKLDNNGetDataType<T>(),
input->format())
: Axis2MemoryDesc(nchw_tz, nchw_axis);
this->TransposeKernel(ctx.GetPlace(), Axis2MemoryDesc(nchw_tz, axis),
src_md, output, input_data, nchw_tz, mkldnn_engine);
}
protected:
mkldnn::memory::desc Axis2MemoryDesc(std::vector<int>& nchw_tz,
std::vector<int>& axis) const {
mkldnn_memory_desc_t mem_fmt;
mem_fmt.primitive_kind = mkldnn_memory;
mem_fmt.ndims = axis.size();
for (unsigned int i = 0; i < nchw_tz.size(); ++i) {
mem_fmt.dims[i] = nchw_tz[i]; // logical dimensions (nchw format,
// regardless physical layout)
}
mem_fmt.data_type = mkldnn_f32;
mem_fmt.format = mkldnn_blocked;
unsigned int total_stride = 1;
for (int i = nchw_tz.size() - 1; i >= 0; --i) {
mem_fmt.layout_desc.blocking.padding_dims[i] =
nchw_tz[i]; // logical dimensions (nchw format, regardless physical
// layout)
mem_fmt.layout_desc.blocking.block_dims[i] = 1;
mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset
mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1;
total_stride *= nchw_tz[axis[i]];
}
mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset
return mem_fmt;
}
void TransposeKernel(platform::Place place, mkldnn::memory::desc md_o,
mkldnn::memory::desc md_i, Tensor* output,
const T* data_i, std::vector<int>& nchw_dims,
const mkldnn::engine& eng) const {
// Make Memory primitive descriptors
auto mpd_o = mkldnn::memory::primitive_desc(md_o, eng);
auto mpd_i = mkldnn::memory::primitive_desc(md_i, eng);
auto data_o = output->mutable_data<T>(
place, paddle::memory::Allocator::kDefault, mpd_o.get_size());
auto src = mkldnn::memory(mpd_i, (T*)(data_i));
auto dst = mkldnn::memory(mpd_o, data_o);
auto r = mkldnn::reorder(src, dst);
mkldnn::stream(mkldnn::stream::kind::eager).submit({r}).wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
ops::TransposeMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
ops::TransposeMKLDNNOpKernel<float>);
@@ -16,6 +16,10 @@ limitations under the License. */
#include <string>
#include <vector>
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel {
}
ctx->SetOutputDim("Out", out_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace(), layout_, library_);
}
};
class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput(
"X",
"(Tensor) The input tensor, tensors with rank up to 6 are supported.");
@@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
"(vector<int>) A list of values, and the size of the list should be "
"the same with the input tensor rank. This operator permutes the input "
"tensor's axes according to the values given.");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
AddComment(R"DOC( AddComment(R"DOC(
Transpose Operator. Transpose Operator.
...@@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp { ...@@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(), framework::LibraryType library_{framework::LibraryType::kPlain};
ctx.device_context()); std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace(), layout_, library_);
}
};
......
@@ -16,9 +16,7 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
if (NOT WIN32)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
endif(NOT WIN32)
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif()
......
@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
......
@@ -201,6 +201,8 @@ void* GetCurandDsoHandle() {
void* GetWarpCTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
#endif
......
@@ -18,6 +18,12 @@ namespace paddle {
namespace platform {
namespace dynload {
#ifndef _WIN32
#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
#else
#define DECLARE_TYPE(__name, ...) decltype(auto)
#endif
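// Editorial illustration (not part of this commit): the two expansions of
// DECLARE_TYPE(__name, args...) when used as a wrapper's trailing return type.
// Non-Windows builds deduce the return type from a call expression:
//   auto operator()(Args... args) -> decltype(__name(args...)) { ... }
// Windows (_WIN32) builds fall back to C++14 deduction instead:
//   auto operator()(Args... args) -> decltype(auto) { ... }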
void* GetCublasDsoHandle();
void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle();
......
@@ -34,7 +34,7 @@ extern void* mklml_dso_handle;
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
......
@@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle;
#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using tensorrt_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = \
......
...@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle; ...@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle;
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ #define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using warpctcFunc = decltype(&::__name); \ using warpctcFunc = decltype(&::__name); \
std::call_once(warpctc_dso_flag, []() { \ std::call_once(warpctc_dso_flag, []() { \
warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <functional>
#include <memory>
#include <numeric>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace platform {
static ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
std::multiplies<size_t>());
auto x2 = std::accumulate(std::begin(sh) + num, std::end(sh), 1,
std::multiplies<size_t>());
size_t x1_l = static_cast<size_t>(x1);
size_t x2_l = static_cast<size_t>(x2);
return ngraph::Shape{x1_l, x2_l};
}
static std::shared_ptr<ngraph::Node> NgReshaper(
std::shared_ptr<ngraph::Node> input, ngraph::Shape shape) {
std::vector<size_t> input_order(input->get_shape().size());
std::iota(std::begin(input_order), std::end(input_order), 0);
return std::make_shared<ngraph::op::Reshape>(
input, ngraph::AxisVector(input_order), shape);
}
static std::shared_ptr<ngraph::Node> GetNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm, const paddle::framework::VariableNameMap& var_map,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = var_map.at(prm);
PADDLE_ENFORCE_EQ(var_names.size(), 1,
"op %s prm %s expects one associated var", op->Type(), prm);
if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
return (*ngb_node_map)[var_names[0]];
} else {
return nullptr;
}
}
static std::shared_ptr<ngraph::Node> GetInputNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Inputs(), ngb_node_map);
}
static std::shared_ptr<ngraph::Node> GetOutputNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Outputs(), ngb_node_map);
}
static void SetOutputNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm, std::shared_ptr<ngraph::Node> node,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = op->Outputs().at(prm);
if (var_names.size() == 1) {
(*ngb_node_map)[var_names[0]] = node;
} else if (var_names.size() == 0) {
(*ngb_node_map)[""] = node;
} else {
PADDLE_THROW("prm %s has more than 1 var_names.", prm);
}
}
static bool HasOutput(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm) {
auto& outputs = op->Outputs();
if (outputs.find(prm) == outputs.end()) return false;
return outputs.at(prm).size() > 0;
}
} // namespace platform
} // namespace paddle
#endif
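FlattenTo2d above collapses the first num dimensions of a shape into one extent and the remaining dimensions into another. A standalone sketch of the same computation, using std::vector<size_t> in place of ngraph::Shape so it runs without an nGraph install:

#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

// Same arithmetic as FlattenTo2d, minus the nGraph types.
static std::vector<size_t> FlattenTo2dSketch(const std::vector<size_t>& sh,
                                             int num) {
  size_t x1 = std::accumulate(sh.begin(), sh.begin() + num, size_t{1},
                              std::multiplies<size_t>());
  size_t x2 = std::accumulate(sh.begin() + num, sh.end(), size_t{1},
                              std::multiplies<size_t>());
  return {x1, x2};
}

int main() {
  // A {2, 3, 4, 5} tensor flattened at num = 2 becomes {6, 20}.
  assert((FlattenTo2dSketch({2, 3, 4, 5}, 2) == std::vector<size_t>{6, 20}));
  return 0;
}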
...@@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) { ...@@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) {
static void *dlopen(const char *filename, int flag) { static void *dlopen(const char *filename, int flag) {
std::string file_name(filename); std::string file_name(filename);
file_name.replace(0, file_name.size() - 1, '/', '\\');
HMODULE hModule = LoadLibrary(file_name.c_str()); HMODULE hModule = LoadLibrary(file_name.c_str());
if (!hModule) { if (!hModule) {
throw std::runtime_error(file_name + " not found."); throw std::runtime_error(file_name + " not found.");
......
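The line removed in this hunk called the std::string::replace(pos, count, count2, ch) overload, which replaces the first size() - 1 characters with '/' (i.e. 47) copies of '\\' rather than converting path separators, so deleting it is the right fix. If separator conversion were ever needed, the idiomatic form would be std::replace from <algorithm>, sketched here with a hypothetical path:

#include <algorithm>
#include <string>

int main() {
  std::string file_name = "C:/some/dir/warpctc.dll";  // hypothetical path
  // Replace every forward slash with a backslash, character by character.
  std::replace(file_name.begin(), file_name.end(), '/', '\\');
  // file_name is now "C:\\some\\dir\\warpctc.dll"
  return 0;
}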
...@@ -102,6 +102,13 @@ def __bootstrap__(): ...@@ -102,6 +102,13 @@ def __bootstrap__():
import sys import sys
import os import os
import platform import platform
if os.name == 'nt':
third_lib_path = os.path.abspath(os.path.dirname(
__file__)) + os.sep + '..' + os.sep + 'libs'
os.environ['path'] += ';' + third_lib_path
sys.path.append(third_lib_path)
from . import core from . import core
in_test = 'unittest' in sys.modules in_test = 'unittest' in sys.modules
...@@ -128,13 +135,12 @@ def __bootstrap__(): ...@@ -128,13 +135,12 @@ def __bootstrap__():
'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
'allocator_strategy', 'reader_queue_speed_test_mode', 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname' 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
if os.name != 'nt': if os.name != 'nt':
read_env_flags.append('warpctc_dir')
read_env_flags.append('cpu_deterministic') read_env_flags.append('cpu_deterministic')
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
......
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import collections import collections
import contextlib import contextlib
import os
import re import re
import six import six
import sys import sys
...@@ -27,11 +28,18 @@ from .proto import framework_pb2 ...@@ -27,11 +28,18 @@ from .proto import framework_pb2
try: try:
from . import core from . import core
except ImportError as e: except ImportError as e:
raise ImportError( if os.name == 'nt':
"""NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" raise ImportError(
if you encounters \"libmkldnn.so not found\" errors. If you have python """NOTE: You may need to run \"set PATH=c:\python27\lib;%PATH%\"
installed in other directory, replace \"/usr/local/lib\" with your own if you encounters \"mkldnn.dll not found\" errors. If you have python
directory. The original error is: \n""" + cpt.get_exception_message(e)) installed in other directory, replace \"c:\python27\lib" with your own
directory. The original error is: \n""" + cpt.get_exception_message(e))
else:
raise ImportError(
"""NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
if you encounters \"libmkldnn.so not found\" errors. If you have python
installed in other directory, replace \"/usr/local/lib\" with your own
directory. The original error is: \n""" + cpt.get_exception_message(e))
except Exception as e: except Exception as e:
raise e raise e
from . import unique_name from . import unique_name
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh
class TestNGRAPHReluDim2(TestRelu):
def setUp(self):
super(TestNGRAPHReluDim2, self).setUp()
class TestNGRAPHTanhDim2(TestTanh):
def setUp(self):
super(TestNGRAPHTanhDim2, self).setUp()
class TestNGRAPHReluDim4(TestRelu):
def setUp(self):
super(TestNGRAPHReluDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
class TestNGRAPHTanhDim4(TestTanh):
def setUp(self):
super(TestNGRAPHTanhDim4, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.tanh(self.inputs['X'])}
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2
class TestNGRAPHMulOp(TestMulOp):
def init_dtype_type(self):
pass
class TestNGRAPHMulOp2(TestMulOp2):
def init_dtype_type(self):
pass
class TestNGRAPHFP16MulOp1(TestFP16MulOp1):
def init_dtype_type(self):
pass
class TestNGRAPHFP16MulOp2(TestFP16MulOp2):
def init_dtype_type(self):
pass
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_transpose_op import TestTransposeOp
class TestTransposeMKLDNN(TestTransposeOp):
def init_op_type(self):
self.op_type = "transpose2"
self.use_mkldnn = True
self.is_test = True
return
def test_check_grad(self):
return
def test_check_grad_no_input(self):
return
def test_check_grad_no_filter(self):
return
class TestCase0MKLDNN(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (3, )
self.axis = (0, )
class TestCase1a(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (3, 4, 5)
self.axis = (0, 2, 1)
class TestCase1b(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (3, 4, 5)
self.axis = (2, 1, 0)
class TestCase2(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (2, 3, 4, 5)
self.axis = (0, 2, 3, 1)
class TestCase3(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (2, 3, 4, 5, 6)
self.axis = (4, 2, 3, 1, 0)
class TestCase4(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (2, 3, 4, 5, 6, 1)
self.axis = (4, 2, 3, 1, 0, 5)
if __name__ == '__main__':
unittest.main()
...@@ -21,15 +21,24 @@ from op_test import OpTest ...@@ -21,15 +21,24 @@ from op_test import OpTest
class TestTransposeOp(OpTest): class TestTransposeOp(OpTest):
def setUp(self): def setUp(self):
self.init_op_type()
self.initTestCase() self.initTestCase()
self.op_type = "transpose2"
self.inputs = {'X': np.random.random(self.shape).astype("float32")} self.inputs = {'X': np.random.random(self.shape).astype("float32")}
self.attrs = {'axis': list(self.axis)} self.attrs = {
'axis': list(self.axis),
'use_mkldnn': self.use_mkldnn,
'is_test': self.is_test,
}
self.outputs = { self.outputs = {
'XShape': np.random.random(self.shape).astype("float32"), 'XShape': np.random.random(self.shape).astype("float32"),
'Out': self.inputs['X'].transpose(self.axis) 'Out': self.inputs['X'].transpose(self.axis)
} }
def init_op_type(self):
self.op_type = "transpose2"
self.use_mkldnn = False
self.is_test = False
def test_check_output(self): def test_check_output(self):
self.check_output(no_check_set=['XShape']) self.check_output(no_check_set=['XShape'])
......
...@@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF': ...@@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
# put all thirdparty libraries in paddle.libs # put all thirdparty libraries in paddle.libs
libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
if os.name != 'nt':
package_data['paddle.libs']= [] package_data['paddle.libs']= []
package_data['paddle.libs']=['libwarpctc' + ext_name] package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
shutil.copy('${WARPCTC_LIBRARIES}', libs_path) shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
if '${WITH_MKL}' == 'ON': if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_LIB}', libs_path)
shutil.copy('${MKLML_IOMP_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path)
......