提交 1c116462 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_mnist

test=develop
...@@ -208,10 +208,10 @@ include(external/xxhash) # download xxhash ...@@ -208,10 +208,10 @@ include(external/xxhash) # download xxhash
include(external/dlpack) include(external/dlpack)
include(external/snappy) # download snappy include(external/snappy) # download snappy
include(external/snappystream) # download snappystream include(external/snappystream) # download snappystream
include(external/warpctc) # download, build, install warpctc
if (NOT WIN32) if (NOT WIN32)
# there is no official support of warpctc, nccl, cupti in windows # there is no official support of nccl, cupti in windows
include(external/warpctc) # download, build, install warpctc
include(cupti) include(cupti)
include(external/gzstream) include(external/gzstream)
endif (NOT WIN32) endif (NOT WIN32)
......
...@@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" ...@@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
# Used in unit test test_WarpCTCLayer # Used in unit test test_WarpCTCLayer
SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
CACHE PATH "Warp-ctc Library Directory" FORCE) CACHE PATH "Warp-ctc Library Directory" FORCE)
SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "Warp-ctc Library" FORCE)
IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32)
SET(USE_OMP OFF) SET(USE_OMP OFF)
ELSE() ELSE()
SET(USE_OMP ON) SET(USE_OMP ON)
ENDIF() ENDIF()
IF(WIN32)
SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git")
ELSE()
SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git")
ENDIF()
ExternalProject_Add( ExternalProject_Add(
extern_warpctc extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git" GIT_REPOSITORY ${WARPCTC_REPOSITORY}
PREFIX ${WARPCTC_SOURCES_DIR} PREFIX ${WARPCTC_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU} -DWITH_GPU=${WITH_GPU}
-DWITH_OMP=${USE_OMP} -DWITH_OMP=${USE_OMP}
...@@ -59,6 +67,18 @@ ExternalProject_Add( ...@@ -59,6 +67,18 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
) )
IF(WIN32)
IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}")
add_custom_command(TARGET extern_warpctc POST_BUILD
COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}
)
ENDIF()
SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "Warp-ctc Library" FORCE)
else(WIN32)
SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "Warp-ctc Library" FORCE)
ENDIF(WIN32)
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
......
...@@ -84,7 +84,7 @@ function(op_library TARGET) ...@@ -84,7 +84,7 @@ function(op_library TARGET)
endif() endif()
if (WIN32) if (WIN32)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops. # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}") if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return() return()
endif() endif()
......
...@@ -350,6 +350,22 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b ...@@ -350,6 +350,22 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.__init__ ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.delete ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.download ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.HDFSClient.is_dir ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.HDFSClient.is_exist ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.HDFSClient.ls ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.lsr ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True))
paddle.fluid.contrib.HDFSClient.make_local_dirs ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.makedirs ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.rename ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.contrib.HDFSClient.upload ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5))
paddle.fluid.contrib.multi_download ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,))
paddle.fluid.contrib.multi_upload ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True))
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
...@@ -376,7 +392,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin ...@@ -376,7 +392,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
......
...@@ -131,9 +131,7 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy( ...@@ -131,9 +131,7 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
std::unique_ptr<ir::Graph> BuildStrategy::Apply( std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places, const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
#else #else
...@@ -149,9 +147,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -149,9 +147,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places); pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
pass->Erase("loss_var_name"); pass->Erase("loss_var_name");
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name); pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
pass->Erase("params");
pass->SetNotOwned<const std::unordered_set<std::string>>("params",
&param_names);
pass->Erase("local_scopes"); pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes", pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes); &local_scopes);
......
...@@ -106,16 +106,15 @@ struct BuildStrategy { ...@@ -106,16 +106,15 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be // Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph. // applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply( std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::vector<platform::Place> &places, const std::string &loss_var_name,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; const bool use_cuda,
platform::NCCLContextMap *nccl_ctxs) const;
#else #else
const bool use_cuda) const; const bool use_cuda) const;
#endif #endif
private: private:
......
...@@ -130,7 +130,6 @@ void AddOutputToLeafOps(ir::Graph *graph) { ...@@ -130,7 +130,6 @@ void AddOutputToLeafOps(ir::Graph *graph) {
static const char kLossVarName[] = "loss_var_name"; static const char kLossVarName[] = "loss_var_name";
static const char kPlaces[] = "places"; static const char kPlaces[] = "places";
static const char kParams[] = "params";
static const char kLocalScopes[] = "local_scopes"; static const char kLocalScopes[] = "local_scopes";
static const char kStrategy[] = "strategy"; static const char kStrategy[] = "strategy";
static const char kNumTrainers[] = "num_trainers"; static const char kNumTrainers[] = "num_trainers";
...@@ -147,9 +146,6 @@ void MultiDevSSAGraphBuilder::Init() const { ...@@ -147,9 +146,6 @@ void MultiDevSSAGraphBuilder::Init() const {
nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs"); nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
#endif #endif
for (auto &p : Get<const std::unordered_set<std::string>>(kParams)) {
grad_names_.insert(GradVarName(p));
}
balance_vars_.resize(places_.size(), 0); balance_vars_.resize(places_.size(), 0);
if (strategy_.enable_data_balance_ && places_.size() == 1) { if (strategy_.enable_data_balance_ && places_.size() == 1) {
LOG(WARNING) << "It is no need to enable data balance when there is only " LOG(WARNING) << "It is no need to enable data balance when there is only "
...@@ -896,7 +892,6 @@ REGISTER_PASS(multi_devices_pass, ...@@ -896,7 +892,6 @@ REGISTER_PASS(multi_devices_pass,
paddle::framework::details::MultiDevSSAGraphBuilder) paddle::framework::details::MultiDevSSAGraphBuilder)
.RequirePassAttr(paddle::framework::details::kLossVarName) .RequirePassAttr(paddle::framework::details::kLossVarName)
.RequirePassAttr(paddle::framework::details::kPlaces) .RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kParams)
.RequirePassAttr(paddle::framework::details::kLocalScopes) .RequirePassAttr(paddle::framework::details::kLocalScopes)
.RequirePassAttr(paddle::framework::details::kStrategy) .RequirePassAttr(paddle::framework::details::kStrategy)
.RequirePassAttr(paddle::framework::details::kNumTrainers); .RequirePassAttr(paddle::framework::details::kNumTrainers);
...@@ -102,7 +102,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -102,7 +102,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
mutable std::string loss_var_name_; mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_; mutable std::vector<platform::Place> places_;
mutable std::vector<Scope *> local_scopes_; mutable std::vector<Scope *> local_scopes_;
mutable std::unordered_set<std::string> grad_names_;
mutable BuildStrategy strategy_; mutable BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_; mutable std::unordered_map<std::string, VarDesc *> all_vars_;
......
...@@ -24,35 +24,6 @@ namespace paddle { ...@@ -24,35 +24,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
// The function keeps the graph consistent by replacing
// a node 'from' in the set of inputs nodes
// of the visited node by a node 'to'.
void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
for (auto& node : GraphTraits::DFS(*graph)) {
auto from_in_inputs =
std::find(std::begin(node.inputs), std::end(node.inputs), from);
if (from_in_inputs != std::end(node.inputs)) {
IR_NODE_LINK_TO(to, (&node));
auto inputs = node.Op()->Inputs();
using input_type = VariableNameMap::value_type;
std::for_each(std::begin(inputs), std::end(inputs),
[from, to, &node](const input_type& i) -> void {
auto param_names = i.second;
auto pi = std::find(std::begin(param_names),
std::end(param_names), from->Name());
if (pi != std::end(param_names)) {
node.Op()->SetInput(i.first, {to->Name()});
}
});
}
}
}
bool IsReachable(ir::Graph* graph, Node* from, Node* to) { bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
auto find_node = [](ir::Graph* graph, const Node* node) -> Node* { auto find_node = [](ir::Graph* graph, const Node* node) -> Node* {
for (auto n : graph->Nodes()) { for (auto n : graph->Nodes()) {
...@@ -99,25 +70,12 @@ bool IsReachable(ir::Graph* graph, Node* from, Node* to) { ...@@ -99,25 +70,12 @@ bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
return false; return false;
} }
boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name) { template <typename T>
auto bias_input_names = op.Op()->Inputs(); boost::optional<T> HasAttribute(const Node& op, const std::string& attr) {
auto bias_it = bias_input_names.find(bias_name); if (op.Op()->HasAttr(attr))
return boost::get<T>(op.Op()->GetAttr(attr));
if (bias_it != std::end(bias_input_names)) { else
bool has_bias = !bias_it->second.empty(); return boost::none;
if (has_bias) {
auto bias_names = bias_it->second;
auto bias_names_it =
std::find_if(std::begin(op.inputs), std::end(op.inputs),
[&bias_names](Node* n) -> bool {
return n->Name() == bias_names[0];
});
return *bias_names_it;
}
}
return boost::none;
} }
ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle(
...@@ -151,40 +109,18 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( ...@@ -151,40 +109,18 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()(
if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; if (!IsReachable(graph, elementwise_add_identity, conv_output)) return;
OpDesc op_desc; auto fuse_relu = HasAttribute<bool>(*conv_op, "fuse_relu");
op_desc.SetType("conv2d"); if (fuse_relu && *fuse_relu) return;
op_desc.SetInput("Input", {conv_input->Name()});
op_desc.SetInput("Filter", {conv_filter->Name()});
op_desc.SetInput("ResidualData", {elementwise_add_identity->Name()});
op_desc.SetOutput("Output", {conv_output->Name()});
auto conv_bias = HasBias(*conv_op, "Bias"); conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()});
conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
conv_op->Op()->SetAttr("fuse_residual_connection", true);
if (conv_bias) { GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op});
op_desc.SetInput("Bias", {(*conv_bias)->Name()});
}
for (const auto& attr : conv_op->Op()->GetAttrMap()) {
op_desc.SetAttr(attr.first, attr.second);
}
op_desc.SetAttr("fuse_residual_connection", true);
auto fused_conv_op = graph->CreateOpNode(&op_desc); IR_NODE_LINK_TO(elementwise_add_identity, conv_op);
IR_NODE_LINK_TO(conv_op, elementwise_add_out);
IR_NODE_LINK_TO(conv_input, fused_conv_op);
IR_NODE_LINK_TO(conv_filter, fused_conv_op);
IR_NODE_LINK_TO(elementwise_add_identity, fused_conv_op);
IR_NODE_LINK_TO(fused_conv_op, conv_output);
if (conv_bias) {
IR_NODE_LINK_TO((*conv_bias), fused_conv_op);
}
CorrectGraphEdges(graph, elementwise_add_out, conv_output);
GraphSafeRemoveNodes(graph,
{elementwise_add_out, conv_op, elementwise_add_op});
(*fusion_stats)++; (*fusion_stats)++;
} }
...@@ -229,60 +165,33 @@ void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( ...@@ -229,60 +165,33 @@ void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()(
Node* projection_node; Node* projection_node;
Node* residual_conv_op; Node* residual_conv_op;
Node* residual_conv_input;
Node* residual_conv_filter;
Node* residual_conv_output; Node* residual_conv_output;
if (IsReachable(graph, conv_x_input, conv_y_output)) { if (IsReachable(graph, conv_x_input, conv_y_output)) {
projection_node = conv_x_output; projection_node = conv_x_output;
residual_conv_op = conv_y_op; residual_conv_op = conv_y_op;
residual_conv_input = conv_y_input;
residual_conv_filter = conv_y_filter;
residual_conv_output = conv_y_output; residual_conv_output = conv_y_output;
} else if (IsReachable(graph, conv_y_input, conv_x_output)) { } else if (IsReachable(graph, conv_y_input, conv_x_output)) {
projection_node = conv_y_output; projection_node = conv_y_output;
residual_conv_op = conv_x_op; residual_conv_op = conv_x_op;
residual_conv_input = conv_x_input;
residual_conv_filter = conv_x_filter;
residual_conv_output = conv_x_output; residual_conv_output = conv_x_output;
} else { } else {
return; return;
} }
OpDesc op_desc; auto fuse_relu = HasAttribute<bool>(*residual_conv_op, "fuse_relu");
op_desc.SetType("conv2d"); if (fuse_relu && *fuse_relu) return;
op_desc.SetInput("Input", {residual_conv_input->Name()}); residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()});
op_desc.SetInput("Filter", {residual_conv_filter->Name()}); residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
op_desc.SetInput("ResidualData", {projection_node->Name()});
op_desc.SetOutput("Output", {residual_conv_output->Name()});
auto residual_conv_bias = HasBias(*residual_conv_op, "Bias"); residual_conv_op->Op()->SetAttr("fuse_residual_connection", true);
if (residual_conv_bias) { GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op});
op_desc.SetInput("Bias", {(*residual_conv_bias)->Name()});
}
for (const auto& attr : residual_conv_op->Op()->GetAttrMap()) {
op_desc.SetAttr(attr.first, attr.second);
}
op_desc.SetAttr("fuse_residual_connection", true);
auto fused_conv_op = graph->CreateOpNode(&op_desc); IR_NODE_LINK_TO(projection_node, residual_conv_op);
IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out);
IR_NODE_LINK_TO(residual_conv_input, fused_conv_op);
IR_NODE_LINK_TO(residual_conv_filter, fused_conv_op);
IR_NODE_LINK_TO(projection_node, fused_conv_op);
IR_NODE_LINK_TO(fused_conv_op, residual_conv_output);
if (residual_conv_bias) {
IR_NODE_LINK_TO((*residual_conv_bias), fused_conv_op);
}
CorrectGraphEdges(graph, elementwise_add_out, residual_conv_output);
GraphSafeRemoveNodes(
graph, {elementwise_add_out, residual_conv_op, elementwise_add_op});
(*fusion_stats)++; (*fusion_stats)++;
} }
......
...@@ -16,100 +16,25 @@ limitations under the License. */ ...@@ -16,100 +16,25 @@ limitations under the License. */
#include <functional> #include <functional>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/ngraph/ngraph_ops.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/ngraph_helper.h"
#include "ngraph/ngraph.hpp"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
static std::shared_ptr<ngraph::Node> GetNode(
const std::shared_ptr<OperatorBase>& op, const std::string name,
const VariableNameMap& var_map,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = var_map.at(name);
PADDLE_ENFORCE_EQ(var_names.size(), 1,
"op %s name %s expects one associated var", op->Type(),
name);
if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
return (*ngb_node_map)[var_names[0]];
} else {
return nullptr;
}
}
static std::shared_ptr<ngraph::Node> GetInputNode(
const std::shared_ptr<OperatorBase>& op, const std::string name,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, name, op->Inputs(), ngb_node_map);
}
static std::shared_ptr<ngraph::Node> GetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string name,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, name, op->Outputs(), ngb_node_map);
}
static void SetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string name,
std::shared_ptr<ngraph::Node> node,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = op->Outputs().at(name);
if (var_names.size() == 1) {
(*ngb_node_map)[var_names[0]] = node;
} else if (var_names.size() == 0) {
(*ngb_node_map)[""] = node;
} else {
PADDLE_THROW("name %s has more than 1 var_names.", name);
}
}
static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
const std::string name) {
auto& outputs = op->Outputs();
if (outputs.find(name) == outputs.end()) return false;
return outputs.at(name).size() > 0;
}
template <typename T>
static void BuildBinaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = GetInputNode(op, "X", ngb_node_map);
auto y = GetInputNode(op, "Y", ngb_node_map);
auto out = std::make_shared<T>(x, y);
SetOutputNode(op, "Out", out, ngb_node_map);
}
template <typename T>
static void BuildUnaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = GetInputNode(op, "X", ngb_node_map);
auto out = std::make_shared<T>(input);
SetOutputNode(op, "Out", out, ngb_node_map);
}
std::map<std::string, std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&, std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map< std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>}, NgraphBridge::NG_NODE_MAP = {
{"tanh", BuildUnaryNode<ngraph::op::Tanh>}}; {"mul", paddle::operators::ngraphs::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}};
void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) { void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type(); auto& op_type = op->Type();
......
...@@ -278,7 +278,8 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ = ...@@ -278,7 +278,8 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
ngraph::runtime::Backend::create("CPU"); ngraph::runtime::Backend::create("CPU");
void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) { void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
op->RuntimeInferShape(scope_, place_); RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
op->RuntimeInferShape(scope_, place_, ctx);
for (auto& var_name_item : op->Inputs()) { for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name); auto* var = scope_.FindVar(var_name);
......
...@@ -110,22 +110,125 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -110,22 +110,125 @@ class CompileTimeInferShapeContext : public InferShapeContext {
} }
} }
std::vector<InferShapeVarPtr> GetInputVarPtrs(
const std::string &name) override {
const std::vector<std::string> arg_names = Inputs(name);
std::vector<InferShapeVarPtr> res;
res.reserve(arg_names.size());
std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res),
[this](const std::string &name) {
return block_.FindVarRecursive(name);
});
return res;
}
std::vector<InferShapeVarPtr> GetOutputVarPtrs(
const std::string &name) override {
const std::vector<std::string> arg_names = Outputs(name);
std::vector<InferShapeVarPtr> res;
res.reserve(arg_names.size());
std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res),
[this](const std::string &name) {
return block_.FindVarRecursive(name);
});
return res;
}
DDim GetInputDim(const std::string &name) const override {
const std::vector<std::string> &arg_names = Inputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Input(%s) should hold one element, but now it holds %d",
name, arg_names.size());
return this->GetDim(arg_names[0]);
}
std::vector<DDim> GetInputsDim(const std::string &name) const override {
const std::vector<std::string> &arg_names = Inputs(name);
return GetDims(arg_names);
}
bool IsRuntime() const override; bool IsRuntime() const override;
std::vector<proto::VarType::Type> GetInputsVarType(
const std::string &name) const override {
return GetVarTypes(Inputs(name));
}
std::vector<proto::VarType::Type> GetOutputsVarType(
const std::string &name) const override {
return GetVarTypes(Outputs(name));
}
void SetOutputDim(const std::string &name, const DDim &dim) override {
auto &arg_names = Outputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Output(%s) should hold one element, but now it holds %d",
name, arg_names.size());
SetDim(arg_names[0], dim);
}
void SetOutputsDim(const std::string &name,
const std::vector<DDim> &dims) override {
auto &names = Outputs(name);
SetDims(names, dims);
}
protected: protected:
proto::VarType::Type GetVarType(const std::string &name) const override; std::vector<proto::VarType::Type> GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<proto::VarType::Type> retv;
retv.resize(names.size());
std::transform(
names.begin(), names.end(), retv.begin(),
std::bind(std::mem_fn(&CompileTimeInferShapeContext::GetVarType), this,
std::placeholders::_1));
return retv;
}
proto::VarType::Type GetVarType(const std::string &name) const;
DDim GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
DDim res;
try {
auto shape = var->GetShape();
res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
return res;
}
DDim GetDim(const std::string &name) const override; std::vector<DDim> GetDims(const std::vector<std::string> &names) const {
std::vector<DDim> ret;
ret.reserve(names.size());
std::transform(
names.begin(), names.end(), std::back_inserter(ret),
[this](const std::string &name) { return this->GetDim(name); });
return ret;
}
void SetDim(const std::string &name, const DDim &dim);
void SetDim(const std::string &name, const DDim &dim) override; void SetDims(const std::vector<std::string> &names,
const std::vector<DDim> &dims) {
size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) {
if (names[i] == framework::kEmptyVarName) {
continue;
}
SetDim(names[i], dims[i]);
}
}
std::vector<DDim> GetRepeatedDims(const std::string &name) const override; std::vector<DDim> GetRepeatedDims(const std::string &name) const override;
void SetRepeatedDims(const std::string &name, void SetRepeatedDims(const std::string &name,
const std::vector<DDim> &dims) override; const std::vector<DDim> &dims) override;
InferShapeVarPtr GetVarPtr(const std::string &name) override;
const OpDesc &op_; const OpDesc &op_;
const BlockDesc &block_; const BlockDesc &block_;
}; };
...@@ -644,20 +747,6 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs( ...@@ -644,20 +747,6 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
return op_.Output(name); return op_.Output(name);
} }
DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
DDim res;
try {
auto shape = var->GetShape();
res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
return res;
}
std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims( std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
const std::string &name) const { const std::string &name) const {
auto var = block_.FindVarRecursive(name); auto var = block_.FindVarRecursive(name);
...@@ -696,10 +785,5 @@ proto::VarType::Type CompileTimeInferShapeContext::GetVarType( ...@@ -696,10 +785,5 @@ proto::VarType::Type CompileTimeInferShapeContext::GetVarType(
return block_.FindVarRecursive(name)->GetType(); return block_.FindVarRecursive(name)->GetType();
} }
InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr(
const std::string &name) {
return block_.FindVarRecursive(name);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -137,6 +137,25 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { ...@@ -137,6 +137,25 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
} }
} }
RuntimeContext::RuntimeContext(const VariableNameMap& innames,
const VariableNameMap& outnames,
const Scope& scope) {
for (auto& var_name_item : innames) {
std::vector<Variable*>& input_vars = inputs[var_name_item.first];
input_vars.reserve(var_name_item.second.size());
for (auto& var_name : var_name_item.second) {
input_vars.push_back(scope.FindVar(var_name));
}
}
for (auto& var_name_item : outnames) {
std::vector<Variable*>& output_vars = outputs[var_name_item.first];
output_vars.reserve(var_name_item.second.size());
for (auto& var_name : var_name_item.second) {
output_vars.push_back(scope.FindVar(var_name));
}
}
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) { void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(4) << place << " " << DebugStringEx(&scope); VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
...@@ -412,11 +431,48 @@ bool ExecutionContext::HasOutput(const std::string& name) const { ...@@ -412,11 +431,48 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
return var != nullptr; return var != nullptr;
} }
const Variable* ExecutionContext::InputVar(const std::string& name) const {
auto it = ctx_.inputs.find(name);
if (it == ctx_.inputs.end()) return nullptr;
PADDLE_ENFORCE_LE(it->second.size(), 1UL,
"Operator %s's input %s should contain only one variable.",
op_.Type(), name);
return it->second.empty() ? nullptr : it->second[0];
}
const Variable* ExecutionContext::LegacyInputVar(
const std::string& name) const {
auto ipt = op_.Input(name);
return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
}
Variable* ExecutionContext::OutputVar(const std::string& name) const {
auto it = ctx_.outputs.find(name);
if (it == ctx_.outputs.end()) return nullptr;
PADDLE_ENFORCE_LE(it->second.size(), 1UL,
"Operator %s's output %s should contain only one variable.",
op_.Type(), name);
return it->second.empty() ? nullptr : it->second[0];
}
Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
auto opt = op_.Output(name);
return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
}
template <> template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const { const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
return Input<LoDTensor>(name); return Input<LoDTensor>(name);
} }
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const {
return LegacyInput<LoDTensor>(name);
}
template <> template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>( const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const { const std::string& name) const {
...@@ -441,6 +497,11 @@ Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const { ...@@ -441,6 +497,11 @@ Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
return Output<LoDTensor>(name); return Output<LoDTensor>(name);
} }
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
return LegacyOutput<LoDTensor>(name);
}
template <> template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>( std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const { const std::string& name) const {
...@@ -477,51 +538,48 @@ bool OpSupportGPU(const std::string& op_type) { ...@@ -477,51 +538,48 @@ bool OpSupportGPU(const std::string& op_type) {
class RuntimeInferShapeContext : public InferShapeContext { class RuntimeInferShapeContext : public InferShapeContext {
public: public:
RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
: op_(op), scope_(scope) {} const RuntimeContext& ctx)
: op_(op), scope_(scope), ctx_(ctx) {}
bool HasInput(const std::string& name) const override { bool HasInput(const std::string& name) const override {
// has only one input // has only one input
const auto& ins = op_.Inputs(); const auto& ins = ctx_.inputs;
auto it = ins.find(name); auto it = ins.find(name);
if (it == ins.end()) { if (it == ins.end()) {
return false; return false;
} }
const auto& in = it->second; const auto& in = it->second;
if (in.size() == 0 || in[0] == kEmptyVarName) { if (in.size() == 0) return false;
return false;
}
PADDLE_ENFORCE_EQ(in.size(), 1UL, PADDLE_ENFORCE_EQ(in.size(), 1UL,
"Input %s should not have more than one inputs", name); "Input %s should not have more than one inputs", name);
return scope_.FindVar(in[0]) != nullptr; return in[0] != nullptr;
} }
bool HasOutput(const std::string& name) const override { bool HasOutput(const std::string& name) const override {
// has only one output // has only one output
const auto& outs = op_.Outputs(); const auto& outs = ctx_.outputs;
auto it = outs.find(name); auto it = outs.find(name);
if (it == outs.end()) { if (it == outs.end()) {
return false; return false;
} }
const auto& out = it->second; const auto& out = it->second;
if (out.size() == 0 || out[0] == kEmptyVarName) { if (out.size() == 0) {
return false; return false;
} }
PADDLE_ENFORCE_EQ(out.size(), 1UL, PADDLE_ENFORCE_EQ(out.size(), 1UL,
"Output %s should not have more than one outputs", name); "Output %s should not have more than one outputs", name);
return scope_.FindVar(out[0]) != nullptr; return out[0] != nullptr;
} }
bool HasInputs(const std::string& name) const override { bool HasInputs(const std::string& name) const override {
if (!op_.HasInputs(name)) { const auto& ins = ctx_.inputs;
return false; auto it = ins.find(name);
} if (it == ins.end() || it->second.empty()) {
auto inputs = op_.Inputs(name);
if (inputs.empty()) {
return false; return false;
} }
for (auto& input : inputs) { for (auto& input : it->second) {
if (scope_.FindVar(input) == nullptr) { if (input == nullptr) {
return false; return false;
} }
} }
...@@ -529,15 +587,13 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -529,15 +587,13 @@ class RuntimeInferShapeContext : public InferShapeContext {
} }
bool HasOutputs(const std::string& name) const override { bool HasOutputs(const std::string& name) const override {
if (!op_.HasOutputs(name)) { const auto& outs = ctx_.outputs;
return false; auto it = outs.find(name);
} if (it == outs.end() || it->second.empty()) {
auto outputs = op_.Outputs(name);
if (outputs.empty()) {
return false; return false;
} }
for (auto& output : outputs) { for (auto& output : it->second) {
if (scope_.FindVar(output) == nullptr) { if (output == nullptr) {
return false; return false;
} }
} }
...@@ -558,16 +614,18 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -558,16 +614,18 @@ class RuntimeInferShapeContext : public InferShapeContext {
void ShareDim(const std::string& in, const std::string& out, size_t i = 0, void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) override { size_t j = 0) override {
PADDLE_ENFORCE_LT(i, Inputs(in).size()); auto in_it = ctx_.inputs.find(in);
PADDLE_ENFORCE_LT(j, Outputs(out).size()); auto out_it = ctx_.outputs.find(out);
const std::string& input_n = Inputs(in)[i]; PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
const std::string& output_n = Outputs(out)[j]; "Inputs %s should have %llu argument", in, i);
PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
"Outputs %s should have %llu argument", out, j);
Variable* in_var = in_it->second[i];
Variable* out_var = out_it->second[j];
Variable* in_var = scope_.FindVar(input_n);
Variable* out_var = scope_.FindVar(output_n);
PADDLE_ENFORCE(in_var->Type() == out_var->Type(), PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
"The type of %s and %s is not the same.", output_n, "The type of %s and %s is not the same.", in, out);
GetDim(input_n));
if (in_var->IsType<framework::SelectedRows>()) { if (in_var->IsType<framework::SelectedRows>()) {
auto& in_sele_rows = in_var->Get<framework::SelectedRows>(); auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
...@@ -588,13 +646,16 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -588,13 +646,16 @@ class RuntimeInferShapeContext : public InferShapeContext {
void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) const override { size_t j = 0) const override {
const std::vector<std::string>& inputs = Inputs(in); auto in_it = ctx_.inputs.find(in);
const std::vector<std::string>& outputs = Outputs(out); auto out_it = ctx_.outputs.find(out);
PADDLE_ENFORCE_LT(i, inputs.size()); PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
PADDLE_ENFORCE_LT(j, outputs.size()); "Inputs %s should have %llu argument", in, i);
Variable* in_var = scope_.FindVar(inputs.at(i)); PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
"Outputs %s should have %llu argument", out, j);
Variable* in_var = in_it->second.at(i);
if (!in_var->IsType<LoDTensor>()) return; if (!in_var->IsType<LoDTensor>()) return;
Variable* out_var = scope_.FindVar(outputs.at(j)); Variable* out_var = out_it->second.at(j);
PADDLE_ENFORCE(out_var->IsType<LoDTensor>(), PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
"The %d-th output of Output(%s) must be LoDTensor.", j, out); "The %d-th output of Output(%s) must be LoDTensor.", j, out);
auto in_tensor = in_var->Get<LoDTensor>(); auto in_tensor = in_var->Get<LoDTensor>();
...@@ -629,9 +690,64 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -629,9 +690,64 @@ class RuntimeInferShapeContext : public InferShapeContext {
bool IsRuntime() const override { return true; } bool IsRuntime() const override { return true; }
// TODO(paddle-dev): Can this be template?
std::vector<InferShapeVarPtr> GetInputVarPtrs(
const std::string& name) override {
const std::vector<Variable*>& vars = InputVars(name);
std::vector<InferShapeVarPtr> res;
res.reserve(vars.size());
res.insert(res.begin(), vars.begin(), vars.end());
return res;
}
std::vector<InferShapeVarPtr> GetOutputVarPtrs(
const std::string& name) override {
const std::vector<Variable*>& vars = OutputVars(name);
std::vector<InferShapeVarPtr> res;
res.reserve(vars.size());
res.insert(res.begin(), vars.begin(), vars.end());
return res;
}
DDim GetInputDim(const std::string& name) const override {
const std::vector<Variable*>& vars = InputVars(name);
PADDLE_ENFORCE_EQ(vars.size(), 1UL,
"Input(%s) should hold one element, but now it holds %d",
name, vars.size());
return this->GetDim(vars[0]);
}
std::vector<DDim> GetInputsDim(const std::string& name) const override {
const std::vector<Variable*>& vars = InputVars(name);
return GetDims(vars);
}
std::vector<proto::VarType::Type> GetInputsVarType(
const std::string& name) const override {
return GetVarTypes(InputVars(name));
}
std::vector<proto::VarType::Type> GetOutputsVarType(
const std::string& name) const override {
return GetVarTypes(OutputVars(name));
}
void SetOutputDim(const std::string& name, const DDim& dim) override {
auto& vars = OutputVars(name);
PADDLE_ENFORCE_EQ(vars.size(), 1UL,
"Output(%s) should hold one element, but now it holds %d",
name, vars.size());
SetDim(vars[0], dim);
}
void SetOutputsDim(const std::string& name,
const std::vector<DDim>& dims) override {
auto& vars = OutputVars(name);
SetDims(vars, dims);
}
protected: protected:
DDim GetDim(const std::string& name) const override { DDim GetDim(Variable* var) const {
Variable* var = scope_.FindVar(name);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(var);
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().dims(); return var->Get<LoDTensor>().dims();
...@@ -639,25 +755,44 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -639,25 +755,44 @@ class RuntimeInferShapeContext : public InferShapeContext {
return var->Get<SelectedRows>().GetCompleteDims(); return var->Get<SelectedRows>().GetCompleteDims();
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " "Only LoDTensor/SelectedRows support 'GetDim', but Variables "
"type_id is %s.", "type_id is %s.",
name, var->Type().name()); var->Type().name());
} }
} }
std::vector<DDim> GetDims(const std::vector<Variable*>& vars) const {
std::vector<DDim> ret;
ret.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(ret),
[this](Variable* var) { return this->GetDim(var); });
return ret;
}
std::vector<DDim> GetRepeatedDims(const std::string& name) const override { std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
PADDLE_THROW("Only compile time support this method"); PADDLE_THROW("Only compile time support this method");
} }
void SetDim(const std::string& name, const DDim& dim) override { void SetDim(Variable* var, const DDim& dim) {
Variable* var = scope_.FindVar(name);
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
var->GetMutable<LoDTensor>()->Resize(dim); var->GetMutable<LoDTensor>()->Resize(dim);
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
var->GetMutable<SelectedRows>()->set_height(dim[0]); var->GetMutable<SelectedRows>()->set_height(dim[0]);
} else { } else {
PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
name, var->Type().name()); var->Type().name());
}
}
void SetDims(const std::vector<Variable*>& vars,
const std::vector<DDim>& dims) {
size_t length = vars.size();
PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) {
if (vars[i] == nullptr) {
continue;
}
SetDim(vars[i], dims[i]);
} }
} }
...@@ -666,18 +801,39 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -666,18 +801,39 @@ class RuntimeInferShapeContext : public InferShapeContext {
PADDLE_THROW("Only compile time support this method"); PADDLE_THROW("Only compile time support this method");
} }
proto::VarType::Type GetVarType(const std::string& name) const override { std::vector<proto::VarType::Type> GetVarTypes(
auto* var = scope_.FindVar(name); const std::vector<Variable*>& vars) const {
return ToVarType(var->Type()); std::vector<proto::VarType::Type> retv;
retv.resize(vars.size());
std::transform(vars.begin(), vars.end(), retv.begin(),
std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType),
this, std::placeholders::_1));
return retv;
} }
InferShapeVarPtr GetVarPtr(const std::string& name) override { proto::VarType::Type GetVarType(Variable* var) const {
return scope_.FindVar(name); return ToVarType(var->Type());
} }
private: private:
const std::vector<Variable*>& InputVars(const std::string& name) const {
auto it = ctx_.inputs.find(name);
PADDLE_ENFORCE(it != ctx_.inputs.end(),
"Operator %s does not have the input %s.", op_.Type(), name);
return it->second;
}
const std::vector<Variable*>& OutputVars(const std::string& name) const {
auto it = ctx_.outputs.find(name);
PADDLE_ENFORCE(it != ctx_.outputs.end(),
"Operator %s does not have the outputs %s.", op_.Type(),
name);
return it->second;
}
const OperatorBase& op_; const OperatorBase& op_;
const Scope& scope_; const Scope& scope_;
const RuntimeContext& ctx_;
}; };
static void CheckTensorNANOrInf(const std::string& name, static void CheckTensorNANOrInf(const std::string& name,
...@@ -696,15 +852,15 @@ static void CheckTensorNANOrInf(const std::string& name, ...@@ -696,15 +852,15 @@ static void CheckTensorNANOrInf(const std::string& name,
} }
void OperatorWithKernel::RuntimeInferShape(const Scope& scope, void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
const platform::Place& place) const { const platform::Place& place,
RuntimeInferShapeContext infer_shape_ctx(*this, scope); const RuntimeContext& ctx) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
} }
void OperatorWithKernel::RunImpl(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const { const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope); RuntimeContext ctx(Inputs(), Outputs(), scope);
this->InferShape(&infer_shape_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place); auto* dev_ctx = pool.Get(place);
...@@ -718,15 +874,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -718,15 +874,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelMap& kernels = kernels_iter->second; OpKernelMap& kernels = kernels_iter->second;
// TODO(dzhwinter) : kernel fallback mechanism will be added when all the auto expected_kernel_key = this->GetExpectedKernelType(
// transform functions are ready. ExecutionContext(*this, scope, *dev_ctx, ctx));
// for (auto& candidate : kKernelPriority) {
// Do selection
// }
auto expected_kernel_key =
this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key; VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key); auto kernel_iter = kernels.find(expected_kernel_key);
...@@ -748,7 +897,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -748,7 +897,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// do data transformScope &transfer_scope; // do data transformScope &transfer_scope;
std::vector<std::string> transfered_inplace_vars; std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope = auto* transfer_scope =
TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
// exec scope is the scope that kernel actually executed on. // exec scope is the scope that kernel actually executed on.
const Scope& exec_scope = const Scope& exec_scope =
...@@ -758,7 +907,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -758,7 +907,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
dev_ctx = pool.Get(expected_kernel_key.place_); dev_ctx = pool.Get(expected_kernel_key.place_);
} }
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
this->InferShape(&infer_shape_ctx);
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));
if (!transfered_inplace_vars.empty()) { if (!transfered_inplace_vars.empty()) {
// there is inplace variable has been transfered. // there is inplace variable has been transfered.
...@@ -782,6 +935,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -782,6 +935,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
} }
} }
} }
void OperatorWithKernel::TransferInplaceVarsBack( void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars, const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const { const Scope& transfer_scope) const {
...@@ -797,13 +951,18 @@ void OperatorWithKernel::TransferInplaceVarsBack( ...@@ -797,13 +951,18 @@ void OperatorWithKernel::TransferInplaceVarsBack(
} }
} }
Scope* OperatorWithKernel::TryTransferData( Scope* OperatorWithKernel::PrepareData(
const Scope& scope, const OpKernelType& expected_kernel_key, const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const { std::vector<std::string>* transfered_inplace_vars,
RuntimeContext* ctx) const {
Scope* new_scope = nullptr; Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) { for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) { std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
auto* var = scope.FindVar(var_name);
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto& var_name = var_name_item.second[i];
auto* var = input_vars[i];
// Only tensor can be tranfer to another device. // Only tensor can be tranfer to another device.
if (var == nullptr || !VarIsTensor(*var)) { if (var == nullptr || !VarIsTensor(*var)) {
continue; continue;
...@@ -851,6 +1010,7 @@ Scope* OperatorWithKernel::TryTransferData( ...@@ -851,6 +1010,7 @@ Scope* OperatorWithKernel::TryTransferData(
} }
auto* trans_var = new_scope->Var(var_name); auto* trans_var = new_scope->Var(var_name);
input_vars[i] = trans_var;
Tensor out; Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
......
...@@ -70,6 +70,15 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); ...@@ -70,6 +70,15 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
class OperatorBase; class OperatorBase;
class ExecutionContext; class ExecutionContext;
class RuntimeContext {
public:
RuntimeContext(const VariableNameMap& innames,
const VariableNameMap& outnames, const Scope& scope);
VariableValueMap inputs;
VariableValueMap outputs;
};
/** /**
* OperatorBase has the basic elements that Net will call to do computation. * OperatorBase has the basic elements that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User * Only CreateOperator from OpRegistry will new Operator directly. User
...@@ -129,7 +138,8 @@ class OperatorBase { ...@@ -129,7 +138,8 @@ class OperatorBase {
void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
virtual void RuntimeInferShape(const Scope& scope, virtual void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {} const platform::Place& place,
const RuntimeContext& ctx) const {}
protected: protected:
std::string type_; std::string type_;
...@@ -156,8 +166,9 @@ class OperatorBase { ...@@ -156,8 +166,9 @@ class OperatorBase {
class ExecutionContext { class ExecutionContext {
public: public:
ExecutionContext(const OperatorBase& op, const Scope& scope, ExecutionContext(const OperatorBase& op, const Scope& scope,
const platform::DeviceContext& device_context) const platform::DeviceContext& device_context,
: op_(op), scope_(scope), device_context_(device_context) {} const RuntimeContext& ctx)
: op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {}
const OperatorBase& op() const { return op_; } const OperatorBase& op() const { return op_; }
...@@ -180,15 +191,9 @@ class ExecutionContext { ...@@ -180,15 +191,9 @@ class ExecutionContext {
return op_.Outputs(name).size(); return op_.Outputs(name).size();
} }
const Variable* InputVar(const std::string& name) const { const Variable* InputVar(const std::string& name) const;
auto ipt = op_.Input(name);
return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
}
Variable* OutputVar(const std::string& name) const { Variable* OutputVar(const std::string& name) const;
auto opt = op_.Output(name);
return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
}
const std::vector<const Variable*> MultiInputVar( const std::vector<const Variable*> MultiInputVar(
const std::string& name) const { const std::string& name) const {
...@@ -227,6 +232,22 @@ class ExecutionContext { ...@@ -227,6 +232,22 @@ class ExecutionContext {
return var == nullptr ? nullptr : var->GetMutable<T>(); return var == nullptr ? nullptr : var->GetMutable<T>();
} }
template <typename T>
const T* LegacyInput(const std::string& name) const {
auto* var = LegacyInputVar(name);
return var == nullptr ? nullptr : &var->Get<T>();
}
template <typename T>
T* LegacyOutput(const std::string& name) const {
auto var = LegacyOutputVar(name);
return var == nullptr ? nullptr : var->GetMutable<T>();
}
const Variable* LegacyInputVar(const std::string& name) const;
Variable* LegacyOutputVar(const std::string& name) const;
template <typename T> template <typename T>
const std::vector<const T*> MultiInput(const std::string& name) const { const std::vector<const T*> MultiInput(const std::string& name) const {
auto names = op_.Inputs(name); auto names = op_.Inputs(name);
...@@ -286,11 +307,16 @@ class ExecutionContext { ...@@ -286,11 +307,16 @@ class ExecutionContext {
const OperatorBase& op_; const OperatorBase& op_;
const Scope& scope_; const Scope& scope_;
const platform::DeviceContext& device_context_; const platform::DeviceContext& device_context_;
const RuntimeContext& ctx_;
}; };
template <> template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const; const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const;
template <> template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>( const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const; const std::string& name) const;
...@@ -298,6 +324,9 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>( ...@@ -298,6 +324,9 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
template <> template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const; Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const;
template <> template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>( std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const; const std::string& name) const;
...@@ -350,8 +379,8 @@ class OperatorWithKernel : public OperatorBase { ...@@ -350,8 +379,8 @@ class OperatorWithKernel : public OperatorBase {
OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
} }
void RuntimeInferShape(const Scope& scope, void RuntimeInferShape(const Scope& scope, const platform::Place& place,
const platform::Place& place) const override; const RuntimeContext& ctx) const override;
protected: protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
...@@ -371,9 +400,10 @@ class OperatorWithKernel : public OperatorBase { ...@@ -371,9 +400,10 @@ class OperatorWithKernel : public OperatorBase {
* *
* * transfered_inplace_vars is a output vector. * * transfered_inplace_vars is a output vector.
*/ */
Scope* TryTransferData( Scope* PrepareData(const Scope& scope,
const Scope& scope, const OpKernelType& expected_kernel_key, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const; std::vector<std::string>* transfered_inplace_vars,
RuntimeContext* ctx) const;
void TransferInplaceVarsBack(const Scope& scope, void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars, const std::vector<std::string>& inplace_vars,
......
...@@ -190,7 +190,6 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() { ...@@ -190,7 +190,6 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
ParallelExecutor::ParallelExecutor( ParallelExecutor::ParallelExecutor(
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params,
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, Scope *scope, const std::vector<Scope *> &local_scopes,
...@@ -209,7 +208,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -209,7 +208,7 @@ ParallelExecutor::ParallelExecutor(
"the number of places must be greater than 1."); "the number of places must be greater than 1.");
} }
// Step 1. Bcast the params to devs. // Step 1. Bcast the bcast_vars to devs.
// Create local scopes // Create local scopes
if (local_scopes.empty()) { if (local_scopes.empty()) {
member_->own_local_scope_ = true; member_->own_local_scope_ = true;
...@@ -249,12 +248,12 @@ ParallelExecutor::ParallelExecutor( ...@@ -249,12 +248,12 @@ ParallelExecutor::ParallelExecutor(
// ncclOp // ncclOp
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<ir::Graph> graph = build_strategy.Apply( std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params, main_program, member_->places_, loss_var_name, member_->local_scopes_,
member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); member_->use_cuda_, member_->nccl_ctxs_.get());
#else #else
std::unique_ptr<ir::Graph> graph = std::unique_ptr<ir::Graph> graph =
build_strategy.Apply(main_program, member_->places_, loss_var_name, build_strategy.Apply(main_program, member_->places_, loss_var_name,
params, member_->local_scopes_, member_->use_cuda_); member_->local_scopes_, member_->use_cuda_);
#endif #endif
auto max_memory_size = GetEagerDeletionThreshold(); auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) { if (max_memory_size >= 0) {
......
...@@ -41,7 +41,6 @@ class ParallelExecutor { ...@@ -41,7 +41,6 @@ class ParallelExecutor {
public: public:
explicit ParallelExecutor(const std::vector<platform::Place> &places, explicit ParallelExecutor(const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params,
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const ProgramDesc &main_program,
const std::string &loss_var_name, Scope *scope, const std::string &loss_var_name, Scope *scope,
......
...@@ -22,20 +22,6 @@ limitations under the License. */ ...@@ -22,20 +22,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
DDim InferShapeContext::GetInputDim(const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Input(%s) should hold one element, but now it holds %d",
name, arg_names.size());
return this->GetDim(arg_names[0]);
}
std::vector<DDim> InferShapeContext::GetInputsDim(
const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name);
return GetDims(arg_names);
}
std::vector<DDim> InferShapeContext::GetReaderDims( std::vector<DDim> InferShapeContext::GetReaderDims(
const std::string &name) const { const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name); const std::vector<std::string> &arg_names = Inputs(name);
...@@ -46,26 +32,6 @@ std::vector<DDim> InferShapeContext::GetReaderDims( ...@@ -46,26 +32,6 @@ std::vector<DDim> InferShapeContext::GetReaderDims(
return this->GetRepeatedDims(arg_names[0]); return this->GetRepeatedDims(arg_names[0]);
} }
DDim InferShapeContext::GetInputsElementDim(const std::string &name,
int idx) const {
const std::vector<std::string> &names = Inputs(name);
return this->GetDim(names[idx]);
}
void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
auto &arg_names = Outputs(name);
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
"Output(%s) should hold one element, but now it holds %d",
name, arg_names.size());
SetDim(arg_names[0], dim);
}
void InferShapeContext::SetOutputsDim(const std::string &name,
const std::vector<DDim> &dims) {
auto &names = Outputs(name);
SetDims(names, dims);
}
void InferShapeContext::SetReaderDims(const std::string &name, void InferShapeContext::SetReaderDims(const std::string &name,
const std::vector<DDim> &dims) { const std::vector<DDim> &dims) {
const std::vector<std::string> &arg_names = Outputs(name); const std::vector<std::string> &arg_names = Outputs(name);
...@@ -76,69 +42,5 @@ void InferShapeContext::SetReaderDims(const std::string &name, ...@@ -76,69 +42,5 @@ void InferShapeContext::SetReaderDims(const std::string &name,
return this->SetRepeatedDims(arg_names[0], dims); return this->SetRepeatedDims(arg_names[0], dims);
} }
std::vector<InferShapeVarPtr> InferShapeContext::GetInputVarPtrs(
const std::string &name) {
const std::vector<std::string> arg_names = Inputs(name);
std::vector<InferShapeVarPtr> res;
res.reserve(arg_names.size());
std::transform(
arg_names.begin(), arg_names.end(), std::back_inserter(res),
[this](const std::string &name) { return this->GetVarPtr(name); });
return res;
}
std::vector<InferShapeVarPtr> InferShapeContext::GetOutputVarPtrs(
const std::string &name) {
const std::vector<std::string> arg_names = Outputs(name);
std::vector<InferShapeVarPtr> res;
res.reserve(arg_names.size());
std::transform(
arg_names.begin(), arg_names.end(), std::back_inserter(res),
[this](const std::string &name) { return this->GetVarPtr(name); });
return res;
}
std::vector<DDim> InferShapeContext::GetDims(
const std::vector<std::string> &names) const {
std::vector<DDim> ret;
ret.reserve(names.size());
std::transform(
names.begin(), names.end(), std::back_inserter(ret),
[this](const std::string &name) { return this->GetDim(name); });
return ret;
}
void InferShapeContext::SetDims(const std::vector<std::string> &names,
const std::vector<DDim> &dims) {
size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) {
if (names[i] == framework::kEmptyVarName) {
continue;
}
SetDim(names[i], dims[i]);
}
}
std::vector<proto::VarType::Type> InferShapeContext::GetInputsVarType(
const std::string &name) const {
return GetVarTypes(Inputs(name));
}
std::vector<proto::VarType::Type> InferShapeContext::GetOutputsVarType(
const std::string &name) const {
return GetVarTypes(Outputs(name));
}
std::vector<proto::VarType::Type> InferShapeContext::GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<proto::VarType::Type> retv;
retv.resize(names.size());
std::transform(names.begin(), names.end(), retv.begin(),
std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
std::placeholders::_1));
return retv;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -33,22 +33,23 @@ class InferShapeContext { ...@@ -33,22 +33,23 @@ class InferShapeContext {
virtual bool HasInput(const std::string &name) const = 0; virtual bool HasInput(const std::string &name) const = 0;
virtual bool HasOutput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0;
std::vector<proto::VarType::Type> GetInputsVarType( virtual std::vector<proto::VarType::Type> GetInputsVarType(
const std::string &name) const; const std::string &name) const = 0;
std::vector<proto::VarType::Type> GetOutputsVarType( virtual std::vector<proto::VarType::Type> GetOutputsVarType(
const std::string &name) const; const std::string &name) const = 0;
virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0;
DDim GetInputDim(const std::string &name) const; virtual DDim GetInputDim(const std::string &name) const = 0;
std::vector<DDim> GetInputsDim(const std::string &name) const; virtual std::vector<DDim> GetInputsDim(const std::string &name) const = 0;
std::vector<DDim> GetReaderDims(const std::string &name) const; virtual std::vector<DDim> GetReaderDims(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const;
void SetOutputDim(const std::string &name, const DDim &dim); virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims); virtual void SetOutputsDim(const std::string &name,
void SetReaderDims(const std::string &name, const std::vector<DDim> &dims); const std::vector<DDim> &dims) = 0;
virtual void SetReaderDims(const std::string &name,
const std::vector<DDim> &dims);
virtual AttrReader Attrs() const = 0; virtual AttrReader Attrs() const = 0;
virtual const std::vector<std::string> &Inputs( virtual const std::vector<std::string> &Inputs(
...@@ -67,27 +68,15 @@ class InferShapeContext { ...@@ -67,27 +68,15 @@ class InferShapeContext {
virtual bool IsRuntime() const = 0; virtual bool IsRuntime() const = 0;
std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name); virtual std::vector<InferShapeVarPtr> GetInputVarPtrs(
std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name); const std::string &name) = 0;
virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0; virtual std::vector<InferShapeVarPtr> GetOutputVarPtrs(
const std::string &name) = 0;
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
const std::vector<DDim> &dims);
protected: protected:
virtual DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const DDim &dim) = 0;
virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0; virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0;
virtual void SetRepeatedDims(const std::string &name, virtual void SetRepeatedDims(const std::string &name,
const std::vector<DDim> &dims) = 0; const std::vector<DDim> &dims) = 0;
std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
std::vector<proto::VarType::Type> GetVarTypes(
const std::vector<std::string> &names) const;
virtual proto::VarType::Type GetVarType(const std::string &name) const = 0;
}; };
} // namespace framework } // namespace framework
......
...@@ -28,8 +28,11 @@ class OperatorBase; ...@@ -28,8 +28,11 @@ class OperatorBase;
class OpDesc; class OpDesc;
class InferShapeContext; class InferShapeContext;
class BlockDesc; class BlockDesc;
class Variable;
using VariableNameMap = std::map<std::string, std::vector<std::string>>; using VariableNameMap = std::map<std::string, std::vector<std::string>>;
// TODO(panyx0718): Replace vector with something like gtl::Vector.
using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
// The order should be as same as framework.proto // The order should be as same as framework.proto
using Attribute = using Attribute =
......
...@@ -217,9 +217,6 @@ std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) { ...@@ -217,9 +217,6 @@ std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
VarBase* origin_var = (*input_vars_)[i]; VarBase* origin_var = (*input_vars_)[i];
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
Variable* var = scope->FindVar(outvar); Variable* var = scope->FindVar(outvar);
if (var->IsInitialized()) {
VLOG(3) << "get grad op output var " << outvar;
}
std::string orig_var_name = grad_to_var_->at(outvar); std::string orig_var_name = grad_to_var_->at(outvar);
if (origin_var->var_desc_->Name() != orig_var_name || if (origin_var->var_desc_->Name() != orig_var_name ||
origin_var->stop_gradient_) { origin_var->stop_gradient_) {
......
...@@ -64,9 +64,7 @@ endif() ...@@ -64,9 +64,7 @@ endif()
set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
if (NOT WIN32) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
if (WITH_GPU) if (WITH_GPU)
......
...@@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { ...@@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Get(dev_place); auto& dev_ctx = *pool.Get(dev_place);
framework::ExecutionContext ctx(*this, scope, dev_ctx); framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);
const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids"); const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores"); const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
static constexpr char kInputs[] = "inputs";
static constexpr char kParameters[] = "parameters";
static constexpr char kPlaces[] = "places";
static constexpr char kOutputs[] = "outputs";
static constexpr char kParallelScopes[] = "parallel_scopes";
static constexpr char kParallelBlock[] = "sub_block";
static constexpr char kUseNCCL[] = "use_nccl";
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
static void SplitTensorAndMoveTensorToScopes(
const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
const std::vector<platform::Place> &places,
const std::vector<std::string> &names) {
size_t num_sub_scopes = 0;
for (auto &argu : names) {
const auto &tensor =
detail::Ref(scope.FindVar(argu),
"Cannot find variable %s in the parent scope", argu)
.Get<LoDTensor>();
auto lod_tensors = tensor.SplitLoDTensor(places);
for (auto &lod : lod_tensors) {
VLOG(3) << lod.dims();
}
if (num_sub_scopes == 0) {
num_sub_scopes = lod_tensors.size();
} else {
PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size());
}
PADDLE_ENFORCE_NE(num_sub_scopes, 0);
if (sub_scopes->size() == 0) {
sub_scopes->reserve(num_sub_scopes);
for (size_t i = 0; i < num_sub_scopes; ++i) {
sub_scopes->emplace_back(&scope.NewScope());
}
}
for (size_t i = 0; i < lod_tensors.size(); ++i) {
*detail::Ref(sub_scopes->at(i)->Var(argu),
"Cannot find variable in the sub-scope", argu)
.GetMutable<LoDTensor>() = lod_tensors[i];
}
}
}
inline void CopyOrShare(const framework::Variable &src,
const platform::Place &dst_place,
framework::Variable *dst) {
if (src.IsType<LoDTensor>()) {
if (src.Get<LoDTensor>().place() == dst_place) {
dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
} else {
TensorCopy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
}
} else if (src.IsType<SelectedRows>()) {
auto &src_sr = src.Get<SelectedRows>();
auto *dst_sr = dst->GetMutable<SelectedRows>();
dst_sr->set_height(src_sr.height());
if (src_sr.value().place() == dst_place) {
dst_sr->mutable_value()->ShareDataWith(src_sr.value());
dst_sr->set_rows(src_sr.rows());
} else {
TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value());
}
} else {
PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
}
}
void WaitOnPlace(const platform::Place place) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
dev_ctx.Wait();
}
void WaitOnPlaces(const std::vector<platform::Place> places) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
for (auto &place : places) {
auto &dev_ctx = *pool.Get(place);
dev_ctx.Wait();
}
}
class ParallelDoOp : public framework::OperatorBase {
public:
ParallelDoOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
auto *program = block->Program();
auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
auto &sub_scopes = *scope.FindVar(Output(kParallelScopes))
->GetMutable<std::vector<framework::Scope *>>();
// split input
SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places,
Inputs(kInputs));
// copy parameter
for (auto &param : Inputs(kParameters)) {
PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
"Only support parameter type as LoDTensor");
auto &src = scope.FindVar(param)->Get<LoDTensor>();
auto *sub_scope0 = sub_scopes[0];
auto *dst0 = sub_scope0->Var(param)->GetMutable<LoDTensor>();
dst0->ShareDataWith(src);
for (size_t i = 1; i < sub_scopes.size(); ++i) {
auto &place = places[i];
auto *sub_scope = sub_scopes[i];
auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
framework::TensorCopy(src, place, dst);
}
}
WaitOnPlaces(places);
std::vector<std::future<void>> workers;
workers.reserve(places.size());
for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) {
auto &place = places[place_idx];
auto *cur_scope = sub_scopes[place_idx];
workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
}
for (auto &worker : workers) {
worker.wait();
}
WaitOnPlaces(places);
// merge output
for (auto &o_name : Outputs(kOutputs)) {
std::vector<const framework::LoDTensor *> lod_tensors;
lod_tensors.reserve(sub_scopes.size());
for (auto *sub_scope : sub_scopes) {
lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get<LoDTensor>());
}
auto *lod_tensor_to_be_merged =
scope.FindVar(o_name)->GetMutable<LoDTensor>();
lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace());
}
WaitOnPlaces(places);
}
};
class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(kInputs, "").AsDuplicable();
AddInput(kParameters, "").AsDuplicable();
AddInput(kPlaces, "");
AddOutput(kOutputs, "").AsDuplicable();
AddOutput(kParallelScopes, "");
AddAttr<framework::BlockDesc *>(kParallelBlock, "");
AddAttr<bool>(kUseNCCL, "true if we use nccl on backward")
.SetDefault(false);
AddComment(R"DOC(
ParallelDo Operator.
)DOC");
}
};
class ParallelDoGradOp : public framework::OperatorBase {
public:
ParallelDoGradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
auto *program = block->Program();
auto &sub_scopes = scope.FindVar(Input(kParallelScopes))
->Get<std::vector<framework::Scope *>>();
auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
// feed output@grad
SplitTensorAndMoveTensorToScopes(
scope, const_cast<std::vector<framework::Scope *> *>(&sub_scopes),
places, Inputs(framework::GradVarName(kOutputs)));
WaitOnPlaces(places);
// exe run
std::vector<std::future<void>> workers;
for (size_t i = 0; i < sub_scopes.size(); ++i) {
auto &place = places[i];
auto *cur_scope = sub_scopes[i];
// execute
workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
}
for (auto &worker : workers) {
worker.wait();
}
WaitOnPlaces(places);
// NCCL allreduce op will be added by backward,
// so no need to explicitly accumulate grad
if (!(Attr<bool>(kUseNCCL))) {
AccumulateGrad(scope, place, sub_scopes, places);
} else {
for (auto &place : places) {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"NCCL only supports cuda place");
}
}
for (auto &s : Outputs(framework::GradVarName(kParameters))) {
if (s == framework::kEmptyVarName) {
continue;
}
VLOG(3) << "Moving " << s;
CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
}
WaitOnPlaces(places);
}
void AccumulateGrad(const framework::Scope &scope,
const platform::Place &place,
const std::vector<framework::Scope *> &sub_scopes,
const platform::PlaceList &places) const {
for (auto &s : Outputs(framework::GradVarName(kParameters))) {
if (s == framework::kEmptyVarName) {
continue;
}
VLOG(3) << "Accumulating " << s;
if (s == framework::kEmptyVarName) continue;
std::string tmp_name;
auto *tmp = sub_scopes[0]->Var(&tmp_name);
for (size_t i = 1; i < sub_scopes.size(); ++i) {
CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
WaitOnPlaces(places);
auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
framework::AttributeMap{{"use_mkldnn", {false}}});
VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
sum_op->Run(*sub_scopes[0], places[0]);
WaitOnPlace(places[0]);
}
CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
}
WaitOnPlaces(places);
}
};
std::ostream &operator<<(std::ostream &sout,
const std::vector<std::string> &strs) {
std::copy(strs.begin(), strs.end(),
std::ostream_iterator<std::string>(sout, ","));
return sout;
}
class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
virtual std::unique_ptr<framework::OpDesc> Apply() const {
auto *grad = new framework::OpDesc();
grad->SetType("parallel_do_grad");
for (auto &input_param : this->InputNames()) {
VLOG(3) << input_param;
grad->SetInput(input_param, this->Input(input_param));
if (input_param != kPlaces) {
grad->SetOutput(framework::GradVarName(input_param),
this->InputGrad(input_param, false));
}
}
auto *g_block = this->grad_block_[0];
// All variable name that needed by gradient operators
std::unordered_set<std::string> all_inputs_in_grad_blocks;
for (size_t i = 0; i < g_block->OpSize(); ++i) {
auto *op = g_block->Op(i);
for (auto &var_name : op->InputArgumentNames()) {
all_inputs_in_grad_blocks.insert(var_name);
}
}
for (auto &output_param : this->OutputNames()) {
if (output_param == kParallelScopes) {
grad->SetInput(output_param, this->Output(output_param));
grad->SetInput(framework::GradVarName(output_param),
this->Output(output_param));
} else {
grad->SetInput(output_param, this->Output(output_param));
std::vector<std::string> og_names;
for (auto &og_name : this->OutputGrad(output_param)) {
if (all_inputs_in_grad_blocks.count(og_name) != 0) {
// there are some gradient operators who need the OG. So make this
// OG as an input of parallel.do
og_names.push_back(og_name);
}
// else, there is no operator who need the OG. Do not use this OG as
// an input
}
grad->SetInput(framework::GradVarName(output_param), og_names);
}
}
grad->SetInput("Communicator", {"nccl_com__do_not_change_"});
grad->SetAttrMap(this->Attrs());
grad->SetBlockAttr(kParallelBlock, grad_block_[0]);
return std::unique_ptr<framework::OpDesc>(grad);
}
};
class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInputs(kParameters));
PADDLE_ENFORCE(ctx->HasInputs(kInputs));
PADDLE_ENFORCE(ctx->HasInputs(kOutputs));
ctx->SetOutputsDim(framework::GradVarName(kParameters),
ctx->GetInputsDim(kParameters));
auto i_dims = ctx->GetInputsDim(kInputs);
auto ig_names = ctx->Outputs(framework::GradVarName(kInputs));
for (size_t i = 0; i < ig_names.size(); ++i) {
auto &ig_name = ig_names[i];
if (ig_name == framework::kEmptyVarName) {
continue;
}
ctx->SetDims({ig_name}, {i_dims[i]});
}
auto p_dims = ctx->GetInputsDim(kParameters);
auto pg_names = ctx->Outputs(framework::GradVarName(kParameters));
for (size_t i = 0; i < pg_names.size(); ++i) {
auto &pg_name = pg_names[i];
if (pg_name == framework::kEmptyVarName) {
continue;
}
ctx->SetDims({pg_name}, {p_dims[i]});
}
}
};
class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
framework::BlockDesc *sub_block =
boost::get<framework::BlockDesc *>(op_desc.GetAttr(kParallelBlock));
for (auto &out_vars : op_desc.Outputs()) {
for (auto &out_var : out_vars.second) {
auto &var = block->FindRecursiveOrCreateVar(out_var);
auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var);
if (sub_var.GetType() != var.GetType()) {
var.SetType(sub_var.GetType());
}
}
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp,
paddle::operators::ParallelDoOpProtoMaker,
paddle::operators::ParallelDoGradOpDescMaker);
REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp,
paddle::operators::ParallelDoGradOpShapeInference,
paddle::operators::ParallelDoGradOpVarTypeInference);
...@@ -399,26 +399,41 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { ...@@ -399,26 +399,41 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
ctx->HasInputs(kOutputs); ctx->HasInputs(kOutputs);
ctx->HasInputs(framework::GradVarName(kOutputs)); ctx->HasInputs(framework::GradVarName(kOutputs));
auto p_names = ctx->Inputs(kX);
auto pg_ig_names = ctx->Outputs(kXGRAD); auto pg_ig_names = ctx->Outputs(kXGRAD);
auto var_types = ctx->GetInputsVarType(kX); std::vector<framework::InferShapeVarPtr> in_var_ptrs =
std::vector<std::string> names_to_set; ctx->GetInputVarPtrs(kX);
std::vector<framework::DDim> dims_to_set; std::vector<framework::InferShapeVarPtr> out_var_ptrs =
for (size_t i = 0; i < p_names.size(); ++i) { ctx->GetOutputVarPtrs(kXGRAD);
PADDLE_ENFORCE(in_var_ptrs.size() == out_var_ptrs.size());
for (size_t i = 0; i < in_var_ptrs.size(); ++i) {
if (pg_ig_names[i] == framework::kEmptyVarName) { if (pg_ig_names[i] == framework::kEmptyVarName) {
continue; continue;
} }
auto dims = ctx->GetInputsElementDim(kX, i); if (ctx->IsRuntime()) {
if (var_types[i] == framework::proto::VarType::LOD_TENSOR) { framework::Variable *in_var =
names_to_set.push_back(pg_ig_names[i]); boost::get<framework::Variable *>(in_var_ptrs[i]);
dims_to_set.push_back(dims); framework::Variable *out_var =
} else if (var_types[i] == framework::proto::VarType::LOD_TENSOR_ARRAY) { boost::get<framework::Variable *>(out_var_ptrs[i]);
// not sure how to set the dim of LOD_TENSOR_ARRAY
names_to_set.push_back(pg_ig_names[i]); auto type = framework::ToVarType(in_var->Type());
dims_to_set.push_back(dims); if (type == framework::proto::VarType::LOD_TENSOR) {
out_var->GetMutable<LoDTensor>()->Resize(
in_var->Get<framework::LoDTensor>().dims());
} else if (type == framework::proto::VarType::SELECTED_ROWS) {
out_var->GetMutable<framework::SelectedRows>()->set_height(
in_var->Get<framework::SelectedRows>().GetCompleteDims()[0]);
} else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
PADDLE_THROW("WhileGradOp doesn't support type %d",
static_cast<int>(type));
}
} else {
framework::VarDesc *in_var =
boost::get<framework::VarDesc *>(in_var_ptrs[i]);
boost::get<framework::VarDesc *>(out_var_ptrs[i])
->SetShape(in_var->GetShape());
} }
} }
ctx->SetDims(names_to_set, dims_to_set);
} }
}; };
......
...@@ -155,11 +155,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -155,11 +155,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format = auto chosen_memory_format =
platform::data_format_to_memory_format(data_format); platform::data_format_to_memory_format(data_format);
if (is_conv3d) { weights_format = mkldnn::memory::format::any;
chosen_memory_format = // Check the format for user's special output
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); if (chosen_memory_format != mkldnn::memory::format::any) {
if (is_conv3d) {
chosen_memory_format =
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
}
} }
weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
...@@ -435,11 +438,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -435,11 +438,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format = auto chosen_memory_format =
platform::data_format_to_memory_format(data_format); platform::data_format_to_memory_format(data_format);
if (is_conv3d) { weights_format = mkldnn::memory::format::any;
chosen_memory_format = // Check the format for user's special output
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); if (chosen_memory_format != mkldnn::memory::format::any) {
if (is_conv3d) {
chosen_memory_format =
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
}
} }
weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <nccl.h> #include <nccl.h>
#endif #endif
#include <sys/time.h> #include <sys/time.h>
#include <limits>
#include <thread> // NOLINT #include <thread> // NOLINT
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
...@@ -31,7 +32,12 @@ namespace distributed { ...@@ -31,7 +32,12 @@ namespace distributed {
class IOBufWriter { class IOBufWriter {
public: public:
static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) { static void Append(const std::string& varname, butil::IOBuf* iobuf, int k,
const char* v, int64_t vlen) {
if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
}
iobuf->append(reinterpret_cast<char*>(&k), 4); iobuf->append(reinterpret_cast<char*>(&k), 4);
iobuf->append(reinterpret_cast<char*>(&vlen), 8); iobuf->append(reinterpret_cast<char*>(&vlen), 8);
iobuf->append(v, vlen); iobuf->append(v, vlen);
...@@ -87,6 +93,10 @@ class IOBufWriter { ...@@ -87,6 +93,10 @@ class IOBufWriter {
int k, const char* v, int64_t vlen, int k, const char* v, int64_t vlen,
bool in_cuda_pinned, void (*destroy)(void*), bool in_cuda_pinned, void (*destroy)(void*),
void* user_data) { void* user_data) {
if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
}
#ifdef PADDLE_WITH_BRPC_RDMA #ifdef PADDLE_WITH_BRPC_RDMA
IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned,
destroy, user_data); destroy, user_data);
...@@ -134,7 +144,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, ...@@ -134,7 +144,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
request->set_type(::sendrecv::NCCL_ID); request->set_type(::sendrecv::NCCL_ID);
const ncclUniqueId& uid = var->Get<ncclUniqueId>(); const ncclUniqueId& uid = var->Get<ncclUniqueId>();
// TODO(gongwb): use append_zero to avoid data copy. // TODO(gongwb): use append_zero to avoid data copy.
IOBufWriter::Append(iobuf, IOBufWriter::Append(name, iobuf,
sendrecv::VariableMessage::kSerializedFieldNumber, sendrecv::VariableMessage::kSerializedFieldNumber,
uid.internal, NCCL_UNIQUE_ID_BYTES); uid.internal, NCCL_UNIQUE_ID_BYTES);
return; return;
...@@ -149,7 +159,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, ...@@ -149,7 +159,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
// FIXME(gongwb): it seems that can use zero copy. // FIXME(gongwb): it seems that can use zero copy.
if (var_is_not_stable) { if (var_is_not_stable) {
IOBufWriter::Append( IOBufWriter::Append(
iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
static_cast<const char*>(payload->ptr()), payload->memory_size()); static_cast<const char*>(payload->ptr()), payload->memory_size());
} else { } else {
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
...@@ -171,10 +181,11 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, ...@@ -171,10 +181,11 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
if (var->IsType<framework::SelectedRows>()) { if (var->IsType<framework::SelectedRows>()) {
auto* slr = var->GetMutable<framework::SelectedRows>(); auto* slr = var->GetMutable<framework::SelectedRows>();
size_t rows_memory_size = PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
slr->rows().size() * framework::SizeOfType(typeid(int64_t)); size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, IOBufWriter::Append(name, iobuf,
::sendrecv::VariableMessage::kRowsFieldNumber,
reinterpret_cast<const char*>(slr->rows().data()), reinterpret_cast<const char*>(slr->rows().data()),
static_cast<int64_t>(rows_memory_size)); static_cast<int64_t>(rows_memory_size));
} }
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdlib.h>
#include <limits> #include <limits>
#include "glog/logging.h" // For VLOG #include "glog/logging.h" // For VLOG
...@@ -420,7 +421,15 @@ void GRPCClient::Proceed() { ...@@ -420,7 +421,15 @@ void GRPCClient::Proceed() {
sync_cond_.notify_all(); sync_cond_.notify_all();
} }
} }
VLOG(3) << "GRPCClient Proceed end";
// Last log message
// Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a
// static Mutex log_mutex is used for synchronization, which might have been
// destructed at this moment.
if (FLAGS_v >= 3) {
std::string msg("GRPCClient Proceed end");
fwrite(msg.c_str(), msg.length(), 1, stdout);
}
} }
std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) { std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <nccl.h> #include <nccl.h>
#endif #endif
#include <limits>
#include <thread> // NOLINT #include <thread> // NOLINT
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
...@@ -102,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -102,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
payload->memory_size()); payload->memory_size());
if (payload->memory_size() >= std::numeric_limits<int>::max()) {
LOG(FATAL) << "AppendZeroCopy varname:" << name
<< ", vlen:" << payload->memory_size();
}
// steal reference of tensor data // steal reference of tensor data
::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows
int num_slices = 2; // only SelectedRows have rows buffer int num_slices = 2; // only SelectedRows have rows buffer
...@@ -115,7 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -115,7 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
if (var->IsType<framework::SelectedRows>()) { if (var->IsType<framework::SelectedRows>()) {
auto* slr = var->GetMutable<framework::SelectedRows>(); auto* slr = var->GetMutable<framework::SelectedRows>();
ProtoEncodeHelper e2(static_cast<char*>(buf), 128); ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
slices[2] = ::grpc::Slice(e2.size()); slices[2] = ::grpc::Slice(e2.size());
memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size()); memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <typeindex>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
...@@ -23,9 +24,8 @@ limitations under the License. */ ...@@ -23,9 +24,8 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -83,6 +83,11 @@ inline framework::proto::VarType::Type ToVarType( ...@@ -83,6 +83,11 @@ inline framework::proto::VarType::Type ToVarType(
} }
} }
template <template <typename> class T, typename Elem>
std::string VectorElemName(const T<Elem>& arg) {
return typeid(Elem).name();
}
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -118,7 +118,7 @@ bool VariableResponse::CopyLodTensorData( ...@@ -118,7 +118,7 @@ bool VariableResponse::CopyLodTensorData(
VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
<< ", Buffer Size = " << length; << ", Buffer Size = " << length;
PADDLE_ENFORCE_EQ(tensor->memory_size(), length); PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast<unsigned int>(length));
return ReadRaw(input, ctx, tensor->place(), tensor_data, length); return ReadRaw(input, ctx, tensor->place(), tensor_data, length);
} }
......
...@@ -26,6 +26,13 @@ class MergeSelectedRowsOp : public framework::OperatorWithKernel { ...@@ -26,6 +26,13 @@ class MergeSelectedRowsOp : public framework::OperatorWithKernel {
"Input(X) of MergeSelectedRowsOp should not be null."); "Input(X) of MergeSelectedRowsOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of MergeSelectedRowsOp should not be null."); "Output(Out) of MergeSelectedRowsOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X").front(),
framework::proto::VarType::SELECTED_ROWS,
"Input X only should be SelectedRows.");
PADDLE_ENFORCE_EQ(ctx->GetOutputsVarType("Out").front(),
framework::proto::VarType::SELECTED_ROWS,
"Output Y only should be SelectedRows.");
ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareDim("X", /*->*/ "Out");
} }
}; };
...@@ -43,7 +50,28 @@ class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -43,7 +50,28 @@ class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
R"DOC( R"DOC(
MergeSelectedRows Operator. MergeSelectedRows Operator.
MergeSelectedRows is used to merge the duplicated rows of the input. MergeSelectedRows is used to merge the duplicated rows of the input. The
output's row has no duplicated, and it's order is incremental.
Example:
Input:
X.rows is [0, 5, 5, 4, 19]
X.height is 20
X.value is:
[[1, 1]
[2, 2]
[3, 3]
[4, 4]
[6, 6]]
Output:
Out.row is [0, 4, 5, 19]
Out.height is 20
Out.value is:
[[1, 1]
[4, 4]
[5, 5]
[6, 6]]
)DOC"); )DOC");
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the list of the ngraph operators for Paddle.
*
* ATTENTION: It requires some C++11 features, for lower version C++ or C, we
* might release another API.
*/
#pragma once
#include "ops/binary_unnary_op.h"
#include "ops/mul_op.h"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
template <typename T>
static void BuildBinaryNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
auto out = std::make_shared<T>(x, y);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
template <typename T>
static void BuildUnaryNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto out = std::make_shared<T>(input);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
static void BuildMulNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
auto x_reshape = x;
auto y_reshape = y;
if (x->get_shape().size() > 2) {
auto x_2d = paddle::platform::FlattenTo2d(x->get_shape(), x_num_col_dims);
x_reshape = paddle::platform::NgReshaper(x, x_2d);
}
if (y->get_shape().size() > 2) {
auto y_2d = paddle::platform::FlattenTo2d(y->get_shape(), y_num_col_dims);
y_reshape = paddle::platform::NgReshaper(y, y_2d);
}
std::shared_ptr<ngraph::Node> out =
std::make_shared<ngraph::op::Dot>(x_reshape, y_reshape);
auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
if (dummy_out && dummy_out->get_shape() != out->get_shape()) {
out = paddle::platform::NgReshaper(out, dummy_out->get_shape());
}
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
static void BuildMulGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
bool is_dx = paddle::platform::HasOutput(op, "X@GRAD") ? true : false;
bool is_dy = paddle::platform::HasOutput(op, "Y@GRAD") ? true : false;
auto x_shape = x->get_shape();
auto y_shape = y->get_shape();
auto x_reshape = x;
auto y_reshape = y;
if (x_shape.size() > 2) {
auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_num_col_dims);
x_reshape = paddle::platform::NgReshaper(x, x_2d_shape);
}
if (y_shape.size() > 2) {
auto y_2d_shape = paddle::platform::FlattenTo2d(y_shape, y_num_col_dims);
y_reshape = paddle::platform::NgReshaper(y, y_2d_shape);
}
auto x_reshape_shape = x_reshape->get_shape();
std::reverse(x_reshape_shape.begin(), x_reshape_shape.end());
auto x_transpose = std::make_shared<ngraph::op::Reshape>(
x_reshape, ngraph::AxisVector{1, 0}, x_reshape_shape);
auto y_reshape_shape = y_reshape->get_shape();
std::reverse(y_reshape_shape.begin(), y_reshape_shape.end());
auto y_transpose = std::make_shared<ngraph::op::Reshape>(
y_reshape, ngraph::AxisVector{1, 0}, y_reshape_shape);
if (is_dx) {
if (dout->get_shape().size() > 2) {
auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
}
auto dx = std::make_shared<ngraph::op::Dot>(dout, y_transpose);
if (dx->get_shape() == x_shape) {
paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
} else {
auto dx_reshape = paddle::platform::NgReshaper(dx, x_shape);
paddle::platform::SetOutputNode(op, "X@GRAD", dx_reshape, ngb_node_map);
}
}
if (is_dy) {
if (dout->get_shape().size() > 2) {
auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
}
auto dy = std::make_shared<ngraph::op::Dot>(x_transpose, dout);
if (dy->get_shape() == y_shape) {
paddle::platform::SetOutputNode(op, "Y@GRAD", dy, ngb_node_map);
} else {
auto dy_reshape = paddle::platform::NgReshaper(dy, y_shape);
paddle::platform::SetOutputNode(op, "Y@GRAD", dy_reshape, ngb_node_map);
}
}
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
...@@ -109,6 +109,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -109,6 +109,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, default 1.0e-8) " "(float, default 1.0e-8) "
"Constant for numerical stability") "Constant for numerical stability")
.SetDefault(1.0e-8f); .SetDefault(1.0e-8f);
AddAttr<bool>(
"lazy_mode",
"(bool, default false) "
"only update the parameter that has gradient in sparse update")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Adam Optimizer. Adam Optimizer.
......
...@@ -177,12 +177,13 @@ struct SparseAdamFunctor { ...@@ -177,12 +177,13 @@ struct SparseAdamFunctor {
const int64_t* rows_; const int64_t* rows_;
int64_t row_numel_; int64_t row_numel_;
int64_t row_count_; int64_t row_count_;
bool lazy_mode_;
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out, const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad, const T* mom2, T* mom2_out, const T* lr, const T* grad,
const T* param, T* param_out, const int64_t* rows, const T* param, T* param_out, const int64_t* rows,
int64_t row_numel, int64_t row_count) int64_t row_numel, int64_t row_count, bool lazy_mode)
: beta1_(beta1), : beta1_(beta1),
beta2_(beta2), beta2_(beta2),
epsilon_(epsilon), epsilon_(epsilon),
...@@ -198,13 +199,10 @@ struct SparseAdamFunctor { ...@@ -198,13 +199,10 @@ struct SparseAdamFunctor {
param_out_(param_out), param_out_(param_out),
rows_(rows), rows_(rows),
row_numel_(row_numel), row_numel_(row_numel),
row_count_(row_count) {} row_count_(row_count),
lazy_mode_(lazy_mode) {}
inline HOSTDEVICE void operator()(size_t i) const {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
inline HOSTDEVICE void adam_update(size_t i, T g) const {
// The following code is the same as dense // The following code is the same as dense
T mom1 = moment1_[i]; T mom1 = moment1_[i];
T mom2 = moment2_[i]; T mom2 = moment2_[i];
...@@ -225,6 +223,17 @@ struct SparseAdamFunctor { ...@@ -225,6 +223,17 @@ struct SparseAdamFunctor {
moment2_out_[i] = mom2; moment2_out_[i] = mom2;
param_out_[i] = p; param_out_[i] = p;
} }
inline HOSTDEVICE void operator()(size_t i) const {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
if (lazy_mode_ && row_idx < 0) {
return;
} else {
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
adam_update(i, g);
}
}
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -240,6 +249,7 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -240,6 +249,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
using paddle::framework::LoDTensor; using paddle::framework::LoDTensor;
using paddle::operators::detail::Ref; using paddle::operators::detail::Ref;
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1")); T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2")); T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon")); T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
...@@ -351,11 +361,23 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -351,11 +361,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
mom2_out.template mutable_data<T>(ctx.GetPlace()), mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad_data, param.template data<T>(), lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel, param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size()); grad_merge.rows().size(), lazy_mode);
platform::ForRange<DeviceContext> for_range( VLOG(3) << "lazy_mode :" << lazy_mode;
static_cast<const DeviceContext&>(ctx.device_context()), if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) {
param.numel()); size_t row_count = grad_merge.rows().size();
for_range(functor); std::vector<int64_t> cpu_rows(grad_merge.rows());
for (size_t row_index = 0; row_index < row_count; ++row_index) {
for (size_t offset = 0; offset < row_numel; ++offset) {
size_t i = cpu_rows[row_index] * row_numel + offset;
functor.adam_update(i, grad_data[row_index * row_numel + offset]);
}
}
} else {
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
}
} else { } else {
PADDLE_THROW("Variable type not supported by adam_op"); PADDLE_THROW("Variable type not supported by adam_op");
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using framework::DataLayout;
template <typename T>
class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
const bool is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE(
is_test == true,
"TransposeMKLDNN works only for inference!. Set is_test = True");
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
int ndims = axis.size();
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
const T* input_data = input->data<T>();
if (ndims == 1) {
output->ShareDataWith(*input);
return;
}
std::vector<int> nchw_tz = paddle::framework::vectorize2int(input->dims());
const std::string key = platform::TransposeMKLDNNHandler::GetHash(
nchw_tz, axis, ctx.op().Output("Out"));
platform::TransposeMKLDNNHandler handler(nchw_tz, axis, dev_ctx,
mkldnn_engine, key);
auto transpose_src_memory_p = handler.AcquireSrcMemory(
input->format(), platform::to_void_cast<T>(input_data));
auto transpose_dst_memory_p =
handler.AcquireDstMemory(output, ctx.GetPlace());
auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
transpose_src_memory_p);
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*transpose_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
ops::TransposeMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
ops::TransposeMKLDNNOpKernel<float>);
...@@ -16,6 +16,10 @@ limitations under the License. */ ...@@ -16,6 +16,10 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel { ...@@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel {
} }
ctx->SetOutputDim("Out", out_dims); ctx->SetOutputDim("Out", out_dims);
} }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace(), layout_, library_);
}
}; };
class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput( AddInput(
"X", "X",
"(Tensor) The input tensor, tensors with rank up to 6 are supported."); "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
...@@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
"(vector<int>) A list of values, and the size of the list should be " "(vector<int>) A list of values, and the size of the list should be "
"the same with the input tensor rank. This operator permutes the input " "the same with the input tensor rank. This operator permutes the input "
"tensor's axes according to the values given."); "tensor's axes according to the values given.");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
AddComment(R"DOC( AddComment(R"DOC(
Transpose Operator. Transpose Operator.
...@@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp { ...@@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(), framework::LibraryType library_{framework::LibraryType::kPlain};
ctx.device_context()); std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace(), layout_, library_);
} }
}; };
......
...@@ -16,9 +16,7 @@ if (CUPTI_FOUND) ...@@ -16,9 +16,7 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc) list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND) endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
if (NOT WIN32)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
endif(NOT WIN32)
if (WITH_MKLML) if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif() endif()
......
...@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cudnn_func = decltype(&::__name); \ using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \ std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
......
...@@ -201,6 +201,8 @@ void* GetCurandDsoHandle() { ...@@ -201,6 +201,8 @@ void* GetCurandDsoHandle() {
void* GetWarpCTCDsoHandle() { void* GetWarpCTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib"); return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll");
#else #else
return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
#endif #endif
......
...@@ -18,6 +18,12 @@ namespace paddle { ...@@ -18,6 +18,12 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
#ifndef _WIN32
#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
#else
#define DECLARE_TYPE(__name, ...) decltype(auto)
#endif
void* GetCublasDsoHandle(); void* GetCublasDsoHandle();
void* GetCUDNNDsoHandle(); void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle(); void* GetCUPTIDsoHandle();
......
...@@ -34,7 +34,7 @@ extern void* mklml_dso_handle; ...@@ -34,7 +34,7 @@ extern void* mklml_dso_handle;
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ #define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using mklmlFunc = decltype(&::__name); \ using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \ std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
......
...@@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle; ...@@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle;
#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using tensorrt_func = decltype(__name(args...)) (*)(Args...); \ using tensorrt_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(tensorrt_dso_flag, []() { \ std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = \ tensorrt_dso_handle = \
......
...@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle; ...@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle;
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ #define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using warpctcFunc = decltype(&::__name); \ using warpctcFunc = decltype(&::__name); \
std::call_once(warpctc_dso_flag, []() { \ std::call_once(warpctc_dso_flag, []() { \
warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
......
...@@ -197,6 +197,130 @@ class MKLDNNHandler { ...@@ -197,6 +197,130 @@ class MKLDNNHandler {
bool is_reusing_; bool is_reusing_;
}; };
class TransposeMKLDNNHandler : public MKLDNNHandler {
public:
TransposeMKLDNNHandler(std::vector<int>& dims, std::vector<int>& axis,
const platform::MKLDNNDeviceContext& dev_ctx,
mkldnn::engine engine, const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key),
dims_(dims),
axis_(axis),
logical_axis_(dims.size(), 0) {}
std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
const mkldnn::memory::format& fmt, void* ptr) {
auto local_key = key_ + "@user_src_mem_p";
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
" find mem primitive in device context");
if (mem_p == nullptr) {
// Make memory descriptor using input format, unless it
// cannot be trusted (nchw) then make up memory fmt manually
for (size_t i = 0; i < logical_axis_.size(); ++i) {
logical_axis_[i] = i;
}
auto src_md = fmt != mkldnn::memory::format::nchw
? platform::MKLDNNMemDesc(
dims_, platform::MKLDNNGetDataType<float>(), fmt)
: Axis2MemoryDesc(dims_, logical_axis_);
mem_p = std::make_shared<mkldnn::memory>(
mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_ = true;
}
return mem_p;
}
std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output,
platform::Place place) {
auto local_key = key_ + "@user_dst_mem_p";
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
" find mem primitive in device context");
if (mem_p == nullptr) {
auto dst_mdp = mkldnn::memory::primitive_desc{
Axis2MemoryDesc(dims_, axis_), engine_};
auto dst_data = output->mutable_data<float>(
place, paddle::memory::Allocator::kDefault, dst_mdp.get_size());
mem_p = std::make_shared<mkldnn::memory>(dst_mdp, dst_data);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
auto dst_data = output->mutable_data<float>(place);
mem_p->set_data_handle(dst_data);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_ = true;
}
return mem_p;
}
std::shared_ptr<mkldnn::reorder> AcquireTranspose(
std::shared_ptr<mkldnn::memory> dst_memory_p,
std::shared_ptr<mkldnn::memory> src_memory_p) {
auto prim_key = key_ + "@transpose_p";
auto transpose_p =
std::static_pointer_cast<mkldnn::reorder>(dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE((transpose_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution primitive in device context");
if (transpose_p == nullptr) {
transpose_p =
std::make_shared<mkldnn::reorder>(*(src_memory_p), *(dst_memory_p));
dev_ctx_.SetBlob(prim_key, transpose_p);
} else {
is_reusing_ = true;
}
return transpose_p;
}
static std::string GetHash(std::vector<int>& shape, // NOLINT
std::vector<int>& axis, // NOLINT
const std::string& suffix) {
return dims2str(shape) + dims2str(axis) + suffix;
}
protected:
mkldnn_memory_desc_t Axis2MemoryDesc(std::vector<int>& nchw_tz,
std::vector<int>& axis) {
mkldnn_memory_desc_t mem_fmt;
mem_fmt.primitive_kind = mkldnn_memory;
mem_fmt.ndims = axis.size();
for (unsigned int i = 0; i < nchw_tz.size(); ++i) {
mem_fmt.dims[i] = nchw_tz[i]; // logical dimensions (nchw format,
// regardless physical layout)
}
mem_fmt.data_type = mkldnn_f32;
mem_fmt.format = mkldnn_blocked;
unsigned int total_stride = 1;
for (int i = nchw_tz.size() - 1; i >= 0; --i) {
mem_fmt.layout_desc.blocking.padding_dims[i] =
nchw_tz[i]; // logical dimensions (nchw format, regardless physical
// layout)
mem_fmt.layout_desc.blocking.block_dims[i] = 1;
mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset
mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1;
total_stride *= nchw_tz[axis[i]];
}
mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset
return mem_fmt;
}
private:
std::vector<int> dims_;
std::vector<int> axis_;
std::vector<int> logical_axis_;
};
template <class forward_t, class backward_data_t, class backward_weights_t> template <class forward_t, class backward_data_t, class backward_weights_t>
class ConvMKLDNNTemplateHandler : public MKLDNNHandler { class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
public: public:
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <functional>
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace platform {
static ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
std::multiplies<size_t>());
auto x2 = std::accumulate(std::begin(sh) + num, std::end(sh), 1,
std::multiplies<size_t>());
size_t x1_l = static_cast<size_t>(x1);
size_t x2_l = static_cast<size_t>(x2);
return ngraph::Shape{x1_l, x2_l};
}
static std::shared_ptr<ngraph::Node> NgReshaper(
std::shared_ptr<ngraph::Node> input, ngraph::Shape shape) {
std::vector<size_t> input_order(input->get_shape().size());
std::iota(std::begin(input_order), std::end(input_order), 0);
return std::make_shared<ngraph::op::Reshape>(
input, ngraph::AxisVector(input_order), shape);
}
static std::shared_ptr<ngraph::Node> GetNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm, const paddle::framework::VariableNameMap& var_map,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = var_map.at(prm);
PADDLE_ENFORCE_EQ(var_names.size(), 1,
"op %s prm %s expects one associated var", op->Type(), prm);
if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
return (*ngb_node_map)[var_names[0]];
} else {
return nullptr;
}
}
static std::shared_ptr<ngraph::Node> GetInputNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Inputs(), ngb_node_map);
}
static std::shared_ptr<ngraph::Node> GetOutputNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Outputs(), ngb_node_map);
}
static void SetOutputNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm, std::shared_ptr<ngraph::Node> node,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = op->Outputs().at(prm);
if (var_names.size() == 1) {
(*ngb_node_map)[var_names[0]] = node;
} else if (var_names.size() == 0) {
(*ngb_node_map)[""] = node;
} else {
PADDLE_THROW("prm %s has more than 1 var_names.", prm);
}
}
static bool HasOutput(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm) {
auto& outputs = op->Outputs();
if (outputs.find(prm) == outputs.end()) return false;
return outputs.at(prm).size() > 0;
}
} // namespace platform
} // namespace paddle
#endif
...@@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) { ...@@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) {
static void *dlopen(const char *filename, int flag) { static void *dlopen(const char *filename, int flag) {
std::string file_name(filename); std::string file_name(filename);
file_name.replace(0, file_name.size() - 1, '/', '\\');
HMODULE hModule = LoadLibrary(file_name.c_str()); HMODULE hModule = LoadLibrary(file_name.c_str());
if (!hModule) { if (!hModule) {
throw std::runtime_error(file_name + " not found."); throw std::runtime_error(file_name + " not found.");
......
...@@ -24,8 +24,9 @@ namespace pybind { ...@@ -24,8 +24,9 @@ namespace pybind {
void BindTracer(pybind11::module *m) { void BindTracer(pybind11::module *m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "") pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__", .def("__init__",
[](imperative::Tracer &self, framework::BlockDesc *root_block) { [](imperative::Tracer &self, framework::BlockDesc *root_block,
new (&self) imperative::Tracer(root_block); framework::BlockDesc *startup_block) {
new (&self) imperative::Tracer(root_block, startup_block);
}) })
.def("trace", &imperative::Tracer::Trace) .def("trace", &imperative::Tracer::Trace)
.def("get_scope", &imperative::Tracer::GetScope, .def("get_scope", &imperative::Tracer::GetScope,
......
...@@ -990,7 +990,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -990,7 +990,6 @@ All parameter, weight, gradient are variables in Paddle.
cannot be updated after being finalized.)DOC"); cannot be updated after being finalized.)DOC");
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &,
const std::unordered_set<std::string> &, const ProgramDesc &, const std::unordered_set<std::string> &, const ProgramDesc &,
const std::string &, Scope *, std::vector<Scope *> &, const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t, const ExecutionStrategy &, const BuildStrategy &, size_t,
......
...@@ -509,11 +509,11 @@ function assert_api_spec_approvals() { ...@@ -509,11 +509,11 @@ function assert_api_spec_approvals() {
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
echo "You must have at least 2 approvals for the api change! ${API_FILE}" echo "You must have panyx0718 approval for the api change! ${API_FILE}"
exit 1 exit 1
fi fi
fi fi
done done
...@@ -521,11 +521,11 @@ function assert_api_spec_approvals() { ...@@ -521,11 +521,11 @@ function assert_api_spec_approvals() {
HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true` HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
echo "You must have at least 2 approvals for the const_cast" echo "You must have panyx0718 approval for the const_cast"
exit 1 exit 1
fi fi
fi fi
......
...@@ -102,6 +102,13 @@ def __bootstrap__(): ...@@ -102,6 +102,13 @@ def __bootstrap__():
import sys import sys
import os import os
import platform import platform
if os.name == 'nt':
third_lib_path = os.path.abspath(os.path.dirname(
__file__)) + os.sep + '..' + os.sep + 'libs'
os.environ['path'] += ';' + third_lib_path
sys.path.append(third_lib_path)
from . import core from . import core
in_test = 'unittest' in sys.modules in_test = 'unittest' in sys.modules
...@@ -128,13 +135,12 @@ def __bootstrap__(): ...@@ -128,13 +135,12 @@ def __bootstrap__():
'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
'allocator_strategy', 'reader_queue_speed_test_mode', 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname' 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
if os.name != 'nt': if os.name != 'nt':
read_env_flags.append('warpctc_dir')
read_env_flags.append('cpu_deterministic') read_env_flags.append('cpu_deterministic')
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
......
...@@ -249,69 +249,6 @@ def serialize_op_decs(op_desc): ...@@ -249,69 +249,6 @@ def serialize_op_decs(op_desc):
return proto.__str__() return proto.__str__()
def _callback_lookup_(op):
"""
Only used in _append_backward_ops_
Build and returns a callback function for certain op. For example
parallel_do: AllReduce
:param op:
:return: callback function
"""
if op.type == 'parallel_do' and op.attr('use_nccl'):
all_vars = op.block.vars
param_names = set(op.input('parameters'))
param_names = [
name for name in param_names
if all_vars[name].stop_gradient is False
]
param_grad_names = [n + "@GRAD" for n in param_names]
class ParallelDoCallBack(object):
def __init__(self, param_grad_names, parallel_scopes_name):
self.has_inserted_nccl_init = False
self.param_grad_names = param_grad_names
self.parallel_scopes_name = parallel_scopes_name
def __call__(self, block, context):
if not self.has_inserted_nccl_init:
op_desc = _create_op_desc_(
"ncclInit",
{"parallel_scopes": self.parallel_scopes_name},
{"Communicator": ['nccl_com__do_not_change_']}, {})
block.program.global_block().desc.append_op().copy_from(
op_desc)
self.has_inserted_nccl_init = True
current_op_desc = context["__current_op_desc__"]
for o_param in current_op_desc.output_names():
for o_argu in current_op_desc.output(o_param):
if o_argu in self.param_grad_names:
allreduce_out_name = o_argu + "__nccl_all_reduce__"
op_desc = _create_op_desc_(
"ncclReduce",
{
"X": [o_argu],
"Communicator":
['nccl_com__do_not_change_']
},
{"Out": [allreduce_out_name]},
{"reduction": "ncclSum",
"root": 0}, )
block.desc.append_op().copy_from(op_desc)
op_desc = _create_op_desc_(
"assign", {"X": [allreduce_out_name]},
{"Out": [o_argu]}, {})
block.desc.append_op().copy_from(op_desc)
return ParallelDoCallBack(param_grad_names,
op.output("parallel_scopes"))
else:
return None
def _append_backward_ops_(block, def _append_backward_ops_(block,
ops, ops,
target_block, target_block,
...@@ -349,17 +286,8 @@ def _append_backward_ops_(block, ...@@ -349,17 +286,8 @@ def _append_backward_ops_(block,
sub_block = program.block(op._block_attr_id("sub_block")) sub_block = program.block(op._block_attr_id("sub_block"))
grad_sub_block = program._create_block() grad_sub_block = program._create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx) grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op) _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
if cb is not None: no_grad_dict, grad_to_var, callbacks)
if callbacks is None:
new_callbacks = [cb]
else:
new_callbacks = callbacks + [_callback_lookup_(op)]
_append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
no_grad_dict, grad_to_var, new_callbacks)
else:
_append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
no_grad_dict, grad_to_var, callbacks)
program._rollback() program._rollback()
grad_sub_block_list.append(grad_sub_block.desc) grad_sub_block_list.append(grad_sub_block.desc)
...@@ -424,9 +352,6 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): ...@@ -424,9 +352,6 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
# infer_shape and infer_type # infer_shape and infer_type
op_desc.infer_var_type(block.desc) op_desc.infer_var_type(block.desc)
op_desc.infer_shape(block.desc) op_desc.infer_shape(block.desc)
# ncclInit dones't need to set data_type
if op_desc.type() == 'ncclInit':
continue
for arg in op_desc.output_arg_names(): for arg in op_desc.output_arg_names():
if arg in new_vars: if arg in new_vars:
_infer_var_data_type_(arg, block) _infer_var_data_type_(arg, block)
......
...@@ -22,9 +22,12 @@ from . import op_frequence ...@@ -22,9 +22,12 @@ from . import op_frequence
from .op_frequence import * from .op_frequence import *
from . import quantize from . import quantize
from .quantize import * from .quantize import *
from . import utils
from .utils import *
__all__ = [] __all__ = []
__all__ += decoder.__all__ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__ __all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__ __all__ += op_frequence.__all__
__all__ += quantize.__all__ __all__ += quantize.__all__
__all__ += utils.__all__
...@@ -13,10 +13,11 @@ ...@@ -13,10 +13,11 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
#from . import lookup_table_utils from . import lookup_table_utils
#from .lookup_table_utils import * from .lookup_table_utils import *
from . import hdfs_utils from . import hdfs_utils
from .hdfs_utils import * from .hdfs_utils import *
#__all__ = lookup_table_utils.__all__ __all__ = []
__all__ = hdfs_utils.__all__ __all__ += lookup_table_utils.__all__
__all__ += hdfs_utils.__all__
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
"""HDFS Utils""" """HDFS Utils"""
import os import os
import sys
import subprocess import subprocess
import multiprocessing import multiprocessing
from datetime import datetime from datetime import datetime
...@@ -24,7 +25,7 @@ import errno ...@@ -24,7 +25,7 @@ import errno
import logging import logging
__all__ = ["HDFSClient", "multi_download"] __all__ = ["HDFSClient", "multi_download", "multi_upload"]
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
_logger = logging.getLogger("hdfs_utils") _logger = logging.getLogger("hdfs_utils")
...@@ -93,13 +94,15 @@ class HDFSClient(object): ...@@ -93,13 +94,15 @@ class HDFSClient(object):
def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5): def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
""" """
upload the local file to hdfs upload the local file to hdfs
Args:
hdfs_path: hdfs path, target path Args:
local_path: local file path, source path hdfs_path(str): the hdfs file path
overwrite: will overwrite the original file local_path(str): the local file path
retry_times: max times retry to upload overwrite(bool|None): will overwrite the file on HDFS or not
Returns: retry_times(int|5): retry times
Returns:
True or False True or False
""" """
assert hdfs_path is not None assert hdfs_path is not None
...@@ -109,7 +112,7 @@ class HDFSClient(object): ...@@ -109,7 +112,7 @@ class HDFSClient(object):
_logger.warn( _logger.warn(
"The Local path: {} is dir and I will support it later, return". "The Local path: {} is dir and I will support it later, return".
format(local_path)) format(local_path))
return return False
base = os.path.basename(local_path) base = os.path.basename(local_path)
if not self.is_exist(hdfs_path): if not self.is_exist(hdfs_path):
...@@ -141,14 +144,16 @@ class HDFSClient(object): ...@@ -141,14 +144,16 @@ class HDFSClient(object):
def download(self, hdfs_path, local_path, overwrite=False, unzip=False): def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
""" """
download from hdfs download file from HDFS
Args:
hdfs_path: hdfs path, target path Args:
local_path: local file path, source path hdfs_path(str): the hdfs file path
overwrite: will remove original file and overwrite it. local_path(str): the local file path
unzip: ignore this param overwrite(bool|None): will overwrite the file on HDFS or not
Returns unzip(bool|False): if the download file is compressed by zip, unzip it or not.
True or False
Returns:
True or False
""" """
_logger.info('Downloading %r to %r.', hdfs_path, local_path) _logger.info('Downloading %r to %r.', hdfs_path, local_path)
_logger.info('Download of %s to %r complete.', hdfs_path, local_path) _logger.info('Download of %s to %r complete.', hdfs_path, local_path)
...@@ -188,13 +193,13 @@ class HDFSClient(object): ...@@ -188,13 +193,13 @@ class HDFSClient(object):
def is_exist(self, hdfs_path=None): def is_exist(self, hdfs_path=None):
""" """
whether the remote hdfs path exists? whether the remote HDFS path exists
Args:
hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) Args:
fs_name: The default values are the same as in the job configuration hdfs_path(str): the hdfs file path
fs_ugi: The default values are the same as in the job configuration
Returns: Returns:
True or False True or False
""" """
exist_cmd = ['-test', '-e', hdfs_path] exist_cmd = ['-test', '-e', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd( returncode, output, errors = self.__run_hdfs_cmd(
...@@ -211,13 +216,13 @@ class HDFSClient(object): ...@@ -211,13 +216,13 @@ class HDFSClient(object):
def is_dir(self, hdfs_path=None): def is_dir(self, hdfs_path=None):
""" """
whether the remote hdfs path exists? whether the remote HDFS path is directory
Args:
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) Args:
fs_name: The default values are the same as in the job configuration hdfs_path(str): the hdfs file path
fs_ugi: The default values are the same as in the job configuration
Returns: Returns:
True or False True or False
""" """
if not self.is_exist(hdfs_path): if not self.is_exist(hdfs_path):
...@@ -237,17 +242,17 @@ class HDFSClient(object): ...@@ -237,17 +242,17 @@ class HDFSClient(object):
def delete(self, hdfs_path): def delete(self, hdfs_path):
""" """
Remove a file or directory from HDFS. Remove a file or directory from HDFS.
whether the remote HDFS path exists
Args: Args:
param hdfs_path: HDFS path. hdfs_path: HDFS path.
param recursive: Recursively delete files and directories. By default,
this method will raise an :class:`HdfsError` if trying to delete a
non-empty directory.
Returns: Returns:
True or False
This function returns `True` if the deletion was successful and `False` if This function returns `True` if the deletion was successful and `False` if
no file or directory previously existed at `hdfs_path`. no file or directory previously existed at `hdfs_path`.
""" """
_logger.info('Deleting %r.', hdfs_path) _logger.info('Deleting %r.', hdfs_path)
...@@ -273,16 +278,14 @@ class HDFSClient(object): ...@@ -273,16 +278,14 @@ class HDFSClient(object):
def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False): def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
""" """
Rename a file or folder. Move a file or folder on HDFS.
Args:
:param hdfs_src_path: Source path. Args:
:param hdfs_dst_path: Destination path. If the path already exists and is hdfs_path(str): HDFS path.
a directory, the source will be moved into it. If the path exists and is overwrite(bool|False): If the path already exists and overwrite is False, will return False.
a file, or if a parent destination directory is missing, this method will
raise an :class:`HdfsError`.
Returns: Returns:
This function returns `True` if the rename was successful and `False` if True or False
rename was faild.
""" """
assert hdfs_src_path is not None assert hdfs_src_path is not None
assert hdfs_dst_path is not None assert hdfs_dst_path is not None
...@@ -320,17 +323,20 @@ class HDFSClient(object): ...@@ -320,17 +323,20 @@ class HDFSClient(object):
raise raise
def makedirs(self, hdfs_path): def makedirs(self, hdfs_path):
"""Create a remote directory, recursively if necessary. """
Create a remote directory, recursively if necessary.
Args: Args:
:param hdfs_path: Remote path. Intermediate directories will be created hdfs_path(str): Remote path. Intermediate directories will be created appropriately.
appropriately.
Returns: Returns:
True if make a directories was successful, False when make a directiries was failed. True or False
""" """
_logger.info('Creating directories to %r.', hdfs_path) _logger.info('Creating directories to %r.', hdfs_path)
assert hdfs_path is not None assert hdfs_path is not None
if self.is_exist(hdfs_path): if self.is_exist(hdfs_path):
_logger.error("HDFS path is exist: {}".format(hdfs_path))
return return
mkdirs_commands = ['-mkdir', hdfs_path] mkdirs_commands = ['-mkdir', hdfs_path]
...@@ -346,11 +352,13 @@ class HDFSClient(object): ...@@ -346,11 +352,13 @@ class HDFSClient(object):
def ls(self, hdfs_path): def ls(self, hdfs_path):
""" """
ls a hdfs_path. ls directory contents about HDFS hdfs_path
Args:
:param hdfs_path: hdfs_path will be ls. Args:
hdfs_path(str): Remote HDFS path will be ls.
Returns: Returns:
This function returns a `list` that contaion all files in the hdfs_path. List: a contents list about hdfs_path.
""" """
assert hdfs_path is not None assert hdfs_path is not None
...@@ -378,11 +386,15 @@ class HDFSClient(object): ...@@ -378,11 +386,15 @@ class HDFSClient(object):
def lsr(self, hdfs_path, only_file=True, sort=True): def lsr(self, hdfs_path, only_file=True, sort=True):
""" """
ls a hdfs_path sort by time. list directory contents about HDFS hdfs_path recursively
Args:
:param hdfs_path: hdfs_path will be ls. Args:
hdfs_path(str): Remote HDFS path.
only_file(bool|True): will discard folders.
sort(bool|True): will be sorted by create time.
Returns: Returns:
This function returns a `list` that contaion all files sorted by time in the hdfs_path. List: a contents list about hdfs_path.
""" """
def sort_by_time(v1, v2): def sort_by_time(v1, v2):
...@@ -422,21 +434,106 @@ class HDFSClient(object): ...@@ -422,21 +434,106 @@ class HDFSClient(object):
return ret_lines return ret_lines
def multi_download(client,
hdfs_path,
local_path,
trainer_id,
trainers,
multi_processes=5):
"""
Download files from HDFS using multi process.
Args:
client(HDFSClient): instance of HDFSClient
hdfs_path(str): path on hdfs
local_path(str): path on local
trainer_id(int): current trainer id
trainers(int): all trainers number
multi_processes(int|5): the download data process at the same time, default=5
Returns:
List:
Download files in local folder.
"""
def __subprocess_download(datas):
for data in datas:
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
if re_path == os.curdir:
sub_local_re_path = local_path
else:
sub_local_re_path = os.path.join(local_path, re_path)
client.download(data, sub_local_re_path)
assert isinstance(client, HDFSClient)
client.make_local_dirs(local_path)
_logger.info("Make local dir {} successfully".format(local_path))
all_need_download = client.lsr(hdfs_path, sort=True)
need_download = all_need_download[trainer_id::trainers]
_logger.info("Get {} files From all {} files need to be download from {}".
format(len(need_download), len(all_need_download), hdfs_path))
_logger.info("Start {} multi process to download datas".format(
multi_processes))
procs = []
for i in range(multi_processes):
process_datas = need_download[i::multi_processes]
p = multiprocessing.Process(
target=__subprocess_download, args=(process_datas, ))
procs.append(p)
p.start()
# complete the processes
for proc in procs:
proc.join()
_logger.info("Finish {} multi process to download datas".format(
multi_processes))
local_downloads = []
for data in need_download:
data_name = os.path.basename(data)
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
if re_path == os.curdir:
local_re_path = os.path.join(local_path, data_name)
else:
local_re_path = os.path.join(local_path, re_path, data_name)
local_downloads.append(local_re_path)
return local_downloads
def getfilelist(path):
rlist = []
for dir, folder, file in os.walk(path):
for i in file:
t = os.path.join(dir, i)
rlist.append(t)
for r in rlist:
print(r)
def multi_upload(client, def multi_upload(client,
hdfs_path, hdfs_path,
local_path, local_path,
multi_processes=5, multi_processes=5,
overwrite=False): overwrite=False,
sync=True):
""" """
Upload file to hdfs. Upload files to HDFS using multi process.
Args: Args:
:param overwrite: will overwrite hdfs file or not client(HDFSClient): instance of HDFSClient
:param multi_processes: the upload data process at the same time, default=5 hdfs_path(str): path on hdfs
:param client: instance of HDFSClient local_path(str): path on local
:param hdfs_path: path on hdfs multi_processes(int|5): the upload data process at the same time, default=5
:param local_path: path on local overwrite(bool|False): will overwrite file on HDFS or not
sync(bool|True): upload files sync or not.
Returns: Returns:
None
""" """
def __subprocess_upload(datas): def __subprocess_upload(datas):
...@@ -446,13 +543,6 @@ def multi_upload(client, ...@@ -446,13 +543,6 @@ def multi_upload(client,
client.upload(hdfs_re_path, data, overwrite, retry_times=5) client.upload(hdfs_re_path, data, overwrite, retry_times=5)
def get_local_files(path): def get_local_files(path):
"""
Get all local files
Args:
path: local file path
Returns:
A list that contation all files in the path.
"""
rlist = [] rlist = []
if not os.path.isdir(path): if not os.path.isdir(path):
...@@ -488,71 +578,6 @@ def multi_upload(client, ...@@ -488,71 +578,6 @@ def multi_upload(client,
multi_processes)) multi_processes))
def multi_download(client,
hdfs_path,
local_path,
trainer_id,
trainers,
file_cnt,
multi_processes=5):
"""
multi_download
Args:
:param client: instance of HDFSClient
:param hdfs_path: path on hdfs
:param local_path: path on local
:param trainer_id: current trainer id
:param trainers: all trainers number
:param file_cnt: all file number
:param multi_processes: the download data process at the same time, default=5
:return: None
Returns:
A list that be downloaded.
"""
def __subprocess_download(datas):
for data in datas:
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
local_re_path = os.path.join(local_path, re_path)
client.download(data, local_re_path)
assert isinstance(client, HDFSClient)
client.make_local_dirs(local_path)
_logger.info("Make local dir {} successfully".format(local_path))
all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt]
need_download = all_need_download[trainer_id::trainers]
_logger.info("Get {} files From all {} files need to be download from {}".
format(len(need_download), len(all_need_download), hdfs_path))
_logger.info("Start {} multi process to download datas".format(
multi_processes))
procs = []
for i in range(multi_processes):
process_datas = need_download[i::multi_processes]
p = multiprocessing.Process(
target=__subprocess_download, args=(process_datas, ))
procs.append(p)
p.start()
# complete the processes
for proc in procs:
proc.join()
_logger.info("Finish {} multi process to download datas".format(
multi_processes))
local_downloads = []
for data in need_download:
data_name = os.path.basename(data)
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
local_re_path = os.path.join(local_path, re_path, data_name)
local_downloads.append(local_re_path)
return local_downloads
if __name__ == "__main__": if __name__ == "__main__":
hadoop_home = "/home/client/hadoop-client/hadoop/" hadoop_home = "/home/client/hadoop-client/hadoop/"
......
...@@ -18,14 +18,12 @@ import os ...@@ -18,14 +18,12 @@ import os
import time import time
import logging import logging
import paddle
import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid import io from paddle.fluid import io
from paddle.fluid import Program from paddle.fluid import Program
__all__ = [ __all__ = [
"load_inference_model", "load_persistable_vars", "load_persistables_for_increment", "load_persistables_for_inference",
"convert_dist_to_sparse_program" "convert_dist_to_sparse_program"
] ]
...@@ -80,19 +78,28 @@ def __get_prefetch_op_tuples(main_program): ...@@ -80,19 +78,28 @@ def __get_prefetch_op_tuples(main_program):
return prefetch_op_tuples return prefetch_op_tuples
def convert_dist_to_sparse_program(main_program): def convert_dist_to_sparse_program(program):
if not main_program._distributed_lookup_table: """
WARNING: this function will only be used for distributed training with distributed lookup table.
when we train model with distributed lookup table but want to do the local inference, we can use
this function to convert the train program with distributed lookup table to sparse lookup table.
:param program(Program): the program must be the trainer program, which will be get by the distribute transpiler.
:return:
program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table.
"""
if not program._distributed_lookup_table:
_logger.warn( _logger.warn(
"There are no distributed lookup tables need to be converted") "There are no distributed lookup tables need to be converted")
return return
# create table param and grad var in pserver program # create table param and grad var in pserver program
origin_emb_var = "{}.origin".format(main_program._distributed_lookup_table) origin_emb_var = "{}.origin".format(program._distributed_lookup_table)
emb_var = main_program._distributed_lookup_table emb_var = program._distributed_lookup_table
main_program.global_block()._rename_var(emb_var, origin_emb_var) program.global_block()._rename_var(emb_var, origin_emb_var)
origin_param_var = main_program.global_block().vars[origin_emb_var] origin_param_var = program.global_block().vars[origin_emb_var]
param_var = main_program.global_block().create_var( param_var = program.global_block().create_var(
name=emb_var, name=emb_var,
shape=origin_param_var.shape, shape=origin_param_var.shape,
dtype=origin_param_var.dtype, dtype=origin_param_var.dtype,
...@@ -100,28 +107,28 @@ def convert_dist_to_sparse_program(main_program): ...@@ -100,28 +107,28 @@ def convert_dist_to_sparse_program(main_program):
persistable=True) persistable=True)
# parameter must be selected rows # parameter must be selected rows
param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
main_program._sync_with_cpp() program._sync_with_cpp()
prefetch_op_tuples = __get_prefetch_op_tuples(main_program) prefetch_op_tuples = __get_prefetch_op_tuples(program)
split_ids_id = prefetch_op_tuples[0] split_ids_id = prefetch_op_tuples[0]
for idx in range(split_ids_id + 2, split_ids_id - 1, -1): for idx in range(split_ids_id + 2, split_ids_id - 1, -1):
main_program.global_block()._remove_op(idx) program.global_block()._remove_op(idx)
main_program.desc.flush() program.desc.flush()
in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2]) in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2])
for in_out_pair in in_out_pairs: for in_out_pair in in_out_pairs:
idx = split_ids_id idx = split_ids_id
ids = main_program.global_block().vars[in_out_pair[0]] ids = program.global_block().vars[in_out_pair[0]]
out = main_program.global_block().vars[in_out_pair[1]] out = program.global_block().vars[in_out_pair[1]]
__insert_lookup_sparse_table_op(main_program, idx, ids, param_var, out) __insert_lookup_sparse_table_op(program, idx, ids, param_var, out)
main_program.desc.flush() program.desc.flush()
return main_program return program
def load_persistable_vars(executor, dirname, program, lookup_table_var): def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
def _is_checkpoint_var(exclude_fluid_vars=None): def _is_checkpoint_var(exclude_fluid_vars=None):
""" """
the checkpoint will not save or load all the variables. the checkpoint will not save or load all the variables.
...@@ -159,8 +166,82 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var): ...@@ -159,8 +166,82 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var):
return is_valid return is_valid
def _load_lookup_table_vars(executor, dirname, main_program, io.load_vars(
lookup_table_vars): executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var(lookup_table_vars),
filename=None)
def load_persistables_for_increment(dirname, executor, program,
lookup_table_var, lookup_table_var_path):
"""
WARNING: this function will only be used for distributed training with distributed lookup table.
for increment trainning, the pserver will not only load dense variables,
but also load the suitable lookup table var. Because of slice lookup table
var with HASH, we must load the correct slice var.
:param dirname(str): The directory path
:param executor(Executor): The executor to run for loading inference model.
:param program(Program): The parameter server program, which will run on Pserver.
:param lookup_table_var: the distributed lookup tables var name.
:param lookup_table_var_path: the the distributed lookup tables var location.
:return: None
"""
def __load_lookup_table_vars(executor, main_program, lookup_table_var,
lookup_table_var_path):
emb_var = main_program.global_block().var(lookup_table_var)
load_program = Program()
load_block = load_program.global_block()
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [emb_var]},
attrs={'file_path': lookup_table_var_path})
executor.run(load_program)
if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname)
if not os.path.exists(lookup_table_var_path):
raise ValueError("There is no file named '%s'", lookup_table_var_path)
if not isinstance(program, Program):
raise ValueError("program must be an instance of fluid.Program")
_logger.info("Start Load Sparse Program With "
"Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
_load_persistable_vars(executor, dirname, program, [lookup_table_var])
__load_lookup_table_vars(executor, program, lookup_table_var,
lookup_table_var_path)
_logger.info("Finish Load Sparse Program With "
"Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
def load_persistables_for_inference(dirname, executor, program,
lookup_table_var_name):
"""
WARNING: this function will only be used for inference with distributed lookup table.
Inference with distributed lookup table is a little funky, this function will load distributed
lookup table vars into sparse var, can be used in local inference mode.
:param dirname(str): The directory path
:param executor(Executor): The executor to run for loading inference model.
:param program(Program): The parameter server program, which will run on Pserver.
:param lookup_table_var_name: the distributed lookup tables var name.
:return: None
"""
def __load_lookup_table_vars(executor, dirname, main_program,
lookup_table_vars):
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname) raise ValueError("There is no directory named '%s'", dirname)
...@@ -209,48 +290,34 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var): ...@@ -209,48 +290,34 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var):
global_block.append_op(type='delete_var', inputs={'X': sums}) global_block.append_op(type='delete_var', inputs={'X': sums})
executor.run(convert_program) executor.run(convert_program)
_logger.info("Start Load Sparse Program With "
"Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
lookup_table_vars = [lookup_table_var]
io.load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var(lookup_table_vars),
filename=None)
_load_lookup_table_vars(executor, dirname, program, lookup_table_vars)
_logger.info("Finish Load Sparse Program With "
"Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
def load_inference_model(dirname, executor, lookup_table_var_name):
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname) raise ValueError("There is no directory named '%s'", dirname)
local_model = os.path.join(dirname, model_filename) if program:
if not isinstance(program, Program):
raise ValueError("program must be an instance of fluid.Program")
else:
local_model = os.path.join(dirname, model_filename)
with open(local_model, "rb") as f: with open(local_model, "rb") as f:
program_desc_str = f.read() program_desc_str = f.read()
program = Program.parse_from_string(program_desc_str) program = Program.parse_from_string(program_desc_str)
if not core._is_program_version_supported(program._version()): if not core._is_program_version_supported(program._version()):
raise ValueError("Unsupported program version: %d\n" % raise ValueError("Unsupported program version: %d\n" %
program._version()) program._version())
# Binary data also need version. _logger.info("Start Load Sparse Program With "
load_persistable_vars(executor, dirname, program, lookup_table_var_name) "Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
_load_persistable_vars(executor, dirname, program, [lookup_table_var_name])
__load_lookup_table_vars(executor, dirname, program,
[lookup_table_var_name])
feed_target_names = program.desc.get_feed_target_names() _logger.info("Finish Load Sparse Program With "
fetch_target_names = program.desc.get_fetch_target_names() "Distributed Lookup Table Vars from {}, time = {}".format(
fetch_targets = [ dirname, time.ctime()))
program.global_block().var(name) for name in fetch_target_names
]
return [program, feed_target_names, fetch_targets] return program
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import collections import collections
import contextlib import contextlib
import os
import re import re
import six import six
import sys import sys
...@@ -27,11 +28,18 @@ from .proto import framework_pb2 ...@@ -27,11 +28,18 @@ from .proto import framework_pb2
try: try:
from . import core from . import core
except ImportError as e: except ImportError as e:
raise ImportError( if os.name == 'nt':
"""NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" raise ImportError(
if you encounters \"libmkldnn.so not found\" errors. If you have python """NOTE: You may need to run \"set PATH=c:\python27\lib:%PATH%\"
installed in other directory, replace \"/usr/local/lib\" with your own if you encounters \"mkldnn.dll not found\" errors. If you have python
directory. The original error is: \n""" + cpt.get_exception_message(e)) installed in other directory, replace \"c:\python27\lib" with your own
directory. The original error is: \n""" + cpt.get_exception_message(e))
else:
raise ImportError(
"""NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
if you encounters \"libmkldnn.so not found\" errors. If you have python
installed in other directory, replace \"/usr/local/lib\" with your own
directory. The original error is: \n""" + cpt.get_exception_message(e))
except Exception as e: except Exception as e:
raise e raise e
from . import unique_name from . import unique_name
...@@ -571,8 +579,8 @@ class Operator(object): ...@@ -571,8 +579,8 @@ class Operator(object):
OP_WITHOUT_KERNEL_SET = { OP_WITHOUT_KERNEL_SET = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select',
'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' 'checkpoint_notify', 'gen_nccl_id'
} }
def __init__(self, def __init__(self,
......
...@@ -28,7 +28,8 @@ def enabled(): ...@@ -28,7 +28,8 @@ def enabled():
def guard(): def guard():
train = framework.Program() train = framework.Program()
startup = framework.Program() startup = framework.Program()
tracer = core.Tracer(train.current_block().desc) tracer = core.Tracer(train.current_block().desc,
startup.current_block().desc)
with framework.program_guard(train, startup): with framework.program_guard(train, startup):
with framework.unique_name.guard(): with framework.unique_name.guard():
with framework._imperative_guard(tracer): with framework._imperative_guard(tracer):
......
...@@ -226,156 +226,6 @@ class BlockGuard(object): ...@@ -226,156 +226,6 @@ class BlockGuard(object):
return True return True
class ParallelDo(object):
"""
ParallelDo is used to represent multi-thread data parallel processing.
Its vanilla implementation can be shown as the following (:math:`|` means
single thread and :math:`||||` means multiple threads)
.. code-block:: text
In the forward pass
| Split input onto different devices
| Copy parameter onto different devices
|||| Compute forward pass in parallel
| Merge output from different devices
In the backward pass
| Split output@grad onto different devices
|||| Compute backward pass in parallel
| accumulate param@grad from different devices to the first device
| Merge input@grad from different devices
| Copy param@grad to the place of parallel_do_op
Examples:
.. code-block:: python
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# ParallelDo version & Single-thread version
if thread_num > 1:
places = fluid.layers.get_places(thread_num)
pd = fluid.layers.control_flow.ParallelDo(places)
with pd.do():
images = pd.read_input(images)
label = pd.read_input(label)
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
pd.write_output(avg_cost)
avg_cost = pd()
avg_cost = fluid.layers.mean(avg_cost)
else:
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
.. warning::
It will be soon deprecated, please use ParallelExecutor instead.
"""
def __init__(self, places, use_nccl=False, name=None):
warnings.warn(
"API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.",
Warning)
self.helper = LayerHelper("parallel_do", name=name)
self.inputs = []
self.places = places
self.outputs = []
self.status = StaticRNN.BEFORE_RNN_BLOCK
self.use_nccl = use_nccl
def do(self):
return BlockGuardWithCompletion(self)
def parent_block(self):
prog = self.helper.main_program
parent_idx = prog.current_block().parent_idx
assert parent_idx >= 0
parent_block = prog.block(parent_idx)
return parent_block
def __call__(self, *args, **kwargs):
if self.status != StaticRNN.AFTER_RNN_BLOCK:
raise ValueError("RNN output can only be retrieved after rnn block")
if len(self.outputs) == 0:
raise ValueError("RNN has no output")
elif len(self.outputs) == 1:
return self.outputs[0]
else:
return self.outputs
def read_input(self, var):
self.inputs.append(var)
return var
def write_output(self, var):
self.outputs.append(var)
def get_parameters(self):
main_program = self.helper.main_program
current_block = main_program.current_block()
parent_block = self.parent_block()
local_inputs = set()
params = list()
for var in self.inputs:
local_inputs.add(var.name)
for op in current_block.ops:
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in local_inputs:
params.append(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
local_inputs.add(out_var_name)
params = list(set(params))
return [parent_block.var(name) for name in params]
def _complete_op(self):
main_program = self.helper.main_program
current_block = main_program.current_block()
parent_block = self.parent_block()
step_scope = parent_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
self.outputs = [
parent_block.create_var(
name=o.name,
shape=o.shape,
dtype=o.dtype,
lod_level=o.lod_level,
persistable=o.persistable,
stop_gradient=o.stop_gradient) for o in self.outputs
]
inputs = [parent_block.var(i.name) for i in self.inputs]
outputs = [parent_block.var(o.name) for o in self.outputs]
parent_block.append_op(
type='parallel_do',
inputs={
'inputs': inputs,
'parameters': self.get_parameters(),
'places': self.places
},
outputs={'outputs': outputs,
'parallel_scopes': [step_scope]},
attrs={'sub_block': current_block,
'use_nccl': self.use_nccl})
class BlockGuardWithCompletion(BlockGuard): class BlockGuardWithCompletion(BlockGuard):
""" """
BlockGuardWithCompletion class. BlockGuardWithCompletion class.
...@@ -384,9 +234,8 @@ class BlockGuardWithCompletion(BlockGuard): ...@@ -384,9 +234,8 @@ class BlockGuardWithCompletion(BlockGuard):
""" """
def __init__(self, rnn): def __init__(self, rnn):
if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)): if not isinstance(rnn, StaticRNN):
raise TypeError( raise TypeError("BlockGuardWithCompletion takes a StaticRNN")
"BlockGuardWithCompletion takes a StaticRNN or ParallelDo")
super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program) super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program)
self.rnn = rnn self.rnn = rnn
......
...@@ -29,6 +29,7 @@ from . import utils ...@@ -29,6 +29,7 @@ from . import utils
from .. import unique_name from .. import unique_name
from functools import reduce from functools import reduce
from .. import core from .. import core
from ..imperative import layers
__all__ = [ __all__ = [
'fc', 'fc',
...@@ -9426,3 +9427,47 @@ def huber_loss(input, label, delta): ...@@ -9426,3 +9427,47 @@ def huber_loss(input, label, delta):
'Residual': residual}, 'Residual': residual},
attrs={'delta': delta}) attrs={'delta': delta})
return out return out
class FC(layers.PyLayer):
def __init__(self,
size,
param_attr=None,
num_flatten_dims=1,
dtype=core.VarDesc.VarType.FP32):
super(FC, self).__init__()
self._size = size
self._num_flatten_dims = num_flatten_dims
self._dtype = dtype
self._helper = LayerHelper('FC', param_attr=param_attr)
def _build_once(self, inputs):
input_shape = inputs[0].shape
param_shape = [
reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
] + [self._size]
self._w = self._helper.create_parameter(
attr=self._helper.param_attr,
shape=param_shape,
dtype=self._dtype,
is_bias=False)
def forward(self, inputs):
tmp = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
type="mul",
inputs={"X": inputs[0],
"Y": self._w},
outputs={"Out": tmp},
attrs={
"x_num_col_dims": self._num_flatten_dims,
"y_num_col_dims": 1
})
out = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
type="sum",
inputs={"X": [tmp]},
outputs={"Out": out},
attrs={"use_mkldnn": False})
return out
...@@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer): ...@@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer):
beta1 (float): The exponential decay rate for the 1st moment estimates. beta1 (float): The exponential decay rate for the 1st moment estimates.
beta2 (float): The exponential decay rate for the 2nd moment estimates. beta2 (float): The exponential decay rate for the 2nd moment estimates.
epsilon (float): a small float value for numerical stability. epsilon (float): a small float value for numerical stability.
regularization: A Regularizer, such as regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
fluid.regularizer.L2DecayRegularizer.
name: A optional name prefix. name: A optional name prefix.
lazy_mode(bool: false): The official Adam algorithm has two moving-average accumulators
the accumulators are updated at every step. Every element of the two moving-average is updated
in both dense mode and sparse mode. If the size of parameter is very large, then the update
may be very slow. The lazy mode only update the element that has gradient is the current
mini-batch, so it will be much more faster. But this mode has different semantics with the
original Adam algorithm and may lead to different result.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -663,7 +668,8 @@ class AdamOptimizer(Optimizer): ...@@ -663,7 +668,8 @@ class AdamOptimizer(Optimizer):
beta2=0.999, beta2=0.999,
epsilon=1e-8, epsilon=1e-8,
regularization=None, regularization=None,
name=None): name=None,
lazy_mode=False):
assert learning_rate is not None assert learning_rate is not None
assert beta1 is not None assert beta1 is not None
assert beta2 is not None assert beta2 is not None
...@@ -676,6 +682,7 @@ class AdamOptimizer(Optimizer): ...@@ -676,6 +682,7 @@ class AdamOptimizer(Optimizer):
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
self._lazy_mode = lazy_mode
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -729,7 +736,8 @@ class AdamOptimizer(Optimizer): ...@@ -729,7 +736,8 @@ class AdamOptimizer(Optimizer):
attrs={ attrs={
"beta1": self._beta1, "beta1": self._beta1,
"beta2": self._beta2, "beta2": self._beta2,
"epsilon": self._epsilon "epsilon": self._epsilon,
"lazy_mode": self._lazy_mode
}) })
return adam_op return adam_op
......
...@@ -92,35 +92,27 @@ class ParallelExecutor(object): ...@@ -92,35 +92,27 @@ class ParallelExecutor(object):
num_trainers=1, num_trainers=1,
trainer_id=0, trainer_id=0,
scope=None): scope=None):
# step1: get places, the places are used in run too.
self._places = [] self._places = []
self._act_places = []
if use_cuda: if use_cuda:
gpus = []
gpus_env = os.getenv("FLAGS_selected_gpus") gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env: if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")] gpus = [int(s) for s in gpus_env.split(",")]
else: else:
for i in six.moves.range(core.get_cuda_device_count()): gpus = [
gpus.append(i) i for i in six.moves.range(core.get_cuda_device_count())
for i in gpus: ]
p = core.Place() self._places = [core.CUDAPlace(i) for i in gpus]
self._act_places.append(core.CUDAPlace(i))
p.set_place(self._act_places[-1])
self._places.append(p)
else: else:
cpu_num = int( cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count())) os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
for i in six.moves.range(cpu_num): self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
p = core.Place()
self._act_places.append(core.CPUPlace())
p.set_place(self._act_places[-1])
self._places.append(p)
assert self._places, "no place for execution" assert self._places, "no place for execution"
# step2: init exec_strategy
if exec_strategy is None: if exec_strategy is None:
exec_strategy = ExecutionStrategy() exec_strategy = ExecutionStrategy()
exec_strategy.use_cuda = use_cuda exec_strategy.use_cuda = use_cuda
if exec_strategy.num_threads == 0: if exec_strategy.num_threads == 0:
if use_cuda: if use_cuda:
# Experiments on se-resnext shows that too many threads hurt # Experiments on se-resnext shows that too many threads hurt
...@@ -131,49 +123,54 @@ class ParallelExecutor(object): ...@@ -131,49 +123,54 @@ class ParallelExecutor(object):
os.environ.get('CPU_NUM', multiprocessing.cpu_count())) os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exec_strategy.num_threads = cpu_num * 2 exec_strategy.num_threads = cpu_num * 2
# step3: init build_strategy
if build_strategy is None: if build_strategy is None:
build_strategy = BuildStrategy() build_strategy = BuildStrategy()
build_strategy.num_trainers = num_trainers build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id build_strategy.trainer_id = trainer_id
main = main_program # step4: get main_program, scope, local_scopes
main = main if main else framework.default_main_program() main = main_program if main_program \
else framework.default_main_program()
scope = scope if scope is not None else executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes()\
if share_vars_from else []
# step5: check trainers_endpoints, it is used for distribution.
trainers_endpoints = main._trainers_endpoints trainers_endpoints = main._trainers_endpoints
if num_trainers > 1 and trainers_endpoints: if num_trainers > 1 and trainers_endpoints:
assert num_trainers == len( assert num_trainers == len(
trainers_endpoints), "num_trainers == len(end_points)" trainers_endpoints), "num_trainers == len(end_points)"
build_strategy.trainers_endpoints = trainers_endpoints build_strategy.trainers_endpoints = trainers_endpoints
if scope == None: # step5: get persistable_vars, parameter_vars, places. persistable_vars
scope = executor.global_scope() # need be broadcast to other local_scope.
persistable_vars = set([
if share_vars_from and not isinstance(share_vars_from, cpt.to_text(v.name) for v in [
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else []
self.persistable_vars = [
v.name for v in [
var for var in main.list_vars() var for var in main.list_vars()
if var.persistable and var.type != core.VarDesc.VarType.RAW if var.persistable and var.type != core.VarDesc.VarType.RAW
] ]
] ])
def place_obj(place):
p = core.Place()
p.set_place(place)
return p
places = list(map(place_obj, self._places))
# step6: init ParallelExecutor
self.executor = core.ParallelExecutor( self.executor = core.ParallelExecutor(
self._places, places, persistable_vars, main.desc,
set([
cpt.to_text(p.name)
for p in main.global_block().iter_parameters()
if not p.stop_gradient
]),
set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
cpt.to_text(loss_name) cpt.to_text(loss_name)
if loss_name else six.u(''), scope, local_scopes, exec_strategy, if loss_name else six.u(''), scope, local_scopes, exec_strategy,
build_strategy, num_trainers, trainer_id) build_strategy, num_trainers, trainer_id)
self.scope = scope self.scope = scope
def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
...@@ -261,7 +258,7 @@ class ParallelExecutor(object): ...@@ -261,7 +258,7 @@ class ParallelExecutor(object):
self.executor.feed_and_split_tensor_into_local_scopes( self.executor.feed_and_split_tensor_into_local_scopes(
feed_tensor_dict) feed_tensor_dict)
elif isinstance(feed, list) or isinstance(feed, tuple): elif isinstance(feed, list) or isinstance(feed, tuple):
if len(feed) != len(self._act_places): if len(feed) != len(self._places):
raise ValueError( raise ValueError(
"Feed a list of tensor, the list should be the same size as places" "Feed a list of tensor, the list should be the same size as places"
) )
...@@ -277,7 +274,7 @@ class ParallelExecutor(object): ...@@ -277,7 +274,7 @@ class ParallelExecutor(object):
tensor = each[feed_name] tensor = each[feed_name]
if not isinstance(tensor, core.LoDTensor): if not isinstance(tensor, core.LoDTensor):
tmp = core.LoDTensor() tmp = core.LoDTensor()
tmp.set(tensor, self._act_places[i]) tmp.set(tensor, self._places[i])
tensor = tmp tensor = tmp
res_dict[feed_name] = tensor res_dict[feed_name] = tensor
res.append(res_dict) res.append(res_dict)
...@@ -294,4 +291,4 @@ class ParallelExecutor(object): ...@@ -294,4 +291,4 @@ class ParallelExecutor(object):
@property @property
def device_count(self): def device_count(self):
return len(self._act_places) return len(self._places)
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
from __future__ import print_function from __future__ import print_function
from paddle.fluid.layers.device import get_places from paddle.fluid.layers.device import get_places
from paddle.fluid.layers.control_flow import ParallelDo
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle import paddle
...@@ -147,22 +146,7 @@ def train(word_dict, ...@@ -147,22 +146,7 @@ def train(word_dict,
cost, acc_out, prediction = net_method( cost, acc_out, prediction = net_method(
data, label, input_dim=dict_dim, class_dim=class_dim) data, label, input_dim=dict_dim, class_dim=class_dim)
else: else:
places = get_places() raise NotImplementedError()
pd = ParallelDo(places)
with pd.do():
cost, acc, _ = net_method(
pd.read_input(data),
pd.read_input(label),
input_dim=dict_dim,
class_dim=class_dim)
pd.write_output(cost)
pd.write_output(acc)
cost, acc = pd()
cost = fluid.layers.mean(cost)
acc_out = fluid.layers.mean(acc)
prediction = None
assert save_dirname is None
adagrad = fluid.optimizer.Adagrad(learning_rate=0.002) adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
adagrad.minimize(cost) adagrad.minimize(cost)
......
...@@ -25,7 +25,6 @@ import numpy ...@@ -25,7 +25,6 @@ import numpy
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places from paddle.fluid.layers.device import get_places
from paddle.fluid.layers.control_flow import ParallelDo
BATCH_SIZE = 64 BATCH_SIZE = 64
...@@ -82,19 +81,7 @@ def train(nn_type, ...@@ -82,19 +81,7 @@ def train(nn_type,
net_conf = conv_net net_conf = conv_net
if parallel: if parallel:
places = get_places() raise NotImplementedError()
pd = ParallelDo(places)
with pd.do():
img_ = pd.read_input(img)
label_ = pd.read_input(label)
prediction, avg_loss, acc = net_conf(img_, label_)
for o in [avg_loss, acc]:
pd.write_output(o)
avg_loss, acc = pd()
# get mean loss and acc through every devices.
avg_loss = fluid.layers.mean(avg_loss)
acc = fluid.layers.mean(acc)
else: else:
prediction, avg_loss, acc = net_conf(img, label) prediction, avg_loss, acc = net_conf(img, label)
...@@ -273,7 +260,7 @@ def inject_all_tests(): ...@@ -273,7 +260,7 @@ def inject_all_tests():
for use_cuda in (False, True): for use_cuda in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
for parallel in (False, True): for parallel in (False, ):
for nn_type in ('mlp', 'conv'): for nn_type in ('mlp', 'conv'):
inject_test_method(use_cuda, parallel, nn_type, True) inject_test_method(use_cuda, parallel, nn_type, True)
......
...@@ -17,7 +17,6 @@ from __future__ import print_function ...@@ -17,7 +17,6 @@ from __future__ import print_function
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places from paddle.fluid.layers.device import get_places
from paddle.fluid.layers.control_flow import ParallelDo
import unittest import unittest
import os import os
import numpy as np import numpy as np
...@@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): ...@@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
avg_cost, predict_word = __network__( avg_cost, predict_word = __network__(
[first_word, second_word, third_word, forth_word, next_word]) [first_word, second_word, third_word, forth_word, next_word])
else: else:
places = get_places() raise NotImplementedError()
pd = ParallelDo(places)
with pd.do():
avg_cost, predict_word = __network__(
list(
map(pd.read_input, [
first_word, second_word, third_word, forth_word,
next_word
])))
pd.write_output(avg_cost)
avg_cost = fluid.layers.mean(pd())
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
...@@ -262,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): ...@@ -262,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
for use_cuda in (False, True): for use_cuda in (False, True):
for is_sparse in (False, True): for is_sparse in (False, True):
for is_parallel in (False, True): for is_parallel in (False, ):
inject_test_method(use_cuda, is_sparse, is_parallel) inject_test_method(use_cuda, is_sparse, is_parallel)
if __name__ == '__main__': if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import math
import sys
import paddle
import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places
from paddle.fluid.layers.control_flow import ParallelDo
# need to fix random seed and training data to compare the loss
# value accurately calculated by the default and the memory optimization
# version.
fluid.default_startup_program().random_seed = 111
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
device_type = 'CPU'
use_nccl = False
place = fluid.CPUPlace()
if fluid.core.is_compiled_with_cuda():
device_type = 'CUDA'
use_nccl = False
place = fluid.CUDAPlace(0)
places = get_places(device_count=0, device_type=device_type)
pd = ParallelDo(places, use_nccl=use_nccl)
with pd.do():
x_ = pd.read_input(x)
y_ = pd.read_input(y)
y_predict = fluid.layers.fc(input=x_, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y_)
avg_cost = fluid.layers.mean(x=cost)
pd.write_output(avg_cost)
cost = pd()
avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(avg_cost)
fluid.memory_optimize(fluid.default_main_program(), print_log=True)
# fluid.release_memory(fluid.default_main_program())
BATCH_SIZE = 200
# fix the order of training data
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
# train_reader = paddle.batch(
# paddle.reader.shuffle(
# paddle.dataset.uci_housing.train(), buf_size=500),
# batch_size=BATCH_SIZE)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for data in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
if avg_loss_value[0] < 10.0:
exit(0) # if avg cost less than 10.0, we think our code is good.
print(avg_loss_value[0])
if math.isnan(float(avg_loss_value)):
sys.exit("got NaN loss, training failed.")
exit(1)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh
class TestNGRAPHReluDim2(TestRelu):
def setUp(self):
super(TestNGRAPHReluDim2, self).setUp()
class TestNGRAPHTanhDim2(TestTanh):
def setUp(self):
super(TestNGRAPHTanhDim2, self).setUp()
class TestNGRAPHReluDim4(TestRelu):
def setUp(self):
super(TestNGRAPHReluDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
class TestNGRAPHTanhDim4(TestTanh):
def setUp(self):
super(TestNGRAPHTanhDim4, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.tanh(self.inputs['X'])}
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2
class TestNGRAPHMulOp(TestMulOp):
def init_dtype_type(self):
pass
class TestNGRAPHMulOp2(TestMulOp2):
def init_dtype_type(self):
pass
class TestNGRAPHFP16MulOp1(TestFP16MulOp1):
def init_dtype_type(self):
pass
class TestNGRAPHFP16MulOp2(TestFP16MulOp2):
def init_dtype_type(self):
pass
if __name__ == "__main__":
unittest.main()
...@@ -194,7 +194,8 @@ def adam_step(inputs, attributes): ...@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
return param_out, moment1_out, moment2_out return param_out, moment1_out, moment2_out
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
lazy_mode):
''' '''
Simulate one step of the adam optimizer Simulate one step of the adam optimizer
:param inputs: dict of inputs :param inputs: dict of inputs
...@@ -218,19 +219,30 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): ...@@ -218,19 +219,30 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
moment2_out = np.zeros(shape=[height, row_numel]) moment2_out = np.zeros(shape=[height, row_numel])
param_out = np.zeros(shape=[height, row_numel]) param_out = np.zeros(shape=[height, row_numel])
for idx, row_id in enumerate(rows): def update_row(row_id, update_value):
moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1 moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
) * np_grad[idx] ) * update_value
moment2_out[row_id] = beta2 * moment2[row_id] + ( moment2_out[row_id] = beta2 * moment2[row_id] + (
1 - beta2) * np.square(np_grad[idx]) 1 - beta2) * np.square(update_value)
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / ( param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
np.sqrt(moment2_out[row_id]) + epsilon)) np.sqrt(moment2_out[row_id]) + epsilon))
if lazy_mode:
for idx, row_id in enumerate(rows):
update_row(row_id, np_grad[idx])
else:
for row_id in range(param_out.shape[0]):
update_value = np.zeros(np_grad[0].shape).astype("float32")
if row_id in rows:
update_value = np_grad[rows.index(row_id)]
update_row(row_id, update_value)
return param_out, moment1_out, moment2_out return param_out, moment1_out, moment2_out
class TestSparseAdamOp(unittest.TestCase): class TestSparseAdamOp(unittest.TestCase):
def setup(self, scope, place): def setup(self, scope, place, lazy_mode):
beta1 = 0.78 beta1 = 0.78
beta2 = 0.836 beta2 = 0.836
epsilon = 1e-4 epsilon = 1e-4
...@@ -248,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase): ...@@ -248,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase):
'Beta2Pow': np.array([beta2**10]).astype("float32"), 'Beta2Pow': np.array([beta2**10]).astype("float32"),
"LearningRate": np.full((1), 2.0).astype("float32") "LearningRate": np.full((1), 2.0).astype("float32")
} }
self.init_output = np.full((height, row_numel), 0.0).astype("float32")
self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
grad_selected_rows = scope.var('Grad').get_selected_rows() grad_selected_rows = scope.var('Grad').get_selected_rows()
...@@ -262,19 +275,21 @@ class TestSparseAdamOp(unittest.TestCase): ...@@ -262,19 +275,21 @@ class TestSparseAdamOp(unittest.TestCase):
self.sparse_inputs = ["Grad"] self.sparse_inputs = ["Grad"]
param_out, mom1, mom2 = adam_step_sparse( param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
self.dense_inputs, self.attrs, height, rows, row_numel, np_array) height, rows, row_numel,
np_array, lazy_mode)
self.outputs = { self.outputs = {
"ParamOut": param_out, "ParamOut": param_out,
"Moment1Out": mom1, "Moment1Out": mom1,
"Moment2Out": mom2 "Moment2Out": mom2
} }
def check_with_place(self, place): def check_with_place(self, place, lazy_mode):
scope = core.Scope() scope = core.Scope()
self.setup(scope, place) self.setup(scope, place, lazy_mode)
op_args = dict() op_args = dict()
op_args['lazy_mode'] = lazy_mode
for key, np_array in self.dense_inputs.items(): for key, np_array in self.dense_inputs.items():
var = scope.var(key).get_tensor() var = scope.var(key).get_tensor()
var.set(np_array, place) var.set(np_array, place)
...@@ -283,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase): ...@@ -283,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase):
op_args[s] = s op_args[s] = s
for s in self.outputs: for s in self.outputs:
var = scope.var(s).get_tensor() var = scope.var(s).get_tensor()
var.set(self.outputs[s], place) var.set(self.init_output, place)
op_args[s] = s op_args[s] = s
for k in self.attrs: for k in self.attrs:
op_args[k] = self.attrs[k] op_args[k] = self.attrs[k]
...@@ -297,20 +312,17 @@ class TestSparseAdamOp(unittest.TestCase): ...@@ -297,20 +312,17 @@ class TestSparseAdamOp(unittest.TestCase):
actual = np.array(out_var) actual = np.array(out_var)
actual = actual.reshape([actual.size]) actual = actual.reshape([actual.size])
np_array = np_array.reshape([np_array.size]) np_array = np_array.reshape([np_array.size])
for idx, row_id in enumerate(self.rows):
j = 0 for i in range(np_array.size):
while j < self.row_numel: self.assertLess((actual[i] - np_array[i]), 0.00001)
pos = row_id * self.row_numel + j
self.assertLess((actual[pos] - np_array[pos]) / actual[pos], def test_sparse_adam(self):
0.00001)
j += 1
def test_sparse_sgd(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) for lazy_mode in (True, False):
self.check_with_place(place, lazy_mode)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest import unittest
from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
class TestMKLDNN(TestConv2dOp): class TestMKLDNN(TestConv2dOp):
...@@ -37,5 +37,23 @@ class TestMKLDNNWithStride(TestWithStride): ...@@ -37,5 +37,23 @@ class TestMKLDNNWithStride(TestWithStride):
self.data_format = "NCHW" self.data_format = "NCHW"
class TestMKLDNNWithGroup(TestWithGroup):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNWith1x1(TestWith1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -29,7 +29,7 @@ class TestGetTensorFromSelectedRows(unittest.TestCase): ...@@ -29,7 +29,7 @@ class TestGetTensorFromSelectedRows(unittest.TestCase):
def check_with_place(self, place): def check_with_place(self, place):
scope = core.Scope() scope = core.Scope()
x_rows = [0, 5, 5, 4, 20] x_rows = [0, 5, 5, 4, 19]
height = 20 height = 20
row_numel = 2 row_numel = 2
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layers.nn import FC
@contextlib.contextmanager
def new_program_scope():
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
class MyLayer(fluid.imperative.PyLayer):
def __init__(self):
super(MyLayer, self).__init__()
def forward(self, inputs):
x = fluid.layers.relu(inputs[0])
self._x_for_debug = x
return [fluid.layers.elementwise_mul(x, x)]
class MLP(fluid.imperative.PyLayer):
def __init__(self):
super(MLP, self).__init__()
self._fc1 = FC(3,
fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
self._fc2 = FC(4,
fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
def forward(self, inputs):
x = self._fc1(inputs[0])
x = self._fc2(x)
x = fluid.layers.reduce_sum(x)
return x
class TestImperative(unittest.TestCase):
def test_layer(self):
with fluid.imperative.guard():
cl = core.Layer()
cl.forward([])
l = fluid.imperative.PyLayer()
l.forward([])
def test_layer_in_out(self):
np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
with fluid.imperative.guard():
l = MyLayer()
x = l(np_inp)[0]
self.assertIsNotNone(x)
dy_out = x._numpy()
x._backward()
dy_grad = l._x_for_debug._gradient()
with new_program_scope():
inp = fluid.layers.data(
name="inp", shape=[3], append_batch_size=False)
l = MyLayer()
x = l(inp)[0]
param_grads = fluid.backward.append_backward(
x, parameter_list=[l._x_for_debug.name])[0]
exe = fluid.Executor(fluid.CPUPlace())
static_out, static_grad = exe.run(
feed={inp.name: np_inp},
fetch_list=[x.name, param_grads[1].name])
self.assertTrue(np.allclose(dy_out, static_out))
self.assertTrue(np.allclose(dy_grad, static_grad))
def test_mlp(self):
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
with fluid.imperative.guard():
mlp = MLP()
out = mlp(np_inp)
dy_out = out._numpy()
out._backward()
dy_grad = mlp._fc1._w._gradient()
with new_program_scope():
inp = fluid.layers.data(
name="inp", shape=[2, 2], append_batch_size=False)
mlp = MLP()
out = mlp(inp)
param_grads = fluid.backward.append_backward(
out, parameter_list=[mlp._fc1._w.name])[0]
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
static_out, static_grad = exe.run(
feed={inp.name: np_inp},
fetch_list=[out.name, param_grads[1].name])
self.assertTrue(np.allclose(dy_out, static_out))
self.assertTrue(np.allclose(dy_grad, static_grad))
if __name__ == '__main__':
unittest.main()
...@@ -29,8 +29,8 @@ class TestMergeSelectedRows(unittest.TestCase): ...@@ -29,8 +29,8 @@ class TestMergeSelectedRows(unittest.TestCase):
def check_with_place(self, place): def check_with_place(self, place):
scope = core.Scope() scope = core.Scope()
x_rows = [0, 5, 5, 4, 20] x_rows = [0, 5, 5, 4, 19]
out_rows = [0, 4, 5, 20] out_rows = [0, 4, 5, 19]
height = 20 height = 20
row_numel = 2 row_numel = 2
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_transpose_op import TestTransposeOp
class TestTransposeMKLDNN(TestTransposeOp):
def init_op_type(self):
self.op_type = "transpose2"
self.use_mkldnn = True
self.is_test = True
return
def test_check_grad(self):
return
def test_check_grad_no_input(self):
return
def test_check_grad_no_filter(self):
return
class TestCase0MKLDNN(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (3, )
self.axis = (0, )
class TestCase1a(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (3, 4, 5)
self.axis = (0, 2, 1)
class TestCase1b(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (3, 4, 5)
self.axis = (2, 1, 0)
class TestCase2(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (2, 3, 4, 5)
self.axis = (0, 2, 3, 1)
class TestCase3(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (2, 3, 4, 5, 6)
self.axis = (4, 2, 3, 1, 0)
class TestCase4(TestTransposeMKLDNN):
def initTestCase(self):
self.shape = (2, 3, 4, 5, 6, 1)
self.axis = (4, 2, 3, 1, 0, 5)
if __name__ == '__main__':
unittest.main()
...@@ -21,15 +21,24 @@ from op_test import OpTest ...@@ -21,15 +21,24 @@ from op_test import OpTest
class TestTransposeOp(OpTest): class TestTransposeOp(OpTest):
def setUp(self): def setUp(self):
self.init_op_type()
self.initTestCase() self.initTestCase()
self.op_type = "transpose2"
self.inputs = {'X': np.random.random(self.shape).astype("float32")} self.inputs = {'X': np.random.random(self.shape).astype("float32")}
self.attrs = {'axis': list(self.axis)} self.attrs = {
'axis': list(self.axis),
'use_mkldnn': self.use_mkldnn,
'is_test': self.is_test,
}
self.outputs = { self.outputs = {
'XShape': np.random.random(self.shape).astype("float32"), 'XShape': np.random.random(self.shape).astype("float32"),
'Out': self.inputs['X'].transpose(self.axis) 'Out': self.inputs['X'].transpose(self.axis)
} }
def init_op_type(self):
self.op_type = "transpose2"
self.use_mkldnn = False
self.is_test = False
def test_check_output(self): def test_check_output(self):
self.check_output(no_check_set=['XShape']) self.check_output(no_check_set=['XShape'])
......
...@@ -35,11 +35,10 @@ dtype_to_size = { ...@@ -35,11 +35,10 @@ dtype_to_size = {
} }
SUB_BLOCK_OPS = [ SUB_BLOCK_OPS = [
"while", "while_grad", "parallel_do", "parallel_do_grad", "while", "while_grad", "conditional_block", "conditional_block_grad"
"conditional_block", "conditional_block_grad"
] ]
SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), SUB_BLOCK_PAIR = [("while", "while_grad"),
("conditional_block", "conditional_block_grad")] ("conditional_block", "conditional_block_grad")]
PRINT_LOG = False PRINT_LOG = False
......
...@@ -107,9 +107,9 @@ packages=['paddle', ...@@ -107,9 +107,9 @@ packages=['paddle',
'paddle.fluid.distributed', 'paddle.fluid.distributed',
'paddle.fluid.layers', 'paddle.fluid.layers',
'paddle.fluid.contrib', 'paddle.fluid.contrib',
'paddle.fluid.contrib.utils',
'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize', 'paddle.fluid.contrib.quantize',
'paddle.fluid.contrib.utils',
'paddle.fluid.transpiler', 'paddle.fluid.transpiler',
'paddle.fluid.transpiler.details'] 'paddle.fluid.transpiler.details']
...@@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF': ...@@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
# put all thirdparty libraries in paddle.libs # put all thirdparty libraries in paddle.libs
libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
if os.name != 'nt':
package_data['paddle.libs']= [] package_data['paddle.libs']= []
package_data['paddle.libs']=['libwarpctc' + ext_name] package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
shutil.copy('${WARPCTC_LIBRARIES}', libs_path) shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
if '${WITH_MKL}' == 'ON': if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_LIB}', libs_path)
shutil.copy('${MKLML_IOMP_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册