提交 c1f7e54f 编写于 作者: S sneaxiy

merge develop

test=develop
...@@ -350,6 +350,22 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b ...@@ -350,6 +350,22 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.__init__ ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.delete ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.download ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.HDFSClient.is_dir ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.HDFSClient.is_exist ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.HDFSClient.ls ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.lsr ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True))
paddle.fluid.contrib.HDFSClient.make_local_dirs ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.makedirs ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.HDFSClient.rename ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.contrib.HDFSClient.upload ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5))
paddle.fluid.contrib.multi_download ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,))
paddle.fluid.contrib.multi_upload ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True))
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
......
...@@ -131,9 +131,7 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy( ...@@ -131,9 +131,7 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
std::unique_ptr<ir::Graph> BuildStrategy::Apply( std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places, const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
#else #else
...@@ -149,9 +147,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -149,9 +147,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places); pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
pass->Erase("loss_var_name"); pass->Erase("loss_var_name");
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name); pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
pass->Erase("params");
pass->SetNotOwned<const std::unordered_set<std::string>>("params",
&param_names);
pass->Erase("local_scopes"); pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes", pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes); &local_scopes);
......
...@@ -106,14 +106,13 @@ struct BuildStrategy { ...@@ -106,14 +106,13 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be // Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph. // applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply( std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
const ProgramDesc &main_program,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; const bool use_cuda,
platform::NCCLContextMap *nccl_ctxs) const;
#else #else
const bool use_cuda) const; const bool use_cuda) const;
#endif #endif
......
...@@ -130,7 +130,6 @@ void AddOutputToLeafOps(ir::Graph *graph) { ...@@ -130,7 +130,6 @@ void AddOutputToLeafOps(ir::Graph *graph) {
static const char kLossVarName[] = "loss_var_name"; static const char kLossVarName[] = "loss_var_name";
static const char kPlaces[] = "places"; static const char kPlaces[] = "places";
static const char kParams[] = "params";
static const char kLocalScopes[] = "local_scopes"; static const char kLocalScopes[] = "local_scopes";
static const char kStrategy[] = "strategy"; static const char kStrategy[] = "strategy";
static const char kNumTrainers[] = "num_trainers"; static const char kNumTrainers[] = "num_trainers";
...@@ -147,9 +146,6 @@ void MultiDevSSAGraphBuilder::Init() const { ...@@ -147,9 +146,6 @@ void MultiDevSSAGraphBuilder::Init() const {
nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs"); nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
#endif #endif
for (auto &p : Get<const std::unordered_set<std::string>>(kParams)) {
grad_names_.insert(GradVarName(p));
}
balance_vars_.resize(places_.size(), 0); balance_vars_.resize(places_.size(), 0);
if (strategy_.enable_data_balance_ && places_.size() == 1) { if (strategy_.enable_data_balance_ && places_.size() == 1) {
LOG(WARNING) << "It is no need to enable data balance when there is only " LOG(WARNING) << "It is no need to enable data balance when there is only "
...@@ -359,7 +355,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -359,7 +355,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
BuildStrategy::GradientScaleStrategy::kCustomized) { BuildStrategy::GradientScaleStrategy::kCustomized) {
// TODO(paddle-dev): Why is there no input for this op_handle? // TODO(paddle-dev): Why is there no input for this op_handle?
auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]); auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType();
CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0],
out_dtype);
} }
// This assumes the backward generating code will ensure IsScaleLossOp // This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss. // is true only for the op that scale the final scalar loss.
...@@ -662,13 +660,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( ...@@ -662,13 +660,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
ir::Graph *result, const std::string &loss_grad_name, ir::Graph *result, const std::string &loss_grad_name,
ir::Node *out_var_node) const { ir::Node *out_var_node, proto::VarType::Type dtype) const {
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle // Insert ScaleCost OpHandle
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
auto *op_handle = new ScaleLossGradOpHandle( auto *op_handle = new ScaleLossGradOpHandle(
result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx); local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype);
result->Get<GraphOps>(kGraphOps).emplace_back(op_handle); result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale // FIXME: Currently ScaleLossGradOp only use device_count as scale
...@@ -896,7 +894,6 @@ REGISTER_PASS(multi_devices_pass, ...@@ -896,7 +894,6 @@ REGISTER_PASS(multi_devices_pass,
paddle::framework::details::MultiDevSSAGraphBuilder) paddle::framework::details::MultiDevSSAGraphBuilder)
.RequirePassAttr(paddle::framework::details::kLossVarName) .RequirePassAttr(paddle::framework::details::kLossVarName)
.RequirePassAttr(paddle::framework::details::kPlaces) .RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kParams)
.RequirePassAttr(paddle::framework::details::kLocalScopes) .RequirePassAttr(paddle::framework::details::kLocalScopes)
.RequirePassAttr(paddle::framework::details::kStrategy) .RequirePassAttr(paddle::framework::details::kStrategy)
.RequirePassAttr(paddle::framework::details::kNumTrainers); .RequirePassAttr(paddle::framework::details::kNumTrainers);
...@@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateScaleLossGradOp(ir::Graph *result, void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name, const std::string &loss_grad_name,
ir::Node *out_var_node) const; ir::Node *out_var_node,
proto::VarType::Type dtype) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const; int dst_dev_id) const;
...@@ -102,7 +103,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -102,7 +103,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
mutable std::string loss_var_name_; mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_; mutable std::vector<platform::Place> places_;
mutable std::vector<Scope *> local_scopes_; mutable std::vector<Scope *> local_scopes_;
mutable std::unordered_set<std::string> grad_names_;
mutable BuildStrategy strategy_; mutable BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_; mutable std::unordered_map<std::string, VarDesc *> all_vars_;
......
...@@ -22,39 +22,66 @@ namespace details { ...@@ -22,39 +22,66 @@ namespace details {
ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
Scope *scope, Scope *scope,
platform::Place place, platform::Place place,
platform::DeviceContext *dev_ctx) platform::DeviceContext *dev_ctx,
proto::VarType::Type dtype)
: OpHandleBase(node), : OpHandleBase(node),
coeff_(static_cast<float>(1.0 / num_dev)), coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope), scope_(scope),
place_(place) { place_(place),
out_dtype_(dtype) {
this->SetDeviceContext(place_, dev_ctx); this->SetDeviceContext(place_, dev_ctx);
} }
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
void ScaleLossGradOpHandle::RunImpl() { struct ScaleLossGradFunctor {
// Doesn't wait any event float coeff_;
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_; Tensor *out_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(); platform::Place place_;
OpHandleBase *op_handle_;
proto::VarType::Type out_dtype_;
platform::DeviceContext *ctx_;
float *tmp = local_scope.FindVar(var_name) ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
->GetMutable<LoDTensor>() OpHandleBase *op_handle, proto::VarType::Type dtype,
->mutable_data<float>(make_ddim({1}), place_); platform::DeviceContext *ctx)
: coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
template <typename OutT>
void apply() const {
auto *out_data = out_->mutable_data<OutT>(place_);
if (platform::is_cpu_place(place_)) { if (platform::is_cpu_place(place_)) {
*tmp = coeff_; *out_data = static_cast<OutT>(coeff_);
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] { OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>( auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
this->dev_ctxes_.at(place_)) memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
->stream(); platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp, stream);
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(10) << place_ << "RUN Scale loss grad op"; VLOG(10) << place_ << "RUN Scale loss grad op";
});
#endif #endif
} }
}
};
// Writes the loss-gradient scale coefficient into this op's output tensor.
//
// The output variable name is taken from outputs_[0]; the variable is looked
// up in the scope stored under kLocalExecScopeName inside scope_. The tensor
// is resized to a single element and ScaleLossGradFunctor is dispatched on
// out_dtype_ through framework::VisitDataType.
void ScaleLossGradOpHandle::RunImpl() {
// Doesn't wait any event
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
// The output holds exactly one element.
tensor->Resize(make_ddim({1}));
#ifdef PADDLE_WITH_CUDA
// CUDA build: hand the functor the device context registered for place_ and
// run the dispatch through RunAndRecordEvent.
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
this->dev_ctxes_.at(place_));
this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
#else
// Non-CUDA build: no device context is passed (nullptr) and the dispatch runs
// directly.
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
framework::VisitDataType(out_dtype_, func);
#endif
} }
std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
......
...@@ -26,8 +26,8 @@ namespace details { ...@@ -26,8 +26,8 @@ namespace details {
struct ScaleLossGradOpHandle : public OpHandleBase { struct ScaleLossGradOpHandle : public OpHandleBase {
ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
platform::Place place, platform::Place place, platform::DeviceContext *context,
platform::DeviceContext *context); proto::VarType::Type dtype);
~ScaleLossGradOpHandle() final; ~ScaleLossGradOpHandle() final;
...@@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase { ...@@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
float coeff_; float coeff_;
Scope *scope_; Scope *scope_;
platform::Place place_; platform::Place place_;
proto::VarType::Type out_dtype_;
}; };
} // namespace details } // namespace details
......
...@@ -24,35 +24,6 @@ namespace paddle { ...@@ -24,35 +24,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
// Redirects consumers of `from` to `to`.
//
// For every node visited by a DFS over `graph` that lists `from` among its
// input nodes:
//   * `to` is linked to that node via IR_NODE_LINK_TO, and
//   * every input slot of the node's op whose argument list mentions
//     from->Name() is overwritten with the single argument to->Name().
// NOTE(review): this function does not itself unlink `from` from the visited
// node -- presumably the caller removes `from` afterwards; confirm.
void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
  for (auto& node : GraphTraits::DFS(*graph)) {
    const bool consumes_from =
        std::find(node.inputs.begin(), node.inputs.end(), from) !=
        node.inputs.end();
    if (!consumes_from) {
      continue;
    }

    IR_NODE_LINK_TO(to, (&node));

    // Work on a by-value snapshot of the op inputs: SetInput() below is
    // called on the same op while the slots are being walked.
    auto inputs_snapshot = node.Op()->Inputs();
    for (const VariableNameMap::value_type& slot : inputs_snapshot) {
      auto slot_args = slot.second;
      const bool mentions_from =
          std::find(slot_args.begin(), slot_args.end(), from->Name()) !=
          slot_args.end();
      if (mentions_from) {
        node.Op()->SetInput(slot.first, {to->Name()});
      }
    }
  }
}
bool IsReachable(ir::Graph* graph, Node* from, Node* to) { bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
auto find_node = [](ir::Graph* graph, const Node* node) -> Node* { auto find_node = [](ir::Graph* graph, const Node* node) -> Node* {
for (auto n : graph->Nodes()) { for (auto n : graph->Nodes()) {
...@@ -99,24 +70,11 @@ bool IsReachable(ir::Graph* graph, Node* from, Node* to) { ...@@ -99,24 +70,11 @@ bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
return false; return false;
} }
boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name) { template <typename T>
auto bias_input_names = op.Op()->Inputs(); boost::optional<T> HasAttribute(const Node& op, const std::string& attr) {
auto bias_it = bias_input_names.find(bias_name); if (op.Op()->HasAttr(attr))
return boost::get<T>(op.Op()->GetAttr(attr));
if (bias_it != std::end(bias_input_names)) { else
bool has_bias = !bias_it->second.empty();
if (has_bias) {
auto bias_names = bias_it->second;
auto bias_names_it =
std::find_if(std::begin(op.inputs), std::end(op.inputs),
[&bias_names](Node* n) -> bool {
return n->Name() == bias_names[0];
});
return *bias_names_it;
}
}
return boost::none; return boost::none;
} }
...@@ -151,40 +109,18 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( ...@@ -151,40 +109,18 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()(
if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; if (!IsReachable(graph, elementwise_add_identity, conv_output)) return;
OpDesc op_desc; auto fuse_relu = HasAttribute<bool>(*conv_op, "fuse_relu");
op_desc.SetType("conv2d"); if (fuse_relu && *fuse_relu) return;
op_desc.SetInput("Input", {conv_input->Name()}); conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()});
op_desc.SetInput("Filter", {conv_filter->Name()}); conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
op_desc.SetInput("ResidualData", {elementwise_add_identity->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true);
op_desc.SetOutput("Output", {conv_output->Name()});
auto conv_bias = HasBias(*conv_op, "Bias"); GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op});
if (conv_bias) { IR_NODE_LINK_TO(elementwise_add_identity, conv_op);
op_desc.SetInput("Bias", {(*conv_bias)->Name()}); IR_NODE_LINK_TO(conv_op, elementwise_add_out);
}
for (const auto& attr : conv_op->Op()->GetAttrMap()) {
op_desc.SetAttr(attr.first, attr.second);
}
op_desc.SetAttr("fuse_residual_connection", true);
auto fused_conv_op = graph->CreateOpNode(&op_desc);
IR_NODE_LINK_TO(conv_input, fused_conv_op);
IR_NODE_LINK_TO(conv_filter, fused_conv_op);
IR_NODE_LINK_TO(elementwise_add_identity, fused_conv_op);
IR_NODE_LINK_TO(fused_conv_op, conv_output);
if (conv_bias) {
IR_NODE_LINK_TO((*conv_bias), fused_conv_op);
}
CorrectGraphEdges(graph, elementwise_add_out, conv_output);
GraphSafeRemoveNodes(graph,
{elementwise_add_out, conv_op, elementwise_add_op});
(*fusion_stats)++; (*fusion_stats)++;
} }
...@@ -229,60 +165,33 @@ void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( ...@@ -229,60 +165,33 @@ void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()(
Node* projection_node; Node* projection_node;
Node* residual_conv_op; Node* residual_conv_op;
Node* residual_conv_input;
Node* residual_conv_filter;
Node* residual_conv_output; Node* residual_conv_output;
if (IsReachable(graph, conv_x_input, conv_y_output)) { if (IsReachable(graph, conv_x_input, conv_y_output)) {
projection_node = conv_x_output; projection_node = conv_x_output;
residual_conv_op = conv_y_op; residual_conv_op = conv_y_op;
residual_conv_input = conv_y_input;
residual_conv_filter = conv_y_filter;
residual_conv_output = conv_y_output; residual_conv_output = conv_y_output;
} else if (IsReachable(graph, conv_y_input, conv_x_output)) { } else if (IsReachable(graph, conv_y_input, conv_x_output)) {
projection_node = conv_y_output; projection_node = conv_y_output;
residual_conv_op = conv_x_op; residual_conv_op = conv_x_op;
residual_conv_input = conv_x_input;
residual_conv_filter = conv_x_filter;
residual_conv_output = conv_x_output; residual_conv_output = conv_x_output;
} else { } else {
return; return;
} }
OpDesc op_desc; auto fuse_relu = HasAttribute<bool>(*residual_conv_op, "fuse_relu");
op_desc.SetType("conv2d"); if (fuse_relu && *fuse_relu) return;
op_desc.SetInput("Input", {residual_conv_input->Name()}); residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()});
op_desc.SetInput("Filter", {residual_conv_filter->Name()}); residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
op_desc.SetInput("ResidualData", {projection_node->Name()});
op_desc.SetOutput("Output", {residual_conv_output->Name()});
auto residual_conv_bias = HasBias(*residual_conv_op, "Bias"); residual_conv_op->Op()->SetAttr("fuse_residual_connection", true);
if (residual_conv_bias) { GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op});
op_desc.SetInput("Bias", {(*residual_conv_bias)->Name()});
}
for (const auto& attr : residual_conv_op->Op()->GetAttrMap()) {
op_desc.SetAttr(attr.first, attr.second);
}
op_desc.SetAttr("fuse_residual_connection", true);
auto fused_conv_op = graph->CreateOpNode(&op_desc); IR_NODE_LINK_TO(projection_node, residual_conv_op);
IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out);
IR_NODE_LINK_TO(residual_conv_input, fused_conv_op);
IR_NODE_LINK_TO(residual_conv_filter, fused_conv_op);
IR_NODE_LINK_TO(projection_node, fused_conv_op);
IR_NODE_LINK_TO(fused_conv_op, residual_conv_output);
if (residual_conv_bias) {
IR_NODE_LINK_TO((*residual_conv_bias), fused_conv_op);
}
CorrectGraphEdges(graph, elementwise_add_out, residual_conv_output);
GraphSafeRemoveNodes(
graph, {elementwise_add_out, residual_conv_op, elementwise_add_op});
(*fusion_stats)++; (*fusion_stats)++;
} }
......
...@@ -110,22 +110,125 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -110,22 +110,125 @@ class CompileTimeInferShapeContext : public InferShapeContext {
} }
} }
std::vector<InferShapeVarPtr> GetInputVarPtrs(
const std::string &name) override {
const std::vector<std::string> arg_names = Inputs(name);
std::vector<InferShapeVarPtr> res;
res.reserve(arg_names.size());
std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res),
[this](const std::string &name) {
return block_.FindVarRecursive(name);
});
return res;
}
std::vector<InferShapeVarPtr> GetOutputVarPtrs(
const std::string &name) override {
const std::vector<std::string> arg_names = Outputs(name);
std::vector<InferShapeVarPtr> res;
res.reserve(arg_names.size());
std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res),
[this](const std::string &name) {
return block_.FindVarRecursive(name);
});
return res;
}
// Dimensions of the single variable bound to input slot `name`.
// Enforces that the slot holds exactly one argument.
DDim GetInputDim(const std::string &name) const override {
  const auto &arg_names = Inputs(name);
  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
                    "Input(%s) should hold one element, but now it holds %d",
                    name, arg_names.size());
  const std::string &sole_arg = arg_names.front();
  return GetDim(sole_arg);
}
// Dimensions of every variable bound to input slot `name`, in argument order.
std::vector<DDim> GetInputsDim(const std::string &name) const override {
  return GetDims(Inputs(name));
}
bool IsRuntime() const override; bool IsRuntime() const override;
// Variable types of all arguments bound to input slot `name`, in argument
// order.
std::vector<proto::VarType::Type> GetInputsVarType(
    const std::string &name) const override {
  const auto &input_args = Inputs(name);
  return GetVarTypes(input_args);
}
// Variable types of all arguments bound to output slot `name`, in argument
// order.
std::vector<proto::VarType::Type> GetOutputsVarType(
    const std::string &name) const override {
  const auto &output_args = Outputs(name);
  return GetVarTypes(output_args);
}
// Sets the dimensions of the single variable bound to output slot `name`.
// Enforces that the slot holds exactly one argument.
void SetOutputDim(const std::string &name, const DDim &dim) override {
  const auto &arg_names = Outputs(name);
  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
                    "Output(%s) should hold one element, but now it holds %d",
                    name, arg_names.size());
  const std::string &sole_arg = arg_names.front();
  SetDim(sole_arg, dim);
}
// Sets the dimensions of every variable bound to output slot `name`;
// dims[i] is applied to the i-th argument (see SetDims for the size check
// and the handling of empty variable names).
void SetOutputsDim(const std::string &name,
                   const std::vector<DDim> &dims) override {
  SetDims(Outputs(name), dims);
}
protected: protected:
proto::VarType::Type GetVarType(const std::string &name) const override; std::vector<proto::VarType::Type> GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<proto::VarType::Type> retv;
retv.resize(names.size());
std::transform(
names.begin(), names.end(), retv.begin(),
std::bind(std::mem_fn(&CompileTimeInferShapeContext::GetVarType), this,
std::placeholders::_1));
return retv;
}
DDim GetDim(const std::string &name) const override; proto::VarType::Type GetVarType(const std::string &name) const;
void SetDim(const std::string &name, const DDim &dim) override; DDim GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
DDim res;
try {
auto shape = var->GetShape();
res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
return res;
}
std::vector<DDim> GetDims(const std::vector<std::string> &names) const {
std::vector<DDim> ret;
ret.reserve(names.size());
std::transform(
names.begin(), names.end(), std::back_inserter(ret),
[this](const std::string &name) { return this->GetDim(name); });
return ret;
}
void SetDim(const std::string &name, const DDim &dim);
// Applies dims[i] to the variable names[i] for every index.
// Enforces that both vectors have the same length; entries whose name equals
// framework::kEmptyVarName are left untouched.
void SetDims(const std::vector<std::string> &names,
             const std::vector<DDim> &dims) {
  size_t length = names.size();
  PADDLE_ENFORCE_EQ(length, dims.size());
  for (size_t idx = 0; idx != length; ++idx) {
    const std::string &var_name = names[idx];
    if (var_name != framework::kEmptyVarName) {
      SetDim(var_name, dims[idx]);
    }
  }
}
std::vector<DDim> GetRepeatedDims(const std::string &name) const override; std::vector<DDim> GetRepeatedDims(const std::string &name) const override;
void SetRepeatedDims(const std::string &name, void SetRepeatedDims(const std::string &name,
const std::vector<DDim> &dims) override; const std::vector<DDim> &dims) override;
InferShapeVarPtr GetVarPtr(const std::string &name) override;
const OpDesc &op_; const OpDesc &op_;
const BlockDesc &block_; const BlockDesc &block_;
}; };
...@@ -644,20 +747,6 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs( ...@@ -644,20 +747,6 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
return op_.Output(name); return op_.Output(name);
} }
// Returns the dimensions recorded for variable `name`, searching with
// block_.FindVarRecursive(). Enforces that the variable exists. A variable
// whose shape is empty is reported as the one-element dim {0}. Any exception
// raised while reading or converting the shape is logged at VLOG(5) and
// rethrown.
DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
  auto var = block_.FindVarRecursive(name);
  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
  try {
    auto shape = var->GetShape();
    if (shape.empty()) {
      return make_ddim({0UL});
    }
    return make_ddim(shape);
  } catch (...) {
    VLOG(5) << "GetDim of variable " << name << " error";
    std::rethrow_exception(std::current_exception());
  }
}
std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims( std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
const std::string &name) const { const std::string &name) const {
auto var = block_.FindVarRecursive(name); auto var = block_.FindVarRecursive(name);
...@@ -696,10 +785,5 @@ proto::VarType::Type CompileTimeInferShapeContext::GetVarType( ...@@ -696,10 +785,5 @@ proto::VarType::Type CompileTimeInferShapeContext::GetVarType(
return block_.FindVarRecursive(name)->GetType(); return block_.FindVarRecursive(name)->GetType();
} }
// Wraps the result of block_.FindVarRecursive(name) as an InferShapeVarPtr.
InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr(
    const std::string &name) {
  InferShapeVarPtr found = block_.FindVarRecursive(name);
  return found;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -142,12 +142,14 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, ...@@ -142,12 +142,14 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames,
const Scope& scope) { const Scope& scope) {
for (auto& var_name_item : innames) { for (auto& var_name_item : innames) {
std::vector<Variable*>& input_vars = inputs[var_name_item.first]; std::vector<Variable*>& input_vars = inputs[var_name_item.first];
input_vars.reserve(var_name_item.second.size());
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
input_vars.push_back(scope.FindVar(var_name)); input_vars.push_back(scope.FindVar(var_name));
} }
} }
for (auto& var_name_item : outnames) { for (auto& var_name_item : outnames) {
std::vector<Variable*>& output_vars = outputs[var_name_item.first]; std::vector<Variable*>& output_vars = outputs[var_name_item.first];
output_vars.reserve(var_name_item.second.size());
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
output_vars.push_back(scope.FindVar(var_name)); output_vars.push_back(scope.FindVar(var_name));
} }
...@@ -556,30 +558,28 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -556,30 +558,28 @@ class RuntimeInferShapeContext : public InferShapeContext {
bool HasOutput(const std::string& name) const override { bool HasOutput(const std::string& name) const override {
// has only one output // has only one output
const auto& outs = op_.Outputs(); const auto& outs = ctx_.outputs;
auto it = outs.find(name); auto it = outs.find(name);
if (it == outs.end()) { if (it == outs.end()) {
return false; return false;
} }
const auto& out = it->second; const auto& out = it->second;
if (out.size() == 0 || out[0] == kEmptyVarName) { if (out.size() == 0) {
return false; return false;
} }
PADDLE_ENFORCE_EQ(out.size(), 1UL, PADDLE_ENFORCE_EQ(out.size(), 1UL,
"Output %s should not have more than one outputs", name); "Output %s should not have more than one outputs", name);
return scope_.FindVar(out[0]) != nullptr; return out[0] != nullptr;
} }
bool HasInputs(const std::string& name) const override { bool HasInputs(const std::string& name) const override {
if (!op_.HasInputs(name)) { const auto& ins = ctx_.inputs;
return false; auto it = ins.find(name);
} if (it == ins.end() || it->second.empty()) {
auto inputs = op_.Inputs(name);
if (inputs.empty()) {
return false; return false;
} }
for (auto& input : inputs) { for (auto& input : it->second) {
if (scope_.FindVar(input) == nullptr) { if (input == nullptr) {
return false; return false;
} }
} }
...@@ -587,15 +587,13 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -587,15 +587,13 @@ class RuntimeInferShapeContext : public InferShapeContext {
} }
bool HasOutputs(const std::string& name) const override { bool HasOutputs(const std::string& name) const override {
if (!op_.HasOutputs(name)) { const auto& outs = ctx_.outputs;
return false; auto it = outs.find(name);
} if (it == outs.end() || it->second.empty()) {
auto outputs = op_.Outputs(name);
if (outputs.empty()) {
return false; return false;
} }
for (auto& output : outputs) { for (auto& output : it->second) {
if (scope_.FindVar(output) == nullptr) { if (output == nullptr) {
return false; return false;
} }
} }
...@@ -616,16 +614,18 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -616,16 +614,18 @@ class RuntimeInferShapeContext : public InferShapeContext {
void ShareDim(const std::string& in, const std::string& out, size_t i = 0, void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) override { size_t j = 0) override {
PADDLE_ENFORCE_LT(i, Inputs(in).size()); auto in_it = ctx_.inputs.find(in);
PADDLE_ENFORCE_LT(j, Outputs(out).size()); auto out_it = ctx_.outputs.find(out);
const std::string& input_n = Inputs(in)[i]; PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
const std::string& output_n = Outputs(out)[j]; "Inputs %s should have %llu argument", in, i);
PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
"Outputs %s should have %llu argument", out, j);
Variable* in_var = in_it->second[i];
Variable* out_var = out_it->second[j];
Variable* in_var = scope_.FindVar(input_n);
Variable* out_var = scope_.FindVar(output_n);
PADDLE_ENFORCE(in_var->Type() == out_var->Type(), PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
"The type of %s and %s is not the same.", output_n, "The type of %s and %s is not the same.", in, out);
GetDim(input_n));
if (in_var->IsType<framework::SelectedRows>()) { if (in_var->IsType<framework::SelectedRows>()) {
auto& in_sele_rows = in_var->Get<framework::SelectedRows>(); auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
...@@ -646,13 +646,16 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -646,13 +646,16 @@ class RuntimeInferShapeContext : public InferShapeContext {
void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) const override { size_t j = 0) const override {
const std::vector<std::string>& inputs = Inputs(in); auto in_it = ctx_.inputs.find(in);
const std::vector<std::string>& outputs = Outputs(out); auto out_it = ctx_.outputs.find(out);
PADDLE_ENFORCE_LT(i, inputs.size()); PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
PADDLE_ENFORCE_LT(j, outputs.size()); "Inputs %s should have %llu argument", in, i);
Variable* in_var = scope_.FindVar(inputs.at(i)); PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
"Outputs %s should have %llu argument", out, j);
Variable* in_var = in_it->second.at(i);
if (!in_var->IsType<LoDTensor>()) return; if (!in_var->IsType<LoDTensor>()) return;
Variable* out_var = scope_.FindVar(outputs.at(j)); Variable* out_var = out_it->second.at(j);
PADDLE_ENFORCE(out_var->IsType<LoDTensor>(), PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
"The %d-th output of Output(%s) must be LoDTensor.", j, out); "The %d-th output of Output(%s) must be LoDTensor.", j, out);
auto in_tensor = in_var->Get<LoDTensor>(); auto in_tensor = in_var->Get<LoDTensor>();
...@@ -687,9 +690,64 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -687,9 +690,64 @@ class RuntimeInferShapeContext : public InferShapeContext {
bool IsRuntime() const override { return true; } bool IsRuntime() const override { return true; }
// TODO(paddle-dev): Can this be template?
std::vector<InferShapeVarPtr> GetInputVarPtrs(
const std::string& name) override {
const std::vector<Variable*>& vars = InputVars(name);
std::vector<InferShapeVarPtr> res;
res.reserve(vars.size());
res.insert(res.begin(), vars.begin(), vars.end());
return res;
}
std::vector<InferShapeVarPtr> GetOutputVarPtrs(
const std::string& name) override {
const std::vector<Variable*>& vars = OutputVars(name);
std::vector<InferShapeVarPtr> res;
res.reserve(vars.size());
res.insert(res.begin(), vars.begin(), vars.end());
return res;
}
// Returns the dimensions of the single variable bound to input slot `name`.
// The enforce check fails when the slot does not hold exactly one variable.
DDim GetInputDim(const std::string& name) const override {
  const auto& vars = InputVars(name);
  // Operands of the enforce macro are kept verbatim on purpose.
  PADDLE_ENFORCE_EQ(vars.size(), 1UL,
                    "Input(%s) should hold one element, but now it holds %d",
                    name, vars.size());
  return GetDim(vars.front());
}
// Returns the dimensions of every variable bound to input slot `name`.
std::vector<DDim> GetInputsDim(const std::string& name) const override {
  return GetDims(InputVars(name));
}
std::vector<proto::VarType::Type> GetInputsVarType(
const std::string& name) const override {
return GetVarTypes(InputVars(name));
}
std::vector<proto::VarType::Type> GetOutputsVarType(
const std::string& name) const override {
return GetVarTypes(OutputVars(name));
}
void SetOutputDim(const std::string& name, const DDim& dim) override {
auto& vars = OutputVars(name);
PADDLE_ENFORCE_EQ(vars.size(), 1UL,
"Output(%s) should hold one element, but now it holds %d",
name, vars.size());
SetDim(vars[0], dim);
}
// Applies `dims` element-wise to the variables bound to output slot `name`
// by forwarding both to SetDims().
void SetOutputsDim(const std::string& name,
                   const std::vector<DDim>& dims) override {
  SetDims(OutputVars(name), dims);
}
protected: protected:
DDim GetDim(const std::string& name) const override { DDim GetDim(Variable* var) const {
Variable* var = scope_.FindVar(name);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(var);
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().dims(); return var->Get<LoDTensor>().dims();
...@@ -697,25 +755,44 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -697,25 +755,44 @@ class RuntimeInferShapeContext : public InferShapeContext {
return var->Get<SelectedRows>().GetCompleteDims(); return var->Get<SelectedRows>().GetCompleteDims();
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " "Only LoDTensor/SelectedRows support 'GetDim', but Variables "
"type_id is %s.", "type_id is %s.",
name, ToTypeName(var->Type())); ToTypeName(var->Type()));
} }
} }
// Returns GetDim(var) for each entry of `vars`, in order.
std::vector<DDim> GetDims(const std::vector<Variable*>& vars) const {
  std::vector<DDim> dims;
  dims.reserve(vars.size());
  for (Variable* v : vars) {
    dims.push_back(GetDim(v));
  }
  return dims;
}
std::vector<DDim> GetRepeatedDims(const std::string& name) const override { std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
PADDLE_THROW("Only compile time support this method"); PADDLE_THROW("Only compile time support this method");
} }
void SetDim(const std::string& name, const DDim& dim) override { void SetDim(Variable* var, const DDim& dim) {
Variable* var = scope_.FindVar(name);
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
var->GetMutable<LoDTensor>()->Resize(dim); var->GetMutable<LoDTensor>()->Resize(dim);
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
var->GetMutable<SelectedRows>()->set_height(dim[0]); var->GetMutable<SelectedRows>()->set_height(dim[0]);
} else { } else {
PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
name, ToTypeName(var->Type())); ToTypeName(var->Type()));
}
}
void SetDims(const std::vector<Variable*>& vars,
const std::vector<DDim>& dims) {
size_t length = vars.size();
PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) {
if (vars[i] == nullptr) {
continue;
}
SetDim(vars[i], dims[i]);
} }
} }
...@@ -724,16 +801,36 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -724,16 +801,36 @@ class RuntimeInferShapeContext : public InferShapeContext {
PADDLE_THROW("Only compile time support this method"); PADDLE_THROW("Only compile time support this method");
} }
proto::VarType::Type GetVarType(const std::string& name) const override { std::vector<proto::VarType::Type> GetVarTypes(
auto* var = scope_.FindVar(name); const std::vector<Variable*>& vars) const {
return ToVarType(var->Type()); std::vector<proto::VarType::Type> retv;
retv.resize(vars.size());
std::transform(vars.begin(), vars.end(), retv.begin(),
std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType),
this, std::placeholders::_1));
return retv;
} }
InferShapeVarPtr GetVarPtr(const std::string& name) override { proto::VarType::Type GetVarType(Variable* var) const {
return scope_.FindVar(name); return ToVarType(var->Type());
} }
private: private:
// Looks up the runtime variables bound to input slot `name` in ctx_.inputs.
// Fails the enforce check (reporting op_.Type() and the slot name) when the
// slot is absent; otherwise returns a reference to the stored vector.
const std::vector<Variable*>& InputVars(const std::string& name) const {
auto it = ctx_.inputs.find(name);
PADDLE_ENFORCE(it != ctx_.inputs.end(),
"Operator %s does not have the input %s.", op_.Type(), name);
return it->second;
}
// Looks up the runtime variables bound to output slot `name` in ctx_.outputs.
// Fails the enforce check (reporting op_.Type() and the slot name) when the
// slot is absent; otherwise returns a reference to the stored vector.
const std::vector<Variable*>& OutputVars(const std::string& name) const {
auto it = ctx_.outputs.find(name);
PADDLE_ENFORCE(it != ctx_.outputs.end(),
"Operator %s does not have the outputs %s.", op_.Type(),
name);
return it->second;
}
const OperatorBase& op_; const OperatorBase& op_;
const Scope& scope_; const Scope& scope_;
const RuntimeContext& ctx_; const RuntimeContext& ctx_;
...@@ -864,8 +961,7 @@ Scope* OperatorWithKernel::PrepareData( ...@@ -864,8 +961,7 @@ Scope* OperatorWithKernel::PrepareData(
for (size_t i = 0; i < var_name_item.second.size(); ++i) { for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto& var_name = var_name_item.second[i]; auto& var_name = var_name_item.second[i];
auto* var = scope.FindVar(var_name); auto* var = input_vars[i];
input_vars[i] = var;
// Only tensor can be transferred to another device. // Only tensor can be transferred to another device.
if (var == nullptr || !VarIsTensor(*var)) { if (var == nullptr || !VarIsTensor(*var)) {
......
...@@ -190,7 +190,6 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() { ...@@ -190,7 +190,6 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
ParallelExecutor::ParallelExecutor( ParallelExecutor::ParallelExecutor(
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params,
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, Scope *scope, const std::vector<Scope *> &local_scopes,
...@@ -209,7 +208,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -209,7 +208,7 @@ ParallelExecutor::ParallelExecutor(
"the number of places must be greater than 1."); "the number of places must be greater than 1.");
} }
// Step 1. Bcast the params to devs. // Step 1. Bcast the bcast_vars to devs.
// Create local scopes // Create local scopes
if (local_scopes.empty()) { if (local_scopes.empty()) {
member_->own_local_scope_ = true; member_->own_local_scope_ = true;
...@@ -249,12 +248,12 @@ ParallelExecutor::ParallelExecutor( ...@@ -249,12 +248,12 @@ ParallelExecutor::ParallelExecutor(
// ncclOp // ncclOp
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<ir::Graph> graph = build_strategy.Apply( std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params, main_program, member_->places_, loss_var_name, member_->local_scopes_,
member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); member_->use_cuda_, member_->nccl_ctxs_.get());
#else #else
std::unique_ptr<ir::Graph> graph = std::unique_ptr<ir::Graph> graph =
build_strategy.Apply(main_program, member_->places_, loss_var_name, build_strategy.Apply(main_program, member_->places_, loss_var_name,
params, member_->local_scopes_, member_->use_cuda_); member_->local_scopes_, member_->use_cuda_);
#endif #endif
auto max_memory_size = GetEagerDeletionThreshold(); auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) { if (max_memory_size >= 0) {
......
...@@ -41,7 +41,6 @@ class ParallelExecutor { ...@@ -41,7 +41,6 @@ class ParallelExecutor {
public: public:
explicit ParallelExecutor(const std::vector<platform::Place> &places, explicit ParallelExecutor(const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params,
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const ProgramDesc &main_program,
const std::string &loss_var_name, Scope *scope, const std::string &loss_var_name, Scope *scope,
......
...@@ -22,20 +22,6 @@ limitations under the License. */ ...@@ -22,20 +22,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Returns the dimensions of the single argument bound to input slot `name`.
// The enforce check fails when the slot does not hold exactly one argument.
DDim InferShapeContext::GetInputDim(const std::string &name) const {
  const auto &arg_names = Inputs(name);
  // Operands of the enforce macro are kept verbatim on purpose.
  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
                    "Input(%s) should hold one element, but now it holds %d",
                    name, arg_names.size());
  return GetDim(arg_names.front());
}
// Returns the dimensions of every argument bound to input slot `name`.
std::vector<DDim> InferShapeContext::GetInputsDim(
    const std::string &name) const {
  return GetDims(Inputs(name));
}
std::vector<DDim> InferShapeContext::GetReaderDims( std::vector<DDim> InferShapeContext::GetReaderDims(
const std::string &name) const { const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name); const std::vector<std::string> &arg_names = Inputs(name);
...@@ -46,26 +32,6 @@ std::vector<DDim> InferShapeContext::GetReaderDims( ...@@ -46,26 +32,6 @@ std::vector<DDim> InferShapeContext::GetReaderDims(
return this->GetRepeatedDims(arg_names[0]); return this->GetRepeatedDims(arg_names[0]);
} }
// Returns the dimensions of the `idx`-th argument of input slot `name`.
// No bounds check is performed on `idx`.
DDim InferShapeContext::GetInputsElementDim(const std::string &name,
                                            int idx) const {
  const std::vector<std::string> &arg_names = Inputs(name);
  const std::string &selected = arg_names[idx];
  return GetDim(selected);
}
// Applies `dim` to the single argument bound to output slot `name`.
// The enforce check fails when the slot does not hold exactly one argument.
void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
  const std::vector<std::string> &arg_names = Outputs(name);
  // Operands of the enforce macro are kept verbatim on purpose.
  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
                    "Output(%s) should hold one element, but now it holds %d",
                    name, arg_names.size());
  SetDim(arg_names.front(), dim);
}
// Applies `dims` element-wise to the arguments bound to output slot `name`
// by forwarding both to SetDims().
void InferShapeContext::SetOutputsDim(const std::string &name,
                                      const std::vector<DDim> &dims) {
  SetDims(Outputs(name), dims);
}
void InferShapeContext::SetReaderDims(const std::string &name, void InferShapeContext::SetReaderDims(const std::string &name,
const std::vector<DDim> &dims) { const std::vector<DDim> &dims) {
const std::vector<std::string> &arg_names = Outputs(name); const std::vector<std::string> &arg_names = Outputs(name);
...@@ -76,69 +42,5 @@ void InferShapeContext::SetReaderDims(const std::string &name, ...@@ -76,69 +42,5 @@ void InferShapeContext::SetReaderDims(const std::string &name,
return this->SetRepeatedDims(arg_names[0], dims); return this->SetRepeatedDims(arg_names[0], dims);
} }
std::vector<InferShapeVarPtr> InferShapeContext::GetInputVarPtrs(
const std::string &name) {
const std::vector<std::string> arg_names = Inputs(name);
std::vector<InferShapeVarPtr> res;
res.reserve(arg_names.size());
std::transform(
arg_names.begin(), arg_names.end(), std::back_inserter(res),
[this](const std::string &name) { return this->GetVarPtr(name); });
return res;
}
std::vector<InferShapeVarPtr> InferShapeContext::GetOutputVarPtrs(
const std::string &name) {
const std::vector<std::string> arg_names = Outputs(name);
std::vector<InferShapeVarPtr> res;
res.reserve(arg_names.size());
std::transform(
arg_names.begin(), arg_names.end(), std::back_inserter(res),
[this](const std::string &name) { return this->GetVarPtr(name); });
return res;
}
std::vector<DDim> InferShapeContext::GetDims(
const std::vector<std::string> &names) const {
std::vector<DDim> ret;
ret.reserve(names.size());
std::transform(
names.begin(), names.end(), std::back_inserter(ret),
[this](const std::string &name) { return this->GetDim(name); });
return ret;
}
// Applies dims[i] to names[i] for every index, skipping entries whose name
// equals framework::kEmptyVarName. Enforces that both vectors have the same
// length.
void InferShapeContext::SetDims(const std::vector<std::string> &names,
                                const std::vector<DDim> &dims) {
  size_t length = names.size();
  // Operands of the enforce macro are kept verbatim on purpose.
  PADDLE_ENFORCE_EQ(length, dims.size());
  for (size_t pos = 0; pos != length; ++pos) {
    const std::string &var_name = names[pos];
    if (var_name != framework::kEmptyVarName) {
      SetDim(var_name, dims[pos]);
    }
  }
}
// Returns the proto variable type of every argument bound to input slot
// `name`.
std::vector<proto::VarType::Type> InferShapeContext::GetInputsVarType(
    const std::string &name) const {
  const std::vector<std::string> &arg_names = Inputs(name);
  return GetVarTypes(arg_names);
}
// Returns the proto variable type of every argument bound to output slot
// `name`.
std::vector<proto::VarType::Type> InferShapeContext::GetOutputsVarType(
    const std::string &name) const {
  const std::vector<std::string> &arg_names = Outputs(name);
  return GetVarTypes(arg_names);
}
std::vector<proto::VarType::Type> InferShapeContext::GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<proto::VarType::Type> retv;
retv.resize(names.size());
std::transform(names.begin(), names.end(), retv.begin(),
std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
std::placeholders::_1));
return retv;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -33,22 +33,23 @@ class InferShapeContext { ...@@ -33,22 +33,23 @@ class InferShapeContext {
virtual bool HasInput(const std::string &name) const = 0; virtual bool HasInput(const std::string &name) const = 0;
virtual bool HasOutput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0;
std::vector<proto::VarType::Type> GetInputsVarType( virtual std::vector<proto::VarType::Type> GetInputsVarType(
const std::string &name) const; const std::string &name) const = 0;
std::vector<proto::VarType::Type> GetOutputsVarType( virtual std::vector<proto::VarType::Type> GetOutputsVarType(
const std::string &name) const; const std::string &name) const = 0;
virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0;
DDim GetInputDim(const std::string &name) const; virtual DDim GetInputDim(const std::string &name) const = 0;
std::vector<DDim> GetInputsDim(const std::string &name) const; virtual std::vector<DDim> GetInputsDim(const std::string &name) const = 0;
std::vector<DDim> GetReaderDims(const std::string &name) const; virtual std::vector<DDim> GetReaderDims(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const;
void SetOutputDim(const std::string &name, const DDim &dim); virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims); virtual void SetOutputsDim(const std::string &name,
void SetReaderDims(const std::string &name, const std::vector<DDim> &dims); const std::vector<DDim> &dims) = 0;
virtual void SetReaderDims(const std::string &name,
const std::vector<DDim> &dims);
virtual AttrReader Attrs() const = 0; virtual AttrReader Attrs() const = 0;
virtual const std::vector<std::string> &Inputs( virtual const std::vector<std::string> &Inputs(
...@@ -67,27 +68,15 @@ class InferShapeContext { ...@@ -67,27 +68,15 @@ class InferShapeContext {
virtual bool IsRuntime() const = 0; virtual bool IsRuntime() const = 0;
std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name); virtual std::vector<InferShapeVarPtr> GetInputVarPtrs(
std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name); const std::string &name) = 0;
virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0; virtual std::vector<InferShapeVarPtr> GetOutputVarPtrs(
const std::string &name) = 0;
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
const std::vector<DDim> &dims);
protected: protected:
virtual DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const DDim &dim) = 0;
virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0; virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0;
virtual void SetRepeatedDims(const std::string &name, virtual void SetRepeatedDims(const std::string &name,
const std::vector<DDim> &dims) = 0; const std::vector<DDim> &dims) = 0;
std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
std::vector<proto::VarType::Type> GetVarTypes(
const std::vector<std::string> &names) const;
virtual proto::VarType::Type GetVarType(const std::string &name) const = 0;
}; };
} // namespace framework } // namespace framework
......
...@@ -188,11 +188,13 @@ std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) { ...@@ -188,11 +188,13 @@ std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
std::vector<Variable*> ret; std::vector<Variable*> ret;
for (size_t i = 0; i < input_vars_->size(); ++i) { for (size_t i = 0; i < input_vars_->size(); ++i) {
bool found = false; bool found = false;
VarBase* origin_var = (*input_vars_)[i];
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
Variable* var = scope->FindVar(outvar); Variable* var = scope->FindVar(outvar);
VarBase* origin_var = (*input_vars_)[i];
std::string orig_var = grad_to_var_->at(outvar); std::string orig_var = grad_to_var_->at(outvar);
PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); if (origin_var->var_desc_->Name() != orig_var) {
continue;
}
VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
origin_var->ApplyGrad(scope, var); origin_var->ApplyGrad(scope, var);
found = true; found = true;
......
...@@ -43,9 +43,12 @@ void CreateGradOp(const framework::OpDesc& op_desc, ...@@ -43,9 +43,12 @@ void CreateGradOp(const framework::OpDesc& op_desc,
class Tracer { class Tracer {
public: public:
explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { explicit Tracer(framework::BlockDesc* root_block,
framework::BlockDesc* startup_block)
: root_block_(root_block), startup_block_(startup_block) {
root_scope_ = new framework::Scope(); root_scope_ = new framework::Scope();
scopes_[root_block_] = root_scope_; scopes_[root_block_] = root_scope_;
scopes_[startup_block_] = root_scope_;
} }
virtual ~Tracer() { delete root_scope_; } virtual ~Tracer() { delete root_scope_; }
...@@ -80,6 +83,8 @@ class Tracer { ...@@ -80,6 +83,8 @@ class Tracer {
} else { } else {
op->pre_ops_->push_back(nullptr); op->pre_ops_->push_back(nullptr);
} }
VLOG(3) << "input vname " << vname << " "
<< var->Get<framework::LoDTensor>().dims().size();
} }
*op->output_vars_ = outputs; *op->output_vars_ = outputs;
...@@ -98,12 +103,19 @@ class Tracer { ...@@ -98,12 +103,19 @@ class Tracer {
outputs[i]->pre_op_ = op; outputs[i]->pre_op_ = op;
outputs[i]->pre_op_out_idx_ = i; outputs[i]->pre_op_out_idx_ = i;
} }
VLOG(3) << "tracer running " << op_desc->Type();
op_base->Run(*scope, platform::CPUPlace()); op_base->Run(*scope, platform::CPUPlace());
if (block == startup_block_) {
op->grad_op_desc_ = nullptr;
op->grad_to_var_ = nullptr;
} else {
framework::OpDesc* grad_op_desc; framework::OpDesc* grad_op_desc;
auto grad_to_var = new std::unordered_map<std::string, std::string>(); auto grad_to_var = new std::unordered_map<std::string, std::string>();
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
op->grad_op_desc_ = grad_op_desc; op->grad_op_desc_ = grad_op_desc;
op->grad_to_var_ = grad_to_var; op->grad_to_var_ = grad_to_var;
}
op->block_ = block; op->block_ = block;
} }
...@@ -121,6 +133,7 @@ class Tracer { ...@@ -121,6 +133,7 @@ class Tracer {
private: private:
std::map<framework::BlockDesc*, framework::Scope*> scopes_; std::map<framework::BlockDesc*, framework::Scope*> scopes_;
framework::BlockDesc* root_block_; framework::BlockDesc* root_block_;
framework::BlockDesc* startup_block_;
framework::Scope* root_scope_; framework::Scope* root_scope_;
}; };
......
...@@ -398,26 +398,41 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { ...@@ -398,26 +398,41 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
ctx->HasInputs(kOutputs); ctx->HasInputs(kOutputs);
ctx->HasInputs(framework::GradVarName(kOutputs)); ctx->HasInputs(framework::GradVarName(kOutputs));
auto p_names = ctx->Inputs(kX);
auto pg_ig_names = ctx->Outputs(kXGRAD); auto pg_ig_names = ctx->Outputs(kXGRAD);
auto var_types = ctx->GetInputsVarType(kX); std::vector<framework::InferShapeVarPtr> in_var_ptrs =
std::vector<std::string> names_to_set; ctx->GetInputVarPtrs(kX);
std::vector<framework::DDim> dims_to_set; std::vector<framework::InferShapeVarPtr> out_var_ptrs =
for (size_t i = 0; i < p_names.size(); ++i) { ctx->GetOutputVarPtrs(kXGRAD);
PADDLE_ENFORCE(in_var_ptrs.size() == out_var_ptrs.size());
for (size_t i = 0; i < in_var_ptrs.size(); ++i) {
if (pg_ig_names[i] == framework::kEmptyVarName) { if (pg_ig_names[i] == framework::kEmptyVarName) {
continue; continue;
} }
auto dims = ctx->GetInputsElementDim(kX, i); if (ctx->IsRuntime()) {
if (var_types[i] == framework::proto::VarType::LOD_TENSOR) { framework::Variable *in_var =
names_to_set.push_back(pg_ig_names[i]); boost::get<framework::Variable *>(in_var_ptrs[i]);
dims_to_set.push_back(dims); framework::Variable *out_var =
} else if (var_types[i] == framework::proto::VarType::LOD_TENSOR_ARRAY) { boost::get<framework::Variable *>(out_var_ptrs[i]);
// not sure how to set the dim of LOD_TENSOR_ARRAY
names_to_set.push_back(pg_ig_names[i]); auto type = framework::ToVarType(in_var->Type());
dims_to_set.push_back(dims); if (type == framework::proto::VarType::LOD_TENSOR) {
out_var->GetMutable<LoDTensor>()->Resize(
in_var->Get<framework::LoDTensor>().dims());
} else if (type == framework::proto::VarType::SELECTED_ROWS) {
out_var->GetMutable<framework::SelectedRows>()->set_height(
in_var->Get<framework::SelectedRows>().GetCompleteDims()[0]);
} else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
PADDLE_THROW("WhileGradOp doesn't support type %d",
static_cast<int>(type));
}
} else {
framework::VarDesc *in_var =
boost::get<framework::VarDesc *>(in_var_ptrs[i]);
boost::get<framework::VarDesc *>(out_var_ptrs[i])
->SetShape(in_var->GetShape());
} }
} }
ctx->SetDims(names_to_set, dims_to_set);
} }
}; };
......
...@@ -155,11 +155,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -155,11 +155,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format = auto chosen_memory_format =
platform::data_format_to_memory_format(data_format); platform::data_format_to_memory_format(data_format);
weights_format = mkldnn::memory::format::any;
// Check the format for user's special output
if (chosen_memory_format != mkldnn::memory::format::any) {
if (is_conv3d) { if (is_conv3d) {
chosen_memory_format = chosen_memory_format =
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
} }
weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d); }
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
...@@ -435,11 +438,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -435,11 +438,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format = auto chosen_memory_format =
platform::data_format_to_memory_format(data_format); platform::data_format_to_memory_format(data_format);
weights_format = mkldnn::memory::format::any;
// Check the format for user's special output
if (chosen_memory_format != mkldnn::memory::format::any) {
if (is_conv3d) { if (is_conv3d) {
chosen_memory_format = chosen_memory_format =
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
} }
weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d); }
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdlib.h>
#include <limits> #include <limits>
#include "glog/logging.h" // For VLOG #include "glog/logging.h" // For VLOG
...@@ -420,7 +421,15 @@ void GRPCClient::Proceed() { ...@@ -420,7 +421,15 @@ void GRPCClient::Proceed() {
sync_cond_.notify_all(); sync_cond_.notify_all();
} }
} }
VLOG(3) << "GRPCClient Proceed end";
// Last log message
// Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a
// static Mutex log_mutex is used for synchronization, which might have been
// destructed at this moment.
if (FLAGS_v >= 3) {
std::string msg("GRPCClient Proceed end");
fwrite(msg.c_str(), msg.length(), 1, stdout);
}
} }
std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) { std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
......
...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_div, elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_div_grad, elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
......
...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_mul, elementwise_mul, ops::ElementwiseMulKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseMulKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseMulKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseMulKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_mul_grad, elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int64_t>,
int64_t>); ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, plat::float16>);
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fill_zeros_like_op.h" #include "paddle/fluid/operators/fill_zeros_like_op.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
...@@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>, ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>, ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>); ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <thrust/reduce.h> #include <thrust/reduce.h>
#include "paddle/fluid/operators/metrics/accuracy_op.h" #include "paddle/fluid/operators/metrics/accuracy_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
namespace paddle { namespace paddle {
...@@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> { ...@@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
// FIXME(typhoonzero): types of T is for inference data. // FIXME(typhoonzero): types of T is for inference data.
// label data is always int64 // label data is always int64
REGISTER_OP_CUDA_KERNEL(accuracy, REGISTER_OP_CUDA_KERNEL(
paddle::operators::AccuracyOpCUDAKernel<float>, accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>); paddle::operators::AccuracyOpCUDAKernel<double>,
paddle::operators::AccuracyOpCUDAKernel<paddle::platform::float16>);
...@@ -49,7 +49,8 @@ class MulOp : public framework::OperatorWithKernel { ...@@ -49,7 +49,8 @@ class MulOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
y_dims.size(), y_num_col_dims, y_dims.size(), y_num_col_dims,
"The input tensor Y's rank of MulOp should be larger than " "The input tensor Y's rank of MulOp should be larger than "
"y_num_col_dims."); "y_num_col_dims: %ld vs %ld",
y_dims.size(), y_num_col_dims);
auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
......
...@@ -14,8 +14,11 @@ limitations under the License. */ ...@@ -14,8 +14,11 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>, momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>); ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
...@@ -237,7 +237,8 @@ class SparseMomentumFunctor<T, UseNesterov> { ...@@ -237,7 +237,8 @@ class SparseMomentumFunctor<T, UseNesterov> {
inline HOSTDEVICE void operator()(size_t i) { inline HOSTDEVICE void operator()(size_t i) {
auto row_idx = auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_); math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
: static_cast<T>(0);
// put memory access in register // put memory access in register
const T p = p_[i]; const T p = p_[i];
const T lr = lr_[0]; const T lr = lr_[0];
...@@ -282,7 +283,8 @@ class SparseMomentumFunctor<T, NoNesterov> { ...@@ -282,7 +283,8 @@ class SparseMomentumFunctor<T, NoNesterov> {
inline HOSTDEVICE void operator()(size_t i) { inline HOSTDEVICE void operator()(size_t i) {
auto row_idx = auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_); math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
: static_cast<T>(0);
// put memory access in register // put memory access in register
const T p = p_[i]; const T p = p_[i];
const T lr = lr_[0]; const T lr = lr_[0];
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam, ...@@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - (*beam)) { if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam]; topk[k] = topk[k + *beam];
} else { } else {
topk[k].set(-INFINITY, -1); topk[k].set(-static_cast<T>(INFINITY), -1);
} }
} }
if (!(*is_empty)) { if (!(*is_empty)) {
...@@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam, ...@@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
} }
*max = topk[MaxLength - 1]; *max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true; if ((*max).v == -static_cast<T>(1)) *is_empty = true;
*beam = 0; *beam = 0;
} }
} }
...@@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam, ...@@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - *beam) { if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam]; topk[k] = topk[k + *beam];
} else { } else {
topk[k].set(-INFINITY, -1); topk[k].set(-static_cast<T>(INFINITY), -1);
} }
} }
if (!(*is_empty)) { if (!(*is_empty)) {
...@@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, ...@@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool firststep = true; bool firststep = true;
for (int j = 0; j < MaxLength; j++) { for (int j = 0; j < MaxLength; j++) {
topk[j].set(-INFINITY, -1); topk[j].set(-static_cast<T>(INFINITY), -1);
} }
while (top_num) { while (top_num) {
ThreadGetTopK<T, MaxLength, BlockSize>( ThreadGetTopK<T, MaxLength, BlockSize>(
...@@ -362,5 +363,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> { ...@@ -362,5 +363,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>, REGISTER_OP_CUDA_KERNEL(
paddle::operators::TopkOpCUDAKernel<double>); top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#define NCCL_ID_VARNAME "NCCLID" #define NCCL_ID_VARNAME "NCCLID"
...@@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { ...@@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
return ncclInt; return ncclInt;
} else if (type == framework::proto::VarType::INT64) { } else if (type == framework::proto::VarType::INT64) {
return ncclInt64; return ncclInt64;
} else if (type == framework::proto::VarType::FP16) {
return ncclFloat16;
} else { } else {
PADDLE_THROW("Not supported"); PADDLE_THROW("Not supported");
} }
......
...@@ -24,8 +24,9 @@ namespace pybind { ...@@ -24,8 +24,9 @@ namespace pybind {
void BindTracer(pybind11::module *m) { void BindTracer(pybind11::module *m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "") pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__", .def("__init__",
[](imperative::Tracer &self, framework::BlockDesc *root_block) { [](imperative::Tracer &self, framework::BlockDesc *root_block,
new (&self) imperative::Tracer(root_block); framework::BlockDesc *startup_block) {
new (&self) imperative::Tracer(root_block, startup_block);
}) })
.def("trace", &imperative::Tracer::Trace) .def("trace", &imperative::Tracer::Trace)
.def("get_scope", &imperative::Tracer::GetScope, .def("get_scope", &imperative::Tracer::GetScope,
......
...@@ -977,7 +977,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -977,7 +977,6 @@ All parameter, weight, gradient are variables in Paddle.
cannot be updated after being finalized.)DOC"); cannot be updated after being finalized.)DOC");
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &,
const std::unordered_set<std::string> &, const ProgramDesc &, const std::unordered_set<std::string> &, const ProgramDesc &,
const std::string &, Scope *, std::vector<Scope *> &, const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t, const ExecutionStrategy &, const BuildStrategy &, size_t,
......
...@@ -489,8 +489,11 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, ...@@ -489,8 +489,11 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
grad_to_var = dict() grad_to_var = dict()
op_desc = _create_op_desc_( op_desc = _create_op_desc_(
"fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, { "fill_constant",
"shape": [1], {},
{"Out": [_append_grad_suffix_(loss.name)]},
{
"shape": [1], # TODO(panyx0718): This can be loss.shape.
"value": 1.0, "value": 1.0,
"dtype": loss.dtype, "dtype": loss.dtype,
"force_cpu": False, "force_cpu": False,
......
...@@ -22,9 +22,12 @@ from . import op_frequence ...@@ -22,9 +22,12 @@ from . import op_frequence
from .op_frequence import * from .op_frequence import *
from . import quantize from . import quantize
from .quantize import * from .quantize import *
from . import utils
from .utils import *
__all__ = [] __all__ = []
__all__ += decoder.__all__ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__ __all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__ __all__ += op_frequence.__all__
__all__ += quantize.__all__ __all__ += quantize.__all__
__all__ += utils.__all__
...@@ -13,10 +13,11 @@ ...@@ -13,10 +13,11 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
#from . import lookup_table_utils from . import lookup_table_utils
#from .lookup_table_utils import * from .lookup_table_utils import *
from . import hdfs_utils from . import hdfs_utils
from .hdfs_utils import * from .hdfs_utils import *
#__all__ = lookup_table_utils.__all__ __all__ = []
__all__ = hdfs_utils.__all__ __all__ += lookup_table_utils.__all__
__all__ += hdfs_utils.__all__
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
"""HDFS Utils""" """HDFS Utils"""
import os import os
import sys
import subprocess import subprocess
import multiprocessing import multiprocessing
from datetime import datetime from datetime import datetime
...@@ -24,7 +25,7 @@ import errno ...@@ -24,7 +25,7 @@ import errno
import logging import logging
__all__ = ["HDFSClient", "multi_download"] __all__ = ["HDFSClient", "multi_download", "multi_upload"]
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
_logger = logging.getLogger("hdfs_utils") _logger = logging.getLogger("hdfs_utils")
...@@ -94,11 +95,13 @@ class HDFSClient(object): ...@@ -94,11 +95,13 @@ class HDFSClient(object):
def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5): def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
""" """
upload the local file to hdfs upload the local file to hdfs
Args: Args:
hdfs_path: hdfs path, target path hdfs_path(str): the hdfs file path
local_path: local file path, source path local_path(str): the local file path
overwrite: will overwrite the original file overwrite(bool|None): will overwrite the file on HDFS or not
retry_times: max times retry to upload retry_times(int|5): retry times
Returns: Returns:
True or False True or False
""" """
...@@ -109,7 +112,7 @@ class HDFSClient(object): ...@@ -109,7 +112,7 @@ class HDFSClient(object):
_logger.warn( _logger.warn(
"The Local path: {} is dir and I will support it later, return". "The Local path: {} is dir and I will support it later, return".
format(local_path)) format(local_path))
return return False
base = os.path.basename(local_path) base = os.path.basename(local_path)
if not self.is_exist(hdfs_path): if not self.is_exist(hdfs_path):
...@@ -141,13 +144,15 @@ class HDFSClient(object): ...@@ -141,13 +144,15 @@ class HDFSClient(object):
def download(self, hdfs_path, local_path, overwrite=False, unzip=False): def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
""" """
download from hdfs download file from HDFS
Args: Args:
hdfs_path: hdfs path, target path hdfs_path(str): the hdfs file path
local_path: local file path, source path local_path(str): the local file path
overwrite: will remove original file and overwrite it. overwrite(bool|None): will overwrite the file on HDFS or not
unzip: ignore this param unzip(bool|False): if the download file is compressed by zip, unzip it or not.
Returns
Returns:
True or False True or False
""" """
_logger.info('Downloading %r to %r.', hdfs_path, local_path) _logger.info('Downloading %r to %r.', hdfs_path, local_path)
...@@ -188,11 +193,11 @@ class HDFSClient(object): ...@@ -188,11 +193,11 @@ class HDFSClient(object):
def is_exist(self, hdfs_path=None): def is_exist(self, hdfs_path=None):
""" """
whether the remote hdfs path exists? whether the remote HDFS path exists
Args: Args:
hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) hdfs_path(str): the hdfs file path
fs_name: The default values are the same as in the job configuration
fs_ugi: The default values are the same as in the job configuration
Returns: Returns:
True or False True or False
""" """
...@@ -211,11 +216,11 @@ class HDFSClient(object): ...@@ -211,11 +216,11 @@ class HDFSClient(object):
def is_dir(self, hdfs_path=None): def is_dir(self, hdfs_path=None):
""" """
whether the remote hdfs path exists? whether the remote HDFS path is directory
Args: Args:
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) hdfs_path(str): the hdfs file path
fs_name: The default values are the same as in the job configuration
fs_ugi: The default values are the same as in the job configuration
Returns: Returns:
True or False True or False
""" """
...@@ -239,15 +244,15 @@ class HDFSClient(object): ...@@ -239,15 +244,15 @@ class HDFSClient(object):
""" """
Remove a file or directory from HDFS. Remove a file or directory from HDFS.
whether the remote HDFS path exists
Args: Args:
param hdfs_path: HDFS path. hdfs_path: HDFS path.
param recursive: Recursively delete files and directories. By default,
this method will raise an :class:`HdfsError` if trying to delete a
non-empty directory.
Returns: Returns:
True or False
This function returns `True` if the deletion was successful and `False` if This function returns `True` if the deletion was successful and `False` if
no file or directory previously existed at `hdfs_path`. no file or directory previously existed at `hdfs_path`.
""" """
_logger.info('Deleting %r.', hdfs_path) _logger.info('Deleting %r.', hdfs_path)
...@@ -273,16 +278,14 @@ class HDFSClient(object): ...@@ -273,16 +278,14 @@ class HDFSClient(object):
def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False): def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
""" """
Rename a file or folder. Move a file or folder on HDFS.
Args: Args:
:param hdfs_src_path: Source path. hdfs_path(str): HDFS path.
:param hdfs_dst_path: Destination path. If the path already exists and is overwrite(bool|False): If the path already exists and overwrite is False, will return False.
a directory, the source will be moved into it. If the path exists and is
a file, or if a parent destination directory is missing, this method will
raise an :class:`HdfsError`.
Returns: Returns:
This function returns `True` if the rename was successful and `False` if True or False
rename was faild.
""" """
assert hdfs_src_path is not None assert hdfs_src_path is not None
assert hdfs_dst_path is not None assert hdfs_dst_path is not None
...@@ -320,17 +323,20 @@ class HDFSClient(object): ...@@ -320,17 +323,20 @@ class HDFSClient(object):
raise raise
def makedirs(self, hdfs_path): def makedirs(self, hdfs_path):
"""Create a remote directory, recursively if necessary. """
Create a remote directory, recursively if necessary.
Args: Args:
:param hdfs_path: Remote path. Intermediate directories will be created hdfs_path(str): Remote path. Intermediate directories will be created appropriately.
appropriately.
Returns: Returns:
True if make a directories was successful, False when make a directiries was failed. True or False
""" """
_logger.info('Creating directories to %r.', hdfs_path) _logger.info('Creating directories to %r.', hdfs_path)
assert hdfs_path is not None assert hdfs_path is not None
if self.is_exist(hdfs_path): if self.is_exist(hdfs_path):
_logger.error("HDFS path is exist: {}".format(hdfs_path))
return return
mkdirs_commands = ['-mkdir', hdfs_path] mkdirs_commands = ['-mkdir', hdfs_path]
...@@ -346,11 +352,13 @@ class HDFSClient(object): ...@@ -346,11 +352,13 @@ class HDFSClient(object):
def ls(self, hdfs_path): def ls(self, hdfs_path):
""" """
ls a hdfs_path. ls directory contents about HDFS hdfs_path
Args: Args:
:param hdfs_path: hdfs_path will be ls. hdfs_path(str): Remote HDFS path will be ls.
Returns: Returns:
This function returns a `list` that contaion all files in the hdfs_path. List: a contents list about hdfs_path.
""" """
assert hdfs_path is not None assert hdfs_path is not None
...@@ -378,11 +386,15 @@ class HDFSClient(object): ...@@ -378,11 +386,15 @@ class HDFSClient(object):
def lsr(self, hdfs_path, only_file=True, sort=True): def lsr(self, hdfs_path, only_file=True, sort=True):
""" """
ls a hdfs_path sort by time. list directory contents about HDFS hdfs_path recursively
Args: Args:
:param hdfs_path: hdfs_path will be ls. hdfs_path(str): Remote HDFS path.
only_file(bool|True): will discard folders.
sort(bool|True): will be sorted by create time.
Returns: Returns:
This function returns a `list` that contaion all files sorted by time in the hdfs_path. List: a contents list about hdfs_path.
""" """
def sort_by_time(v1, v2): def sort_by_time(v1, v2):
...@@ -422,61 +434,54 @@ class HDFSClient(object): ...@@ -422,61 +434,54 @@ class HDFSClient(object):
return ret_lines return ret_lines
def multi_upload(client, def multi_download(client,
hdfs_path, hdfs_path,
local_path, local_path,
multi_processes=5, trainer_id,
overwrite=False): trainers,
multi_processes=5):
""" """
Upload file to hdfs. Download files from HDFS using multi process.
Args: Args:
:param overwrite: will overwrite hdfs file or not client(HDFSClient): instance of HDFSClient
:param multi_processes: the upload data process at the same time, default=5 hdfs_path(str): path on hdfs
:param client: instance of HDFSClient local_path(str): path on local
:param hdfs_path: path on hdfs trainer_id(int): current trainer id
:param local_path: path on local trainers(int): all trainers number
Returns: multi_processes(int|5): the download data process at the same time, default=5
Returns:
List:
Download files in local folder.
""" """
def __subprocess_upload(datas): def __subprocess_download(datas):
for data in datas: for data in datas:
re_path = os.path.relpath(os.path.dirname(data), local_path) re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
hdfs_re_path = os.path.join(hdfs_path, re_path) if re_path == os.curdir:
client.upload(hdfs_re_path, data, overwrite, retry_times=5) sub_local_re_path = local_path
else:
def get_local_files(path): sub_local_re_path = os.path.join(local_path, re_path)
""" client.download(data, sub_local_re_path)
Get all local files
Args:
path: local file path
Returns:
A list that contation all files in the path.
"""
rlist = []
if not os.path.isdir(path): assert isinstance(client, HDFSClient)
return rlist
for dirname, folder, files in os.walk(path): client.make_local_dirs(local_path)
for i in files: _logger.info("Make local dir {} successfully".format(local_path))
t = os.path.join(dirname, i)
rlist.append(t)
return rlist
assert isinstance(client, HDFSClient) all_need_download = client.lsr(hdfs_path, sort=True)
need_download = all_need_download[trainer_id::trainers]
_logger.info("Get {} files From all {} files need to be download from {}".
format(len(need_download), len(all_need_download), hdfs_path))
all_files = get_local_files(local_path) _logger.info("Start {} multi process to download datas".format(
if not all_files:
_logger.info("there are nothing need to upload, exit")
return
_logger.info("Start {} multi process to upload datas".format(
multi_processes)) multi_processes))
procs = [] procs = []
for i in range(multi_processes): for i in range(multi_processes):
process_datas = all_files[i::multi_processes] process_datas = need_download[i::multi_processes]
p = multiprocessing.Process( p = multiprocessing.Process(
target=__subprocess_upload, args=(process_datas, )) target=__subprocess_download, args=(process_datas, ))
procs.append(p) procs.append(p)
p.start() p.start()
...@@ -484,55 +489,84 @@ def multi_upload(client, ...@@ -484,55 +489,84 @@ def multi_upload(client,
for proc in procs: for proc in procs:
proc.join() proc.join()
_logger.info("Finish {} multi process to upload datas".format( _logger.info("Finish {} multi process to download datas".format(
multi_processes)) multi_processes))
local_downloads = []
for data in need_download:
data_name = os.path.basename(data)
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
if re_path == os.curdir:
local_re_path = os.path.join(local_path, data_name)
else:
local_re_path = os.path.join(local_path, re_path, data_name)
local_downloads.append(local_re_path)
return local_downloads
def multi_download(client,
def getfilelist(path):
rlist = []
for dir, folder, file in os.walk(path):
for i in file:
t = os.path.join(dir, i)
rlist.append(t)
for r in rlist:
print(r)
def multi_upload(client,
hdfs_path, hdfs_path,
local_path, local_path,
trainer_id, multi_processes=5,
trainers, overwrite=False,
file_cnt, sync=True):
multi_processes=5):
""" """
multi_download Upload files to HDFS using multi process.
Args: Args:
:param client: instance of HDFSClient client(HDFSClient): instance of HDFSClient
:param hdfs_path: path on hdfs hdfs_path(str): path on hdfs
:param local_path: path on local local_path(str): path on local
:param trainer_id: current trainer id multi_processes(int|5): the upload data process at the same time, default=5
:param trainers: all trainers number overwrite(bool|False): will overwrite file on HDFS or not
:param file_cnt: all file number sync(bool|True): upload files sync or not.
:param multi_processes: the download data process at the same time, default=5
:return: None
Returns: Returns:
A list that be downloaded. None
""" """
def __subprocess_download(datas): def __subprocess_upload(datas):
for data in datas: for data in datas:
re_path = os.path.relpath(os.path.dirname(data), hdfs_path) re_path = os.path.relpath(os.path.dirname(data), local_path)
local_re_path = os.path.join(local_path, re_path) hdfs_re_path = os.path.join(hdfs_path, re_path)
client.download(data, local_re_path) client.upload(hdfs_re_path, data, overwrite, retry_times=5)
assert isinstance(client, HDFSClient) def get_local_files(path):
rlist = []
client.make_local_dirs(local_path) if not os.path.isdir(path):
_logger.info("Make local dir {} successfully".format(local_path)) return rlist
all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt] for dirname, folder, files in os.walk(path):
need_download = all_need_download[trainer_id::trainers] for i in files:
_logger.info("Get {} files From all {} files need to be download from {}". t = os.path.join(dirname, i)
format(len(need_download), len(all_need_download), hdfs_path)) rlist.append(t)
return rlist
_logger.info("Start {} multi process to download datas".format( assert isinstance(client, HDFSClient)
all_files = get_local_files(local_path)
if not all_files:
_logger.info("there are nothing need to upload, exit")
return
_logger.info("Start {} multi process to upload datas".format(
multi_processes)) multi_processes))
procs = [] procs = []
for i in range(multi_processes): for i in range(multi_processes):
process_datas = need_download[i::multi_processes] process_datas = all_files[i::multi_processes]
p = multiprocessing.Process( p = multiprocessing.Process(
target=__subprocess_download, args=(process_datas, )) target=__subprocess_upload, args=(process_datas, ))
procs.append(p) procs.append(p)
p.start() p.start()
...@@ -540,18 +574,9 @@ def multi_download(client, ...@@ -540,18 +574,9 @@ def multi_download(client,
for proc in procs: for proc in procs:
proc.join() proc.join()
_logger.info("Finish {} multi process to download datas".format( _logger.info("Finish {} multi process to upload datas".format(
multi_processes)) multi_processes))
local_downloads = []
for data in need_download:
data_name = os.path.basename(data)
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
local_re_path = os.path.join(local_path, re_path, data_name)
local_downloads.append(local_re_path)
return local_downloads
if __name__ == "__main__": if __name__ == "__main__":
hadoop_home = "/home/client/hadoop-client/hadoop/" hadoop_home = "/home/client/hadoop-client/hadoop/"
......
...@@ -18,14 +18,12 @@ import os ...@@ -18,14 +18,12 @@ import os
import time import time
import logging import logging
import paddle
import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid import io from paddle.fluid import io
from paddle.fluid import Program from paddle.fluid import Program
__all__ = [ __all__ = [
"load_inference_model", "load_persistable_vars", "load_persistables_for_increment", "load_persistables_for_inference",
"convert_dist_to_sparse_program" "convert_dist_to_sparse_program"
] ]
...@@ -80,19 +78,28 @@ def __get_prefetch_op_tuples(main_program): ...@@ -80,19 +78,28 @@ def __get_prefetch_op_tuples(main_program):
return prefetch_op_tuples return prefetch_op_tuples
def convert_dist_to_sparse_program(main_program): def convert_dist_to_sparse_program(program):
if not main_program._distributed_lookup_table: """
WARNING: this function will only be used for distributed training with distributed lookup table.
when we train model with distributed lookup table but want to do the local inference, we can use
this function to convert the train program with distributed lookup table to sparse lookup table.
:param program(Program): the program must be the trainer program, which will be get by the distribute transpiler.
:return:
program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table.
"""
if not program._distributed_lookup_table:
_logger.warn( _logger.warn(
"There are no distributed lookup tables need to be converted") "There are no distributed lookup tables need to be converted")
return return
# create table param and grad var in pserver program # create table param and grad var in pserver program
origin_emb_var = "{}.origin".format(main_program._distributed_lookup_table) origin_emb_var = "{}.origin".format(program._distributed_lookup_table)
emb_var = main_program._distributed_lookup_table emb_var = program._distributed_lookup_table
main_program.global_block()._rename_var(emb_var, origin_emb_var) program.global_block()._rename_var(emb_var, origin_emb_var)
origin_param_var = main_program.global_block().vars[origin_emb_var] origin_param_var = program.global_block().vars[origin_emb_var]
param_var = main_program.global_block().create_var( param_var = program.global_block().create_var(
name=emb_var, name=emb_var,
shape=origin_param_var.shape, shape=origin_param_var.shape,
dtype=origin_param_var.dtype, dtype=origin_param_var.dtype,
...@@ -100,28 +107,28 @@ def convert_dist_to_sparse_program(main_program): ...@@ -100,28 +107,28 @@ def convert_dist_to_sparse_program(main_program):
persistable=True) persistable=True)
# parameter must be selected rows # parameter must be selected rows
param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
main_program._sync_with_cpp() program._sync_with_cpp()
prefetch_op_tuples = __get_prefetch_op_tuples(main_program) prefetch_op_tuples = __get_prefetch_op_tuples(program)
split_ids_id = prefetch_op_tuples[0] split_ids_id = prefetch_op_tuples[0]
for idx in range(split_ids_id + 2, split_ids_id - 1, -1): for idx in range(split_ids_id + 2, split_ids_id - 1, -1):
main_program.global_block()._remove_op(idx) program.global_block()._remove_op(idx)
main_program.desc.flush() program.desc.flush()
in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2]) in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2])
for in_out_pair in in_out_pairs: for in_out_pair in in_out_pairs:
idx = split_ids_id idx = split_ids_id
ids = main_program.global_block().vars[in_out_pair[0]] ids = program.global_block().vars[in_out_pair[0]]
out = main_program.global_block().vars[in_out_pair[1]] out = program.global_block().vars[in_out_pair[1]]
__insert_lookup_sparse_table_op(main_program, idx, ids, param_var, out) __insert_lookup_sparse_table_op(program, idx, ids, param_var, out)
main_program.desc.flush() program.desc.flush()
return main_program return program
def load_persistable_vars(executor, dirname, program, lookup_table_var): def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
def _is_checkpoint_var(exclude_fluid_vars=None): def _is_checkpoint_var(exclude_fluid_vars=None):
""" """
the checkpoint will not save or load all the variables. the checkpoint will not save or load all the variables.
...@@ -159,7 +166,81 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var): ...@@ -159,7 +166,81 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var):
return is_valid return is_valid
def _load_lookup_table_vars(executor, dirname, main_program, io.load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var(lookup_table_vars),
filename=None)
def load_persistables_for_increment(dirname, executor, program,
                                    lookup_table_var, lookup_table_var_path):
    """
    WARNING: this function will only be used for distributed training with
    distributed lookup table.

    For incremental training, the pserver will not only load dense variables,
    but also load the suitable lookup table var. Because the lookup table var
    is sliced with HASH, we must load the correct slice var.

    :param dirname(str): The directory path
    :param executor(Executor): The executor to run for loading inference model.
    :param program(Program): The parameter server program, which will run on Pserver.
    :param lookup_table_var: the distributed lookup table var name.
    :param lookup_table_var_path: the distributed lookup table var location.
    :return: None
    :raises ValueError: if `dirname` is not a directory, if
        `lookup_table_var_path` does not exist, or if `program` is not a
        Program instance.
    """

    def _load_lookup_table_slice(executor, main_program, lookup_table_var,
                                 lookup_table_var_path):
        # Run a throwaway program holding a single `load` op that writes the
        # file at `lookup_table_var_path` into the lookup table variable.
        emb_var = main_program.global_block().var(lookup_table_var)

        load_program = Program()
        load_block = load_program.global_block()
        load_block.append_op(
            type='load',
            inputs={},
            outputs={'Out': [emb_var]},
            attrs={'file_path': lookup_table_var_path})
        executor.run(load_program)

    # Interpolate the offending path into the message with `%`; passing it as
    # a second argument to ValueError leaves the '%s' placeholder unformatted.
    if not os.path.isdir(dirname):
        raise ValueError("There is no directory named '%s'" % dirname)

    if not os.path.exists(lookup_table_var_path):
        raise ValueError("There is no file named '%s'" %
                         lookup_table_var_path)

    if not isinstance(program, Program):
        raise ValueError("program must be an instance of fluid.Program")

    _logger.info("Start Load Sparse Program With "
                 "Distributed Lookup Table Vars from {}, time = {}".format(
                     dirname, time.ctime()))

    # Load everything except the lookup table first, then the table slice.
    _load_persistable_vars(executor, dirname, program, [lookup_table_var])
    _load_lookup_table_slice(executor, program, lookup_table_var,
                             lookup_table_var_path)

    _logger.info("Finish Load Sparse Program With "
                 "Distributed Lookup Table Vars from {}, time = {}".format(
                     dirname, time.ctime()))
def load_persistables_for_inference(dirname, executor, program,
lookup_table_var_name):
"""
WARNING: this function will only be used for inference with distributed lookup table.
Inference with distributed lookup table is a little funky, this function will load distributed
lookup table vars into sparse var, can be used in local inference mode.
:param dirname(str): The directory path
:param executor(Executor): The executor to run for loading inference model.
:param program(Program): The parameter server program, which will run on Pserver.
:param lookup_table_var_name: the distributed lookup tables var name.
:return: None
"""
def __load_lookup_table_vars(executor, dirname, main_program,
lookup_table_vars): lookup_table_vars):
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname) raise ValueError("There is no directory named '%s'", dirname)
...@@ -209,30 +290,13 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var): ...@@ -209,30 +290,13 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var):
global_block.append_op(type='delete_var', inputs={'X': sums}) global_block.append_op(type='delete_var', inputs={'X': sums})
executor.run(convert_program) executor.run(convert_program)
_logger.info("Start Load Sparse Program With "
"Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
lookup_table_vars = [lookup_table_var]
io.load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var(lookup_table_vars),
filename=None)
_load_lookup_table_vars(executor, dirname, program, lookup_table_vars)
_logger.info("Finish Load Sparse Program With "
"Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
def load_inference_model(dirname, executor, lookup_table_var_name):
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname) raise ValueError("There is no directory named '%s'", dirname)
if program:
if not isinstance(program, Program):
raise ValueError("program must be an instance of fluid.Program")
else:
local_model = os.path.join(dirname, model_filename) local_model = os.path.join(dirname, model_filename)
with open(local_model, "rb") as f: with open(local_model, "rb") as f:
...@@ -244,13 +308,16 @@ def load_inference_model(dirname, executor, lookup_table_var_name): ...@@ -244,13 +308,16 @@ def load_inference_model(dirname, executor, lookup_table_var_name):
raise ValueError("Unsupported program version: %d\n" % raise ValueError("Unsupported program version: %d\n" %
program._version()) program._version())
# Binary data also need version. _logger.info("Start Load Sparse Program With "
load_persistable_vars(executor, dirname, program, lookup_table_var_name) "Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
_load_persistable_vars(executor, dirname, program, [lookup_table_var_name])
__load_lookup_table_vars(executor, dirname, program,
[lookup_table_var_name])
feed_target_names = program.desc.get_feed_target_names() _logger.info("Finish Load Sparse Program With "
fetch_target_names = program.desc.get_fetch_target_names() "Distributed Lookup Table Vars from {}, time = {}".format(
fetch_targets = [ dirname, time.ctime()))
program.global_block().var(name) for name in fetch_target_names
]
return [program, feed_target_names, fetch_targets] return program
...@@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object): ...@@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object):
self.dtype = 'int64' self.dtype = 'int64'
elif dtype == core.VarDesc.VarType.FP64: elif dtype == core.VarDesc.VarType.FP64:
self.dtype = 'float64' self.dtype = 'float64'
elif dtype == core.VarDesc.VarType.FP16:
self.dtype = 'float16'
elif dtype == core.VarDesc.VarType.INT32: elif dtype == core.VarDesc.VarType.INT32:
self.dtype = 'int32' self.dtype = 'int32'
elif dtype == core.VarDesc.VarType.UINT8: elif dtype == core.VarDesc.VarType.UINT8:
......
...@@ -1324,6 +1324,9 @@ class Block(object): ...@@ -1324,6 +1324,9 @@ class Block(object):
def _prepend_op(self, *args, **kwargs): def _prepend_op(self, *args, **kwargs):
op_desc = self.desc._prepend_op() op_desc = self.desc._prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
if _in_imperative_mode():
_imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
[v._ivar for v in op.outputs], self.desc)
self.ops.insert(0, op) self.ops.insert(0, op)
return op return op
......
...@@ -28,7 +28,8 @@ def enabled(): ...@@ -28,7 +28,8 @@ def enabled():
def guard(): def guard():
train = framework.Program() train = framework.Program()
startup = framework.Program() startup = framework.Program()
tracer = core.Tracer(train.current_block().desc) tracer = core.Tracer(train.current_block().desc,
startup.current_block().desc)
with framework.program_guard(train, startup): with framework.program_guard(train, startup):
with framework.unique_name.guard(): with framework.unique_name.guard():
with framework._imperative_guard(tracer): with framework._imperative_guard(tracer):
......
...@@ -25,11 +25,9 @@ __all__ = ['PyLayer'] ...@@ -25,11 +25,9 @@ __all__ = ['PyLayer']
class PyLayer(core.Layer): class PyLayer(core.Layer):
def __init__(self): def __init__(self):
pass self._built = False
def __call__(self, inputs): def __call__(self, inputs):
# TODO(panyx0718): Support declarative mode as well.
assert base.enabled()
if not isinstance(inputs, list) and not isinstance(inputs, tuple): if not isinstance(inputs, list) and not isinstance(inputs, tuple):
inputs = [inputs] inputs = [inputs]
...@@ -37,8 +35,15 @@ class PyLayer(core.Layer): ...@@ -37,8 +35,15 @@ class PyLayer(core.Layer):
for x in inputs: for x in inputs:
py_var = base.to_variable(x) py_var = base.to_variable(x)
var_inputs.append(py_var) var_inputs.append(py_var)
if not self._built:
self._build_once(inputs)
self._built = True
outputs = self.forward(var_inputs) outputs = self.forward(var_inputs)
return outputs return outputs
def _build_once(self, inputs):
pass
def forward(self, inputs): def forward(self, inputs):
return [] return []
...@@ -18,6 +18,7 @@ from . import framework ...@@ -18,6 +18,7 @@ from . import framework
import numpy as np import numpy as np
import contextlib import contextlib
from .core import VarDesc from .core import VarDesc
from . import unique_name
__all__ = [ __all__ = [
'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
...@@ -207,16 +208,39 @@ class UniformInitializer(Initializer): ...@@ -207,16 +208,39 @@ class UniformInitializer(Initializer):
# Initialization Ops should be prepended and not appended # Initialization Ops should be prepended and not appended
if self._seed == 0: if self._seed == 0:
self._seed = block.program.random_seed self._seed = block.program.random_seed
# to be compatible with fp16 initializers
if var.dtype == VarDesc.VarType.FP16:
out_dtype = VarDesc.VarType.FP32
out_var = block.create_var(
name=unique_name.generate(".".join(['gaussian_random', 'tmp'])),
shape=var.shape,
dtype=out_dtype,
type=VarDesc.VarType.LOD_TENSOR,
persistable=False)
else:
out_dtype = var.dtype
out_var = var
op = block._prepend_op( op = block._prepend_op(
type="uniform_random", type="uniform_random",
outputs={"Out": var}, outputs={"Out": out_var},
attrs={ attrs={
"shape": var.shape, "shape": var.shape,
"dtype": int(var.dtype), "dtype": out_dtype,
"min": self._low, "min": self._low,
"max": self._high, "max": self._high,
"seed": self._seed "seed": self._seed
}) })
if var.dtype == VarDesc.VarType.FP16:
block.append_op(
type="cast",
inputs={"X": out_var},
outputs={"Out": var},
attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype})
var.op = op var.op = op
return op return op
...@@ -261,17 +285,39 @@ class NormalInitializer(Initializer): ...@@ -261,17 +285,39 @@ class NormalInitializer(Initializer):
# Initialization Ops should be prepended and not appended # Initialization Ops should be prepended and not appended
if self._seed == 0: if self._seed == 0:
self._seed = block.program.random_seed self._seed = block.program.random_seed
# to be compatible with fp16 initializers
if var.dtype == VarDesc.VarType.FP16:
out_dtype = VarDesc.VarType.FP32
out_var = block.create_var(
name=unique_name.generate(".".join(['gaussian_random', 'tmp'])),
shape=var.shape,
dtype=out_dtype,
type=VarDesc.VarType.LOD_TENSOR,
persistable=False)
else:
out_dtype = var.dtype
out_var = var
op = block._prepend_op( op = block._prepend_op(
type="gaussian_random", type="gaussian_random",
outputs={"Out": var}, outputs={"Out": out_var},
attrs={ attrs={
"shape": var.shape, "shape": var.shape,
"dtype": int(var.dtype), "dtype": out_dtype,
"mean": self._mean, "mean": self._mean,
"std": self._std_dev, "std": self._std_dev,
"seed": self._seed, "seed": self._seed,
"use_mkldnn": False "use_mkldnn": False
}) })
if var.dtype == VarDesc.VarType.FP16:
block.append_op(
type="cast",
inputs={"X": out_var},
outputs={"Out": var},
attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype})
var.op = op var.op = op
return op return op
......
...@@ -63,6 +63,8 @@ def noam_decay(d_model, warmup_steps): ...@@ -63,6 +63,8 @@ def noam_decay(d_model, warmup_steps):
Returns: Returns:
The decayed learning rate. The decayed learning rate.
""" """
def _lr_schedule(dtype):
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter(1) global_step = _decay_step_counter(1)
...@@ -72,6 +74,8 @@ def noam_decay(d_model, warmup_steps): ...@@ -72,6 +74,8 @@ def noam_decay(d_model, warmup_steps):
return lr_value return lr_value
return _lr_schedule
def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
...@@ -109,6 +113,8 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -109,6 +113,8 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
def _lr_schedule(dtype):
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() global_step = _decay_step_counter()
...@@ -119,6 +125,8 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -119,6 +125,8 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
return decayed_lr return decayed_lr
return _lr_schedule
def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
"""Applies natural exponential decay to the initial learning rate. """Applies natural exponential decay to the initial learning rate.
...@@ -138,6 +146,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -138,6 +146,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
Returns: Returns:
The decayed learning rate The decayed learning rate
""" """
def _lr_schedule(dtype):
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() global_step = _decay_step_counter()
...@@ -148,6 +158,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -148,6 +158,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
return decayed_lr return decayed_lr
return _lr_schedule
def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
...@@ -184,6 +196,8 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -184,6 +196,8 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
staircase=True)) staircase=True))
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
def _lr_schedule(dtype):
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() global_step = _decay_step_counter()
...@@ -195,6 +209,8 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -195,6 +209,8 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
return decayed_lr return decayed_lr
return _lr_schedule
def polynomial_decay(learning_rate, def polynomial_decay(learning_rate,
decay_steps, decay_steps,
...@@ -224,15 +240,17 @@ def polynomial_decay(learning_rate, ...@@ -224,15 +240,17 @@ def polynomial_decay(learning_rate,
Returns: Returns:
Variable: The decayed learning rate Variable: The decayed learning rate
""" """
def _lr_schedule(dtype, decay_steps=decay_steps):
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() global_step = _decay_step_counter()
if cycle: if cycle:
div_res = ops.ceil(global_step / decay_steps) div_res = ops.ceil(global_step / decay_steps)
zero_var = tensor.fill_constant( zero_var = tensor.fill_constant(
shape=[1], dtype='float32', value=0.0) shape=[1], dtype=dtype, value=0.0)
one_var = tensor.fill_constant( one_var = tensor.fill_constant(
shape=[1], dtype='float32', value=1.0) shape=[1], dtype=dtype, value=1.0)
with control_flow.Switch() as switch: with control_flow.Switch() as switch:
with switch.case(global_step == zero_var): with switch.case(global_step == zero_var):
...@@ -240,13 +258,16 @@ def polynomial_decay(learning_rate, ...@@ -240,13 +258,16 @@ def polynomial_decay(learning_rate,
decay_steps = decay_steps * div_res decay_steps = decay_steps * div_res
else: else:
decay_steps_var = tensor.fill_constant( decay_steps_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps)) shape=[1], dtype=dtype, value=float(decay_steps))
global_step = nn.elementwise_min(x=global_step, y=decay_steps_var) global_step = nn.elementwise_min(
x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \ decayed_lr = (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate ((1 - global_step / decay_steps) ** power) + end_learning_rate
return decayed_lr return decayed_lr
return _lr_schedule
def piecewise_decay(boundaries, values): def piecewise_decay(boundaries, values):
"""Applies piecewise decay to the initial learning rate. """Applies piecewise decay to the initial learning rate.
...@@ -273,6 +294,8 @@ def piecewise_decay(boundaries, values): ...@@ -273,6 +294,8 @@ def piecewise_decay(boundaries, values):
""" """
def _lr_schedule(dtype):
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
if len(values) - len(boundaries) != 1: if len(values) - len(boundaries) != 1:
raise ValueError("len(values) - len(boundaries) should be 1") raise ValueError("len(values) - len(boundaries) should be 1")
...@@ -306,6 +329,8 @@ def piecewise_decay(boundaries, values): ...@@ -306,6 +329,8 @@ def piecewise_decay(boundaries, values):
return lr return lr
return _lr_schedule
def append_LARS(params_grads, learning_rate, weight_decay): def append_LARS(params_grads, learning_rate, weight_decay):
""" """
......
...@@ -29,6 +29,7 @@ from . import utils ...@@ -29,6 +29,7 @@ from . import utils
from .. import unique_name from .. import unique_name
from functools import reduce from functools import reduce
from .. import core from .. import core
from ..imperative import layers
__all__ = [ __all__ = [
'fc', 'fc',
...@@ -2797,6 +2798,10 @@ def batch_norm(input, ...@@ -2797,6 +2798,10 @@ def batch_norm(input,
helper = LayerHelper('batch_norm', **locals()) helper = LayerHelper('batch_norm', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
# use fp32 for bn parameter
if dtype == core.VarDesc.VarType.FP16:
dtype = core.VarDesc.VarType.FP32
input_shape = input.shape input_shape = input.shape
if data_layout == 'NCHW': if data_layout == 'NCHW':
channel_num = input_shape[1] channel_num = input_shape[1]
...@@ -2831,7 +2836,7 @@ def batch_norm(input, ...@@ -2831,7 +2836,7 @@ def batch_norm(input,
trainable=False, trainable=False,
do_model_average=do_model_average_for_mean_and_var), do_model_average=do_model_average_for_mean_and_var),
shape=param_shape, shape=param_shape,
dtype=input.dtype) dtype=dtype)
mean.stop_gradient = True mean.stop_gradient = True
variance = helper.create_parameter( variance = helper.create_parameter(
...@@ -2841,7 +2846,7 @@ def batch_norm(input, ...@@ -2841,7 +2846,7 @@ def batch_norm(input,
trainable=False, trainable=False,
do_model_average=do_model_average_for_mean_and_var), do_model_average=do_model_average_for_mean_and_var),
shape=param_shape, shape=param_shape,
dtype=input.dtype) dtype=dtype)
variance.stop_gradient = True variance.stop_gradient = True
# create output # create output
...@@ -9426,3 +9431,47 @@ def huber_loss(input, label, delta): ...@@ -9426,3 +9431,47 @@ def huber_loss(input, label, delta):
'Residual': residual}, 'Residual': residual},
attrs={'delta': delta}) attrs={'delta': delta})
return out return out
class FC(layers.PyLayer):
    """Imperative-mode layer that multiplies its first input by a weight.

    The weight parameter is created lazily in `_build_once`, because its
    first dimension is derived from the shape of the first input. No bias is
    created; `forward` emits a `mul` op followed by a single-input `sum` op.

    Args:
        size: second dimension of the weight parameter.
        param_attr: forwarded to the `LayerHelper` as `param_attr`.
        num_flatten_dims: index from which the input's trailing dimensions
            are multiplied together to form the weight's first dimension;
            also passed to the `mul` op as `x_num_col_dims`.
        dtype: dtype used for the weight and for the intermediate and
            output variables.
    """

    def __init__(self,
                 size,
                 param_attr=None,
                 num_flatten_dims=1,
                 dtype=core.VarDesc.VarType.FP32):
        super(FC, self).__init__()
        self._dtype = dtype
        self._num_flatten_dims = num_flatten_dims
        self._size = size
        self._helper = LayerHelper('FC', param_attr=param_attr)

    def _build_once(self, inputs):
        """Create the weight `self._w` from the shape of `inputs[0]`."""
        # Product of the input dimensions from `num_flatten_dims` onwards.
        flattened = 1
        for extent in inputs[0].shape[self._num_flatten_dims:]:
            flattened = flattened * extent

        self._w = self._helper.create_parameter(
            attr=self._helper.param_attr,
            shape=[flattened] + [self._size],
            dtype=self._dtype,
            is_bias=False)

    def forward(self, inputs):
        """Append the `mul` and `sum` ops and return the `sum` output."""
        helper = self._helper

        product = helper.create_variable_for_type_inference(self._dtype)
        mul_attrs = {
            "x_num_col_dims": self._num_flatten_dims,
            "y_num_col_dims": 1
        }
        helper.append_op(
            type="mul",
            inputs={"X": inputs[0],
                    "Y": self._w},
            outputs={"Out": product},
            attrs=mul_attrs)

        result = helper.create_variable_for_type_inference(self._dtype)
        helper.append_op(
            type="sum",
            inputs={"X": [product]},
            outputs={"Out": result},
            attrs={"use_mkldnn": False})
        return result
...@@ -50,17 +50,21 @@ class Optimizer(object): ...@@ -50,17 +50,21 @@ class Optimizer(object):
def __init__(self, learning_rate, regularization=None, name=None): def __init__(self, learning_rate, regularization=None, name=None):
if not isinstance(learning_rate, float) and \ if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, framework.Variable): not isinstance(learning_rate, framework.Variable) and \
raise TypeError("learning rate should be float or Variable") not callable(learning_rate):
raise TypeError(
"learning rate should be float or Variable or callable(dtype)")
self._name = name self._name = name
self.regularization = regularization self.regularization = regularization
self._learning_rate = learning_rate self._learning_rate = learning_rate
# the learning rate type should be inferenced from loss # the learning rate type should be inferenced from loss
self._dtype = None self._dtype = None
# each program should have a independent learning rate # each program should have a independent learning rate
# program -> Variable(learning_rate) # program -> Variable(learning_rate) or:
# program -> callable(return learning_rate Variable)
self._learning_rate_map = dict() self._learning_rate_map = dict()
if isinstance(self._learning_rate, framework.Variable): if isinstance(self._learning_rate, framework.Variable) or \
callable(self._learning_rate):
self._learning_rate_map[framework.default_main_program( self._learning_rate_map[framework.default_main_program(
)] = self._learning_rate )] = self._learning_rate
# Dictionary of accumulators. Some optimizer subclasses need to # Dictionary of accumulators. Some optimizer subclasses need to
...@@ -75,6 +79,11 @@ class Optimizer(object): ...@@ -75,6 +79,11 @@ class Optimizer(object):
if isinstance(lr, framework.Variable): if isinstance(lr, framework.Variable):
return return
elif callable(lr):
dtype = 'float32' if self._dtype is None else self._dtype
self._learning_rate_map[framework.default_main_program()] = lr(
dtype)
return
else: else:
if not isinstance(self._learning_rate, float): if not isinstance(self._learning_rate, float):
raise TypeError( raise TypeError(
......
...@@ -92,35 +92,27 @@ class ParallelExecutor(object): ...@@ -92,35 +92,27 @@ class ParallelExecutor(object):
num_trainers=1, num_trainers=1,
trainer_id=0, trainer_id=0,
scope=None): scope=None):
# step1: get places, the places are used in run too.
self._places = [] self._places = []
self._act_places = []
if use_cuda: if use_cuda:
gpus = []
gpus_env = os.getenv("FLAGS_selected_gpus") gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env: if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")] gpus = [int(s) for s in gpus_env.split(",")]
else: else:
for i in six.moves.range(core.get_cuda_device_count()): gpus = [
gpus.append(i) i for i in six.moves.range(core.get_cuda_device_count())
for i in gpus: ]
p = core.Place() self._places = [core.CUDAPlace(i) for i in gpus]
self._act_places.append(core.CUDAPlace(i))
p.set_place(self._act_places[-1])
self._places.append(p)
else: else:
cpu_num = int( cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count())) os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
for i in six.moves.range(cpu_num): self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
p = core.Place()
self._act_places.append(core.CPUPlace())
p.set_place(self._act_places[-1])
self._places.append(p)
assert self._places, "no place for execution" assert self._places, "no place for execution"
# step2: init exec_strategy
if exec_strategy is None: if exec_strategy is None:
exec_strategy = ExecutionStrategy() exec_strategy = ExecutionStrategy()
exec_strategy.use_cuda = use_cuda exec_strategy.use_cuda = use_cuda
if exec_strategy.num_threads == 0: if exec_strategy.num_threads == 0:
if use_cuda: if use_cuda:
# Experiments on se-resnext shows that too many threads hurt # Experiments on se-resnext shows that too many threads hurt
...@@ -131,49 +123,54 @@ class ParallelExecutor(object): ...@@ -131,49 +123,54 @@ class ParallelExecutor(object):
os.environ.get('CPU_NUM', multiprocessing.cpu_count())) os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exec_strategy.num_threads = cpu_num * 2 exec_strategy.num_threads = cpu_num * 2
# step3: init build_strategy
if build_strategy is None: if build_strategy is None:
build_strategy = BuildStrategy() build_strategy = BuildStrategy()
build_strategy.num_trainers = num_trainers build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id build_strategy.trainer_id = trainer_id
main = main_program # step4: get main_program, scope, local_scopes
main = main if main else framework.default_main_program() main = main_program if main_program \
else framework.default_main_program()
scope = scope if scope is not None else executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes()\
if share_vars_from else []
# step5: check trainers_endpoints, it is used for distribution.
trainers_endpoints = main._trainers_endpoints trainers_endpoints = main._trainers_endpoints
if num_trainers > 1 and trainers_endpoints: if num_trainers > 1 and trainers_endpoints:
assert num_trainers == len( assert num_trainers == len(
trainers_endpoints), "num_trainers == len(end_points)" trainers_endpoints), "num_trainers == len(end_points)"
build_strategy.trainers_endpoints = trainers_endpoints build_strategy.trainers_endpoints = trainers_endpoints
if scope == None: # step5: get persistable_vars, parameter_vars, places. persistable_vars
scope = executor.global_scope() # need be broadcast to other local_scope.
persistable_vars = set([
if share_vars_from and not isinstance(share_vars_from, cpt.to_text(v.name) for v in [
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else []
self.persistable_vars = [
v.name for v in [
var for var in main.list_vars() var for var in main.list_vars()
if var.persistable and var.type != core.VarDesc.VarType.RAW if var.persistable and var.type != core.VarDesc.VarType.RAW
] ]
] ])
def place_obj(place):
p = core.Place()
p.set_place(place)
return p
places = list(map(place_obj, self._places))
# step6: init ParallelExecutor
self.executor = core.ParallelExecutor( self.executor = core.ParallelExecutor(
self._places, places, persistable_vars, main.desc,
set([
cpt.to_text(p.name)
for p in main.global_block().iter_parameters()
if not p.stop_gradient
]),
set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
cpt.to_text(loss_name) cpt.to_text(loss_name)
if loss_name else six.u(''), scope, local_scopes, exec_strategy, if loss_name else six.u(''), scope, local_scopes, exec_strategy,
build_strategy, num_trainers, trainer_id) build_strategy, num_trainers, trainer_id)
self.scope = scope self.scope = scope
def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
...@@ -261,7 +258,7 @@ class ParallelExecutor(object): ...@@ -261,7 +258,7 @@ class ParallelExecutor(object):
self.executor.feed_and_split_tensor_into_local_scopes( self.executor.feed_and_split_tensor_into_local_scopes(
feed_tensor_dict) feed_tensor_dict)
elif isinstance(feed, list) or isinstance(feed, tuple): elif isinstance(feed, list) or isinstance(feed, tuple):
if len(feed) != len(self._act_places): if len(feed) != len(self._places):
raise ValueError( raise ValueError(
"Feed a list of tensor, the list should be the same size as places" "Feed a list of tensor, the list should be the same size as places"
) )
...@@ -277,7 +274,7 @@ class ParallelExecutor(object): ...@@ -277,7 +274,7 @@ class ParallelExecutor(object):
tensor = each[feed_name] tensor = each[feed_name]
if not isinstance(tensor, core.LoDTensor): if not isinstance(tensor, core.LoDTensor):
tmp = core.LoDTensor() tmp = core.LoDTensor()
tmp.set(tensor, self._act_places[i]) tmp.set(tensor, self._places[i])
tensor = tmp tensor = tmp
res_dict[feed_name] = tensor res_dict[feed_name] = tensor
res.append(res_dict) res.append(res_dict)
...@@ -294,4 +291,4 @@ class ParallelExecutor(object): ...@@ -294,4 +291,4 @@ class ParallelExecutor(object):
@property @property
def device_count(self): def device_count(self):
return len(self._act_places) return len(self._places)
...@@ -370,6 +370,8 @@ class OpTest(unittest.TestCase): ...@@ -370,6 +370,8 @@ class OpTest(unittest.TestCase):
return [place] return [place]
else: else:
return [] return []
else:
return []
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\ if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\
......
...@@ -22,8 +22,10 @@ from op_test import OpTest ...@@ -22,8 +22,10 @@ from op_test import OpTest
class TestAccuracyOp(OpTest): class TestAccuracyOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "accuracy" self.op_type = "accuracy"
self.dtype = np.float32
self.init_dtype()
n = 8192 n = 8192
infer = np.random.random((n, 1)).astype("float32") infer = np.random.random((n, 1)).astype(self.dtype)
indices = np.random.randint(0, 2, (n, 1)) indices = np.random.randint(0, 2, (n, 1))
label = np.random.randint(0, 2, (n, 1)) label = np.random.randint(0, 2, (n, 1))
self.inputs = {'Out': infer, 'Indices': indices, "Label": label} self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
...@@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest): ...@@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest):
num_correct += 1 num_correct += 1
break break
self.outputs = { self.outputs = {
'Accuracy': np.array([num_correct / float(n)]).astype("float32"), 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
'Correct': np.array([num_correct]).astype("int32"), 'Correct': np.array([num_correct]).astype("int32"),
'Total': np.array([n]).astype("int32") 'Total': np.array([n]).astype("int32")
} }
def init_dtype(self):
    """Hook for overriding the dtype under test; a no-op by default.

    `setUp` assigns `self.dtype = np.float32` and then calls this method,
    so a subclass can replace `self.dtype` before the inputs are built.
    """
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
class TestAccuracyOpFp16(TestAccuracyOp):
    """Runs the accuracy op test with float16 data."""

    def init_dtype(self):
        # Switch the dtype that setUp uses for the inputs and the expected
        # 'Accuracy' output.
        self.dtype = np.float16

    def test_check_output(self):
        # Use an explicit absolute tolerance for the float16 comparison.
        tolerance = 1e-3
        self.check_output(atol=tolerance)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest import unittest
from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
class TestMKLDNN(TestConv2dOp): class TestMKLDNN(TestConv2dOp):
...@@ -37,5 +37,23 @@ class TestMKLDNNWithStride(TestWithStride): ...@@ -37,5 +37,23 @@ class TestMKLDNNWithStride(TestWithStride):
self.data_format = "NCHW" self.data_format = "NCHW"
class TestMKLDNNWithGroup(TestWithGroup):
    """TestWithGroup case with the MKLDNN kernel flag turned on."""

    def init_kernel_type(self):
        # Select NCHW layout and enable the MKLDNN flag for this case.
        self.data_format = "NCHW"
        self.use_mkldnn = True
class TestMKLDNNWith1x1(TestWith1x1):
    """TestWith1x1 case with the MKLDNN kernel flag turned on."""

    def init_kernel_type(self):
        # Select NCHW layout and enable the MKLDNN flag for this case.
        self.data_format = "NCHW"
        self.use_mkldnn = True
class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
    """TestWithInput1x1Filter1x1 case with the MKLDNN kernel flag turned on."""

    def init_kernel_type(self):
        # Select NCHW layout and enable the MKLDNN flag for this case.
        self.data_format = "NCHW"
        self.use_mkldnn = True
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -21,14 +21,16 @@ from op_test import OpTest ...@@ -21,14 +21,16 @@ from op_test import OpTest
class ElementwiseDivOp(OpTest): class ElementwiseDivOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "elementwise_div" self.op_type = "elementwise_div"
self.dtype = np.float32
self.init_dtype()
""" Warning """ Warning
CPU gradient check error! CPU gradient check error!
'X': np.random.random((32,84)).astype("float32"), 'X': np.random.random((32,84)).astype("float32"),
'Y': np.random.random((32,84)).astype("float32") 'Y': np.random.random((32,84)).astype("float32")
""" """
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), 'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype),
'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") 'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
} }
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
...@@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest): ...@@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest):
self.check_grad( self.check_grad(
['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y')) ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
def init_dtype(self):
    """Dtype hook invoked from setUp right after self.dtype is set to float32.

    The base implementation changes nothing; subclasses override it to
    assign a different value to self.dtype.
    """
class TestElementwiseDivOp_scalar(ElementwiseDivOp): class TestElementwiseDivOp_scalar(ElementwiseDivOp):
def setUp(self): def setUp(self):
...@@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp): ...@@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
} }
class TestElementwiseDivOpFp16(ElementwiseDivOp):
    """ElementwiseDivOp run with np.float16 inputs.

    Every gradient check here passes max_relative_error=1.
    """

    # NOTE(review): the 'ingore' spelling presumably mirrors the method names
    # of ElementwiseDivOp so that these definitions override them -- confirm
    # before renaming.

    def init_dtype(self):
        self.dtype = np.float16

    def test_check_grad_normal(self):
        # Gradients requested for both operands.
        grad_inputs = ['X', 'Y']
        self.check_grad(grad_inputs, 'Out', max_relative_error=1)

    def test_check_grad_ingore_x(self):
        # Only Y's gradient is checked; X is placed in the no-grad set.
        skipped = {"X"}
        self.check_grad(
            ['Y'], 'Out', max_relative_error=1, no_grad_set=skipped)

    def test_check_grad_ingore_y(self):
        # Only X's gradient is checked; Y is placed in the no-grad set.
        skipped = {'Y'}
        self.check_grad(
            ['X'], 'Out', max_relative_error=1, no_grad_set=skipped)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): ...@@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
} }
class TestElementwiseMulOpFp16(ElementwiseMulOp):
    """ElementwiseMulOp variant whose init_dtype selects np.float16."""

    def init_dtype(self):
        self.dtype = np.float16
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -22,12 +22,22 @@ from op_test import OpTest ...@@ -22,12 +22,22 @@ from op_test import OpTest
class TestFillZerosLikeOp(OpTest):
    """Output check for the fill_zeros_like op on a random (219, 232) input."""

    def setUp(self):
        self.op_type = "fill_zeros_like"
        # Default dtype; init_dtype() gives subclasses a chance to replace it
        # before the input array is created.
        self.dtype = np.float32
        self.init_dtype()
        source = np.random.random((219, 232)).astype(self.dtype)
        self.inputs = {'X': source}
        # Expected result: zeros with the same shape and dtype as the input.
        self.outputs = {'Out': np.zeros_like(source)}

    def init_dtype(self):
        """Dtype hook; the base implementation leaves self.dtype unchanged."""

    def test_check_output(self):
        self.check_output()
class TestFillZerosLikeOpFp16(TestFillZerosLikeOp):
    """TestFillZerosLikeOp variant whose init_dtype selects np.float16."""

    def init_dtype(self):
        self.dtype = np.float16
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -12,12 +12,23 @@ ...@@ -12,12 +12,23 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import contextlib
import unittest import unittest
import sys
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.layers.nn import FC
@contextlib.contextmanager
def new_program_scope():
    """Run the with-body under a new Scope and new main/startup Programs.

    A fresh fluid.core.Scope is installed through fluid.scope_guard, and a
    fresh pair of fluid.Program objects through fluid.program_guard. Nothing
    is yielded to the caller.
    """
    main_program = fluid.Program()
    startup_program = fluid.Program()
    fresh_scope = fluid.core.Scope()
    # Single with-statement; guards are entered left to right, i.e. the scope
    # guard is the outer one and the program guard the inner one.
    with fluid.scope_guard(fresh_scope), \
            fluid.program_guard(main_program, startup_program):
        yield
class MyLayer(fluid.imperative.PyLayer): class MyLayer(fluid.imperative.PyLayer):
...@@ -30,6 +41,23 @@ class MyLayer(fluid.imperative.PyLayer): ...@@ -30,6 +41,23 @@ class MyLayer(fluid.imperative.PyLayer):
return [fluid.layers.elementwise_mul(x, x)] return [fluid.layers.elementwise_mul(x, x)]
class MLP(fluid.imperative.PyLayer):
    """Two chained FC layers whose result is passed through reduce_sum."""

    def __init__(self):
        super(MLP, self).__init__()

        def constant_attr():
            # Each call builds a separate ParamAttr carrying a Constant(0.1)
            # initializer, so the two layers do not share one attr object.
            return fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1))

        self._fc1 = FC(3, constant_attr())
        self._fc2 = FC(4, constant_attr())

    def forward(self, inputs):
        """Feed inputs[0] through _fc1, then _fc2, and return the reduce_sum."""
        hidden = self._fc1(inputs[0])
        projected = self._fc2(hidden)
        return fluid.layers.reduce_sum(projected)
class TestImperative(unittest.TestCase): class TestImperative(unittest.TestCase):
def test_layer(self): def test_layer(self):
with fluid.imperative.guard(): with fluid.imperative.guard():
...@@ -39,13 +67,56 @@ class TestImperative(unittest.TestCase): ...@@ -39,13 +67,56 @@ class TestImperative(unittest.TestCase):
l.forward([]) l.forward([])
def test_layer_in_out(self):
    """Check that MyLayer gives matching results in both execution modes.

    The layer is run once under fluid.imperative.guard() and once as a
    static program under new_program_scope(); the forward output and the
    gradient of l._x_for_debug are compared with np.allclose.
    """
    np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)

    # Imperative run: forward, backward, then read the gradient.
    with fluid.imperative.guard():
        dy_layer = MyLayer()
        dy_x = dy_layer(np_inp)[0]
        self.assertIsNotNone(dy_x)
        dy_out = dy_x._numpy()
        dy_x._backward()
        dy_grad = dy_layer._x_for_debug._gradient()

    # Static run: build the same graph, append backward ops, execute on CPU.
    with new_program_scope():
        inp = fluid.layers.data(
            name="inp", shape=[3], append_batch_size=False)
        st_layer = MyLayer()
        st_x = st_layer(inp)[0]
        grad_entries = fluid.backward.append_backward(
            st_x, parameter_list=[st_layer._x_for_debug.name])
        # Element 1 of the first returned entry is the variable to fetch.
        grad_var = grad_entries[0][1]
        exe = fluid.Executor(fluid.CPUPlace())
        fetched = exe.run(
            feed={inp.name: np_inp},
            fetch_list=[st_x.name, grad_var.name])
        static_out, static_grad = fetched

    self.assertTrue(np.allclose(dy_out, static_out))
    self.assertTrue(np.allclose(dy_grad, static_grad))
def test_mlp(self):
    """Check that MLP gives matching results in both execution modes.

    The model is run once under fluid.imperative.guard() and once as a
    static program under new_program_scope(); the forward output and the
    gradient of the first FC layer's _w are compared with np.allclose.
    """
    np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)

    # Imperative run: forward, backward, then read the gradient of _fc1._w.
    with fluid.imperative.guard():
        dy_mlp = MLP()
        dy_res = dy_mlp(np_inp)
        dy_out = dy_res._numpy()
        dy_res._backward()
        dy_grad = dy_mlp._fc1._w._gradient()

    # Static run: same model, backward ops appended, executed on CPU.
    with new_program_scope():
        inp = fluid.layers.data(
            name="inp", shape=[2, 2], append_batch_size=False)
        st_mlp = MLP()
        st_res = st_mlp(inp)
        grad_entries = fluid.backward.append_backward(
            st_res, parameter_list=[st_mlp._fc1._w.name])
        # Element 1 of the first returned entry is the variable to fetch.
        grad_var = grad_entries[0][1]
        exe = fluid.Executor(fluid.CPUPlace())
        # The startup program is executed before the main program is run.
        exe.run(fluid.default_startup_program())
        fetched = exe.run(
            feed={inp.name: np_inp},
            fetch_list=[st_res.name, grad_var.name])
        static_out, static_grad = fetched

    self.assertTrue(np.allclose(dy_out, static_out))
    self.assertTrue(np.allclose(dy_grad, static_grad))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -97,7 +97,7 @@ class TestLearningRateDecay(unittest.TestCase): ...@@ -97,7 +97,7 @@ class TestLearningRateDecay(unittest.TestCase):
startup_prog = fluid.Program() startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog):
decayed_lr = fluid_decay_fn(**kwargs) decayed_lr = fluid_decay_fn(**kwargs)("float32")
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
......
...@@ -24,11 +24,13 @@ from op_test import OpTest ...@@ -24,11 +24,13 @@ from op_test import OpTest
class TestMomentumOp1(OpTest): class TestMomentumOp1(OpTest):
def setUp(self): def setUp(self):
self.op_type = "momentum" self.op_type = "momentum"
self.dtype = np.float32
self.init_dtype()
param = np.random.random((123, 321)).astype("float32") param = np.random.random((123, 321)).astype(self.dtype)
grad = np.random.random((123, 321)).astype("float32") grad = np.random.random((123, 321)).astype(self.dtype)
velocity = np.zeros((123, 321)).astype("float32") velocity = np.zeros((123, 321)).astype(self.dtype)
learning_rate = np.array([0.001]).astype("float32") learning_rate = np.array([0.001]).astype(self.dtype)
mu = 0.0001 mu = 0.0001
use_nesterov = False use_nesterov = False
...@@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest): ...@@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest):
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
def init_dtype(self):
    """Dtype hook invoked from setUp right after self.dtype is set to float32.

    The base implementation changes nothing; subclasses override it to
    assign a different value to self.dtype.
    """
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
class TestMomentumOpFp16(TestMomentumOp1):
    """Variant of TestMomentumOp1 that selects np.float16 as self.dtype."""

    def init_dtype(self):
        # Replace the dtype chosen by the parent's setUp before inputs are built.
        self.dtype = np.float16

    def test_check_output(self):
        # Output check run with an explicit absolute tolerance of 1e-3.
        self.check_output(atol=1e-3)
class TestMomentumOp2(OpTest): class TestMomentumOp2(OpTest):
'''Test Momentum with default values for attributes '''Test Momentum with default values for attributes
''' '''
......
...@@ -23,8 +23,11 @@ class TestTopkOp(OpTest): ...@@ -23,8 +23,11 @@ class TestTopkOp(OpTest):
def setUp(self): def setUp(self):
self.set_args() self.set_args()
self.op_type = "top_k" self.op_type = "top_k"
self.dtype = np.float32
self.init_dtype()
k = self.top_k k = self.top_k
input = np.random.random((self.row, k)).astype("float32") input = np.random.random((self.row, k)).astype(self.dtype)
output = np.ndarray((self.row, k)) output = np.ndarray((self.row, k))
indices = np.ndarray((self.row, k)).astype("int64") indices = np.ndarray((self.row, k)).astype("int64")
...@@ -38,6 +41,9 @@ class TestTopkOp(OpTest): ...@@ -38,6 +41,9 @@ class TestTopkOp(OpTest):
self.outputs = {'Out': output, 'Indices': indices} self.outputs = {'Out': output, 'Indices': indices}
def init_dtype(self):
    """Dtype hook invoked from setUp right after self.dtype is set to float32.

    The base implementation changes nothing; subclasses override it to
    assign a different value to self.dtype.
    """
def set_args(self): def set_args(self):
self.row = 32 self.row = 32
self.top_k = 1 self.top_k = 1
...@@ -46,6 +52,11 @@ class TestTopkOp(OpTest): ...@@ -46,6 +52,11 @@ class TestTopkOp(OpTest):
self.check_output() self.check_output()
class TestTopkOpFp16(TestTopkOp):
    """TestTopkOp variant whose init_dtype selects np.float16."""

    def init_dtype(self):
        self.dtype = np.float16
class TestTopkOp3d(OpTest): class TestTopkOp3d(OpTest):
def setUp(self): def setUp(self):
self.op_type = "top_k" self.op_type = "top_k"
......
...@@ -107,9 +107,9 @@ packages=['paddle', ...@@ -107,9 +107,9 @@ packages=['paddle',
'paddle.fluid.distributed', 'paddle.fluid.distributed',
'paddle.fluid.layers', 'paddle.fluid.layers',
'paddle.fluid.contrib', 'paddle.fluid.contrib',
'paddle.fluid.contrib.utils',
'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize', 'paddle.fluid.contrib.quantize',
'paddle.fluid.contrib.utils',
'paddle.fluid.transpiler', 'paddle.fluid.transpiler',
'paddle.fluid.transpiler.details'] 'paddle.fluid.transpiler.details']
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册