提交 52842139 编写于 作者: Z zhoukunsheng

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into rsqrt

...@@ -134,7 +134,7 @@ paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', ...@@ -134,7 +134,7 @@ paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits',
paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d')) paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d'))
paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996')) paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996'))
paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e')) paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e'))
paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '013795af319e2e86d3506741941078ee')) paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23'))
paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e')) paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e'))
paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b')) paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b'))
paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b')) paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b'))
......
...@@ -195,8 +195,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto) ...@@ -195,8 +195,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc) proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper) cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......
...@@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout = out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims()); std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz; std::vector<int> out_tz = in_tz;
...@@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s", in.type()); "Input tensor type is not supported: %s", in.type());
memory::data_type out_type = in_type; memory::data_type out_type = in_type;
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
// output tensor has the same dims as input. Reorder don't change dims // output tensor has the same dims as input. Reorder don't change dims
out->Resize(in.dims()); out->Resize(in.dims());
// tempory mem pd fr out , to make reorder if (in_format != out_format) {
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out->dims()),
mkldnn::memory::format::blocked, out_type);
if (in.get_mkldnn_prim_desc() != out_mem_pd) {
void* in_data = GetDataFromTensor(in, in_type); void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data); auto in_memory =
auto out_memory = memory(out_mem_pd, out_data); memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
platform::Reorder(in_memory, out_memory); platform::Reorder(in_memory, out_memory);
} else { } else {
out->ShareDataWith(in); out->ShareDataWith(in);
} }
out->set_layout(out_layout); out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
#endif #endif
} }
......
...@@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type, ...@@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur // Just set layout/format. No real transform occur
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor); out.ShareDataWith(input_tensor);
// TODO(jczaja): Remove that once all mkldnn ops out.set_layout(DataLayout::kMKLDNN);
// are modified to work with mkldnn_blocked out.set_format(out_format);
auto mkldnn_fmt = [&](int rank) {
switch (rank) {
case 5:
return mkldnn::memory::format::ncdhw;
case 4:
return mkldnn::memory::format::nchw;
case 3:
return mkldnn::memory::format::ncw;
case 2:
return mkldnn::memory::format::nc;
case 1:
return mkldnn::memory::format::x;
default:
return mkldnn::memory::format::blocked;
}
};
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out.dims()),
mkldnn_fmt(out.dims().size()));
out.set_mkldnn_prim_desc(out_mem_pd);
#endif #endif
} else { } else {
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
...@@ -10,7 +10,10 @@ cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framewor ...@@ -10,7 +10,10 @@ cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framewor
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper) cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
...@@ -104,5 +107,7 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS ...@@ -104,5 +107,7 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass) memory_optimize_pass lock_free_optimize_pass
alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass
fuse_adam_op_pass fuse_sgd_op_pass)
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB
"fuse_parameter_memory_size is up limited memory size " "fuse_parameter_memory_size is up limited memory size "
"of one group parameters' gradient which is the input " "of one group parameters' gradient which is the input "
...@@ -105,20 +106,29 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -105,20 +106,29 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
auto ele_dtype = iter->second->Var()->GetDataType(); auto ele_dtype = iter->second->Var()->GetDataType();
if (dtype == kDefaultDtype) { if (dtype == kDefaultDtype) {
dtype = ele_dtype; dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype); PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype,
"The data type should not be bool.");
} }
PADDLE_ENFORCE_EQ(ele_dtype, dtype); PADDLE_ENFORCE_EQ(ele_dtype, dtype,
"The data type of input is not consistent.");
} }
// Create the fused variable name. // Create a FusedVarsSet to avoid duplicating names for fused_var in other
// pass.
if (!result.Has(kFusedVars)) { if (!result.Has(kFusedVars)) {
result.Set(kFusedVars, new FusedVars); result.Set(kFusedVars, new FusedVars);
} }
const std::string prefix(kFusedVarNamePrefix); // the kFusedGrads is used be fuse_optimizer_op_pass.
// The fused_var_name should be unique. result.Set(kFusedGrads, new FusedGrads);
auto fused_var_name = prefix + "GRAD@" + params_grads[0].second;
// the fused_var_name should be unique, so it appends
// params_grads.begin()->second.
auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" +
params_grads.begin()->second;
result.Get<FusedGrads>(kFusedGrads) = fused_var_name;
auto &fused_var_set = result.Get<FusedVars>(kFusedVars); auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
"%s is duplicate in FusedVars.", fused_var_name);
fused_var_set.insert(fused_var_name); fused_var_set.insert(fused_var_name);
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
...@@ -295,17 +305,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -295,17 +305,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
return type == proto::VarType::LOD_TENSOR; return type == proto::VarType::LOD_TENSOR;
} }
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", params_name);
op_desc->SetOutput("Output", grads_name);
op_desc->SetOutput("FusedOutput", {fused_var_name});
}
void RecordParamsAndGrads(ir::Node *node, void RecordParamsAndGrads(ir::Node *node,
ParamsAndGrads *params_grads) const { ParamsAndGrads *params_grads) const {
try { try {
...@@ -358,6 +357,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -358,6 +357,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
} }
} }
// Alloc continuous space for vars.
std::vector<std::string> grads_name; std::vector<std::string> grads_name;
std::vector<std::string> params_name; std::vector<std::string> params_name;
grads_name.reserve(params_grads.size()); grads_name.reserve(params_grads.size());
...@@ -370,7 +370,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -370,7 +370,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
program_desc.MutableBlock(0)); program_desc.MutableBlock(0));
// Run Only Once Programs
for (size_t i = 0; i < local_scopes.size(); ++i) { for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : program_desc.Block(0).AllOps()) { for (auto &op_desc : program_desc.Block(0).AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc); auto op = OpRegistry::CreateOp(*op_desc);
...@@ -378,6 +377,17 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { ...@@ -378,6 +377,17 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
} }
} }
} }
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", params_name);
op_desc->SetOutput("Output", grads_name);
op_desc->SetOutput("FusedOutput", {fused_var_name});
}
}; };
} // namespace details } // namespace details
......
...@@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() { ...@@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() {
if (places_.size() == 1) return; if (places_.size() == 1) return;
// The input and output may have dummy vars. // The input and output may have dummy vars.
VarHandle *in_var_handle; auto in_var_handles = DynamicCast<VarHandle>(inputs_);
{
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
in_var_handle = in_var_handles[0];
}
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(), out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places."); "The number of output should equal to the number of places.");
VarHandle *in_var_handle = in_var_handles[0];
WaitInputVarGenerated(); WaitInputVarGenerated();
std::vector<const Scope *> var_scopes; std::vector<const Scope *> var_scopes;
......
...@@ -17,7 +17,6 @@ limitations under the License. */ ...@@ -17,7 +17,6 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <memory> #include <memory>
#include <utility> #include <utility>
#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
...@@ -82,23 +81,43 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -82,23 +81,43 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("inplace_pass"); AppendPass("inplace_pass");
} }
if (strategy.fuse_elewise_add_act_ops_) { if (strategy_.fuse_elewise_add_act_ops_) {
VLOG(10) << "Add fuse_elewise_add_act_pass"; VLOG(10) << "Add fuse_elewise_add_act_pass";
AppendPass("fuse_elewise_add_act_pass"); AppendPass("fuse_elewise_add_act_pass");
} }
// for single card training, fuse_all_reduce_ops is unnecessary. // for single card training, fuse_all_reduce_ops is unnecessary.
// alloc_continuous_space_for_grad_pass should be before of MultiDevPass. // alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
if (strategy.fuse_all_reduce_ops_) { if (strategy_.fuse_all_reduce_ops_) {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass"); AppendPass("alloc_continuous_space_for_grad_pass");
} }
if (strategy_.fuse_all_optimizer_ops_) {
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce ||
strategy_.is_distribution_) {
VLOG(3)
<< "Currently, fuse_all_optimizer_ops only works under AllReduce "
"mode.";
strategy_.fuse_all_optimizer_ops_ = false;
} else {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
// NOTE: fuse_all_xx_ops will count the number of xx operator first,
// if the number is zero, fuse_all_reduce_ops will do nothing.
// Currently, only one type of optimization algorithm can be fused.
VLOG(10) << "Add fuse_adam_op_pass";
AppendPass("fuse_adam_op_pass");
VLOG(10) << "Add fuse_sgd_op_pass";
AppendPass("fuse_sgd_op_pass");
}
}
// Add a graph viz pass to record a graph. // Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) { if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass"); auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf( const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); "%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path)); viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
} }
...@@ -118,14 +137,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -118,14 +137,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// the de-fact IR, any reuse on Graph is meaningless. // the de-fact IR, any reuse on Graph is meaningless.
// A side-effect of that, memory optimize cannot forsee the fetched vars // A side-effect of that, memory optimize cannot forsee the fetched vars
// , so fetchlist should be set persistable before call the Run interface. // , so fetchlist should be set persistable before call the Run interface.
if (strategy.memory_optimize_) { if (strategy_.memory_optimize_) {
VLOG(10) << "Add memory_optimize_pass"; VLOG(10) << "Add memory_optimize_pass";
AppendPass("memory_optimize_pass"); AppendPass("memory_optimize_pass");
} }
AppendMultiDevPass(strategy); AppendMultiDevPass(strategy_);
if (strategy.fuse_all_reduce_ops_) { if (strategy_.fuse_all_reduce_ops_) {
// NOTE: fuse_all_reduce_ops will count the number of all_reduce operator // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator
// first, if the number is zero, fuse_all_reduce_ops will do nothing. // first, if the number is zero, fuse_all_reduce_ops will do nothing.
VLOG(10) << "Add fuse_all_reduce_op_pass"; VLOG(10) << "Add fuse_all_reduce_op_pass";
...@@ -151,7 +170,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -151,7 +170,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("all_reduce_deps_pass"); AppendPass("all_reduce_deps_pass");
} }
if (SeqOnlyAllReduceOps(strategy)) { if (SeqOnlyAllReduceOps(strategy_)) {
VLOG(10) << "Add all_reduce_deps_pass"; VLOG(10) << "Add all_reduce_deps_pass";
AppendPass("all_reduce_deps_pass"); AppendPass("all_reduce_deps_pass");
} }
...@@ -165,7 +184,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -165,7 +184,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Convert graph to run on multi-devices. // Convert graph to run on multi-devices.
void AppendMultiDevPass(const BuildStrategy &strategy) { void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass = nullptr; ir::Pass *multi_devices_pass = nullptr;
if (strategy_.is_distribution_) { if (strategy.is_distribution_) {
VLOG(10) << "Add dist_multi_devices_pass"; VLOG(10) << "Add dist_multi_devices_pass";
multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else { } else {
...@@ -235,17 +254,22 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, ...@@ -235,17 +254,22 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
pass->Erase(kNCCLCtxs); pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx); pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif #endif
} else if (pass->Type() == "fuse_all_reduce_op_pass") { } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
pass->Type() == "fuse_adam_op_pass" ||
pass->Type() == "fuse_sgd_op_pass" ||
pass->Type() == "fuse_all_reduce_op_pass") {
pass->Erase(kPlaces); pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places); pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes); pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes, pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes); &local_scopes);
if (pass->Type() == "fuse_all_reduce_op_pass") {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs); pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx); pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif #endif
}
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass") { } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
pass->Erase(kPlaces); pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places); pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
...@@ -294,4 +318,6 @@ USE_PASS(inplace_pass); ...@@ -294,4 +318,6 @@ USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass); USE_PASS(lock_free_optimize_pass);
USE_PASS(alloc_continuous_space_for_grad_pass); USE_PASS(alloc_continuous_space_for_grad_pass);
USE_PASS(graph_to_program_pass); USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass);
USE_PASS(fuse_all_reduce_op_pass); USE_PASS(fuse_all_reduce_op_pass);
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -76,6 +75,8 @@ struct BuildStrategy { ...@@ -76,6 +75,8 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false}; bool fuse_elewise_add_act_ops_{false};
bool fuse_all_optimizer_ops_{false};
bool fuse_all_reduce_ops_{false}; bool fuse_all_reduce_ops_{false};
bool fuse_relu_depthwise_conv_{false}; bool fuse_relu_depthwise_conv_{false};
......
...@@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( ...@@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
local_scopes_(local_scopes), local_scopes_(local_scopes),
places_(places), places_(places),
graph_(graph), graph_(graph),
fetch_ctxs_(places),
pool_(strategy.num_threads_), pool_(strategy.num_threads_),
prepare_pool_(1), // add one more thread for generate op_deps // add one more thread for generate op_deps
fetch_ctxs_(places) { prepare_pool_(1) {
for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) { for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
int dep = static_cast<int>(op->NotReadyInputSize()); int dep = static_cast<int>(op->NotReadyInputSize());
op_deps_.emplace(op, dep); op_deps_.emplace(op, dep);
......
...@@ -14,7 +14,9 @@ ...@@ -14,7 +14,9 @@
#pragma once #pragma once
#include <ThreadPool.h> #include <ThreadPool.h>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/exception_holder.h"
...@@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
const ir::Graph &Graph() const override; const ir::Graph &Graph() const override;
private: private:
// Note(zcd): the ThreadPool should be placed last so that ThreadPool should
// be destroyed first.
ExecutionStrategy strategy_; ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
...@@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::unordered_map<OpHandleBase *, int> op_deps_; std::unordered_map<OpHandleBase *, int> op_deps_;
std::vector<OpHandleBase *> bootstrap_ops_; std::vector<OpHandleBase *> bootstrap_ops_;
::ThreadPool pool_;
::ThreadPool prepare_pool_;
platform::DeviceContextPool fetch_ctxs_; platform::DeviceContextPool fetch_ctxs_;
std::atomic<int> remaining_; std::atomic<int> remaining_;
std::future<
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
atomic_op_deps_;
ExceptionHolder exception_;
::ThreadPool pool_;
::ThreadPool prepare_pool_;
void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps, void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op, OpHandleBase *op,
const std::shared_ptr<BlockingQueue<size_t>> &complete_q); const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
void PrepareAtomicOpDeps(); void PrepareAtomicOpDeps();
std::future<
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
atomic_op_deps_;
ExceptionHolder exception_;
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_adam_op_pass.h"
#include <algorithm>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
// Returns the operator type name associated with this pass: "adam".
const std::string FuseAdamOpPass::GetOpType() const {
  return "adam";
}
// Returns the adam slot names reported as auxiliary variables, in a fixed
// order: Param, Moment1, Moment2, Beta1Pow, Beta2Pow.
const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
  std::vector<std::string> aux_var_names{"Param", "Moment1", "Moment2",
                                         "Beta1Pow", "Beta2Pow"};
  return aux_var_names;
}
// Entry point for fusing a group of adam operators.
//
// First builds the fused adam op via FuseAdamOps, then calls FuseScaleOps
// once for the "Beta1Pow" variables and once for the "Beta2Pow" variables
// (in that order).
//
// aux_var_set:     slot name -> names of the per-op variables of that slot.
// fused_vars_name: slot name -> name of the corresponding fused variable.
// adam_ops:        the adam op nodes to be fused.
// graph:           the graph that is modified.
//
// Both maps are accessed with at(), so a missing "Beta1Pow"/"Beta2Pow" key
// raises std::out_of_range.
void FuseAdamOpPass::FuseOptimizerOps(
    const std::unordered_map<std::string, std::vector<std::string>>
        &aux_var_set,
    const std::unordered_map<std::string, std::string> &fused_vars_name,
    const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
  FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph);

  // The beta power accumulators are handled one slot at a time.
  const std::string beta_pow_slots[] = {"Beta1Pow", "Beta2Pow"};
  for (const std::string &slot : beta_pow_slots) {
    FuseScaleOps(aux_var_set.at(slot), fused_vars_name.at(slot), adam_ops,
                 graph);
  }
}
// Builds a single adam op that operates on the fused variables and inserts
// it into the graph.
//
// vars_set:        slot name -> per-op variable names (not read here).
// fused_vars_name: slot name -> fused variable name; must contain "Param",
//                  "Grad", "Moment1" and "Moment2" (looked up with at()).
// adam_ops:        the adam op nodes being fused; must be non-empty.
// graph:           the graph in which the fused op node is created.
//
// The attributes of the first op are taken as the reference and every op in
// adam_ops is enforced to carry the same values.
void FuseAdamOpPass::FuseAdamOps(
    const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
    const std::unordered_map<std::string, std::string> &fused_vars_name,
    const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
  PADDLE_ENFORCE_GT(adam_ops.size(), static_cast<size_t>(0));

  // The first op serves as the template for the fused op.
  auto *first_desc = adam_ops[0]->Op();

  // Read the reference attributes.
  // NOTE: If a new attribute is added to adam, this code may need to change.
  int op_role = boost::get<int>(
      first_desc->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
  float beta1 = boost::get<float>(first_desc->GetAttr("beta1"));
  float beta2 = boost::get<float>(first_desc->GetAttr("beta2"));
  float epsilon = boost::get<float>(first_desc->GetAttr("epsilon"));
  bool lazy_mode = boost::get<bool>(first_desc->GetAttr("lazy_mode"));
  int64_t min_row_size_to_use_multithread = boost::get<int64_t>(
      first_desc->GetAttr("min_row_size_to_use_multithread"));

  // Every op must agree with the reference attributes.
  for (auto &adam_op : adam_ops) {
    PADDLE_ENFORCE_EQ(beta1,
                      boost::get<float>(adam_op->Op()->GetAttr("beta1")));
    PADDLE_ENFORCE_EQ(beta2,
                      boost::get<float>(adam_op->Op()->GetAttr("beta2")));
    PADDLE_ENFORCE_EQ(epsilon,
                      boost::get<float>(adam_op->Op()->GetAttr("epsilon")));
    PADDLE_ENFORCE_EQ(lazy_mode,
                      boost::get<bool>(adam_op->Op()->GetAttr("lazy_mode")));
    PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread,
                      boost::get<int64_t>(adam_op->Op()->GetAttr(
                          "min_row_size_to_use_multithread")));
    PADDLE_ENFORCE_EQ(op_role, boost::get<int>(adam_op->Op()->GetAttr(
                                   OpProtoAndCheckerMaker::OpRoleAttrName())));
  }

  // NOTE: the fused variables only exist in the scope, so the graph has no
  // node for them.
  VLOG(10) << "Insert adam to graph ";
  OpDesc adam_desc(first_desc->Block());
  adam_desc.SetType("adam");

  // Inputs that are replaced by their fused counterparts.
  const std::string fused_input_slots[] = {"Param", "Grad", "Moment1",
                                           "Moment2"};
  for (const std::string &slot : fused_input_slots) {
    adam_desc.SetInput(slot, {fused_vars_name.at(slot)});
  }

  // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
  // These inputs are copied unchanged from the first op.
  const std::string shared_input_slots[] = {"LearningRate", "Beta1Pow",
                                            "Beta2Pow"};
  for (const std::string &slot : shared_input_slots) {
    adam_desc.SetInput(slot, first_desc->Input(slot));
  }

  // Each "<slot>Out" output is set to the fused variable of "<slot>".
  const std::string fused_output_slots[] = {"Param", "Moment1", "Moment2"};
  for (const std::string &slot : fused_output_slots) {
    adam_desc.SetOutput(slot + "Out", {fused_vars_name.at(slot)});
  }

  adam_desc.SetAttr("beta1", beta1);
  adam_desc.SetAttr("beta2", beta2);
  adam_desc.SetAttr("epsilon", epsilon);
  adam_desc.SetAttr("lazy_mode", lazy_mode);
  adam_desc.SetAttr("min_row_size_to_use_multithread",
                    min_row_size_to_use_multithread);
  adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);

  auto adam_node = graph->CreateOpNode(&adam_desc);
  InserInputAndOutputForOptOps(adam_ops, adam_node);
}
// Replaces the per-parameter "scale" ops that update one beta-power
// accumulator of the adam ops with a single "scale" op working on the fused
// variable.
//
// beta_name:      name of the accumulator variable of each adam op;
//                 beta_name[i] belongs to adam_ops[i].
// fused_var_name: name of the fused variable, used as both "X" and "Out" of
//                 the new scale op.
// adam_ops:       the adam op nodes being fused; must not be empty.
// graph:          graph that receives the fused node and loses the originals.
//
// All located scale ops must share the same scale, bias, bias_after_scale and
// op-role attributes; a mismatch fails a PADDLE_ENFORCE_EQ check.
void FuseAdamOpPass::FuseScaleOps(const std::vector<std::string> &beta_name,
                                  const std::string &fused_var_name,
                                  const std::vector<ir::Node *> &adam_ops,
                                  ir::Graph *graph) const {
  PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size());
  // scale_ops[0] is dereferenced below, so an empty op list must be rejected
  // here (FuseAdamOps performs the same check).
  PADDLE_ENFORCE_GT(adam_ops.size(), static_cast<size_t>(0));

  const std::string scale_op_name = "scale";

  // Get the scale_ops of dealing the adam's beta var.
  std::vector<ir::Node *> scale_ops;
  scale_ops.reserve(beta_name.size());
  for (size_t i = 0; i < adam_ops.size(); ++i) {
    auto &beta_1_pow_name = beta_name[i];
    // Locate the accumulator variable node among the adam op's inputs.
    auto beta_pow_iter = std::find_if(
        adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(),
        [&beta_1_pow_name](ir::Node *var_node) -> bool {
          return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name;
        });
    PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end());

    // The first "scale" op consuming that variable is the one to fuse.
    auto beta_pow_node = *beta_pow_iter;
    auto scale_op_iter = std::find_if(
        beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(),
        [&scale_op_name](ir::Node *op_node) -> bool {
          return op_node->Op() && op_node->Op()->Type() == scale_op_name;
        });
    PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end());
    scale_ops.emplace_back(*scale_op_iter);
  }
  PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size());

  // Check attributes.
  // NOTE: If a new attribute is added, the following code may need to change.
  int op_role = boost::get<int>(
      scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
  float scale = boost::get<float>(scale_ops[0]->Op()->GetAttr("scale"));
  float bias = boost::get<float>(scale_ops[0]->Op()->GetAttr("bias"));
  bool bias_after_scale =
      boost::get<bool>(scale_ops[0]->Op()->GetAttr("bias_after_scale"));
  for (auto &scale_op : scale_ops) {
    PADDLE_ENFORCE_EQ(scale,
                      boost::get<float>(scale_op->Op()->GetAttr("scale")));
    PADDLE_ENFORCE_EQ(bias, boost::get<float>(scale_op->Op()->GetAttr("bias")));
    PADDLE_ENFORCE_EQ(
        bias_after_scale,
        boost::get<bool>(scale_op->Op()->GetAttr("bias_after_scale")));
    PADDLE_ENFORCE_EQ(op_role, boost::get<int>(scale_op->Op()->GetAttr(
                                   OpProtoAndCheckerMaker::OpRoleAttrName())));
  }

  // NOTE: the fused variable only exists in the scope, so the graph has no
  // node for it.
  VLOG(10) << "Insert fused scale to graph.";
  OpDesc scale_desc(scale_ops[0]->Op()->Block());
  scale_desc.SetType("scale");
  scale_desc.SetInput("X", {fused_var_name});
  scale_desc.SetOutput("Out", {fused_var_name});
  scale_desc.SetAttr("scale", scale);
  scale_desc.SetAttr("bias", bias);
  scale_desc.SetAttr("bias_after_scale", bias_after_scale);
  scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
  auto scale_node = graph->CreateOpNode(&scale_desc);

  // Re-point every edge of the old scale ops at the fused node.
  for (auto scale_op : scale_ops) {
    // set inputs
    scale_node->inputs.insert(scale_node->inputs.begin(),
                              scale_op->inputs.begin(), scale_op->inputs.end());
    for (auto &input : scale_op->inputs) {
      std::replace(input->outputs.begin(), input->outputs.end(), scale_op,
                   scale_node);
    }
    // set outputs
    scale_node->outputs.insert(scale_node->outputs.begin(),
                               scale_op->outputs.begin(),
                               scale_op->outputs.end());
    for (auto &output : scale_op->outputs) {
      std::replace(output->inputs.begin(), output->inputs.end(), scale_op,
                   scale_node);
    }
  }

  // Delete scale_ops
  for (auto &scale_op : scale_ops) {
    graph->RemoveNode(scale_op);
  }
}
} // namespace details
} // namespace framework
} // namespace paddle
// Registers FuseAdamOpPass under the name "fuse_adam_op_pass" and declares
// kPlaces and kLocalScopes as pass attributes that must be set before the pass
// is applied.
REGISTER_PASS(fuse_adam_op_pass, paddle::framework::details::FuseAdamOpPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
// Optimizer-fusion pass for adam: implements the hooks of FuseOptimizerOpPass
// and additionally fuses the scale ops that update the beta-power
// accumulators.
class FuseAdamOpPass : public FuseOptimizerOpPass {
 private:
  // Type name of the optimizer op handled by this pass.
  const std::string GetOpType() const override;

  // Names of the op inputs whose variables are fused.
  const std::vector<std::string> GetAuxiliaryVarNames() const override;

  // Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow"
  void FuseOptimizerOps(
      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
      const std::unordered_map<std::string, std::string> &fused_vars_name,
      const std::vector<ir::Node *> &adam_ops,
      ir::Graph *graph) const override;

  // Replaces `adam_ops` by a single adam op that works on the fused variables.
  void FuseAdamOps(
      const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
      const std::unordered_map<std::string, std::string> &fused_vars_name,
      const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const;

  // Replaces the scale ops that consume the variables named in `beta_name`
  // (one per adam op) by a single scale op on `fused_var_name`.
  void FuseScaleOps(const std::vector<std::string> &beta_name,
                    const std::string &fused_var_name,
                    const std::vector<ir::Node *> &adam_ops,
                    ir::Graph *graph) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include <algorithm>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
// Fuses all optimizer ops of type GetOpType() found in `graph` into one op.
//
// Pass attributes read: kPlaces, kLocalScopes.
// Graph attributes read: kParamsAndGrads, kFusedGrads.
// Graph attributes written: kFusedOptType, kFusedVars.
//
// Returns without touching the graph when no matching op exists. Also returns
// early when kFusedOptType is already set, because only one optimizer type
// can be fused per graph.
void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
  ir::Graph &result = *graph;

  auto &places = Get<const std::vector<platform::Place>>(kPlaces);
  auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);

  const std::string fuse_op_type = GetOpType();
  const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();

  // Step 1: Get the specified op and auxiliary variables.
  std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
  std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
  std::vector<ir::Node *> opt_ops;
  for (auto &node : topo_nodes) {
    GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops,
                           &aux_var_set);
  }

  VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size();
  if (opt_ops.empty()) {
    return;
  }

  if (result.Has(kFusedOptType)) {
    VLOG(10)
        << "Currently only support fusing one type optimizer op. Has fused "
        << result.Get<FusedOptType>(kFusedOptType);
    return;
  }
  result.Set(kFusedOptType, new FusedOptType(fuse_op_type));

  // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be
  // initialized in scopes before execution.
  if (!result.Has(kFusedVars)) {
    result.Set(kFusedVars, new FusedVars);
  }
  std::unordered_map<std::string, std::string> fused_vars_name;
  fused_vars_name.reserve(aux_var_names.size() + 1);
  auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
  const std::string prefix(kFusedVarNamePrefix);
  // NOTE: the fused_var_name should be unique.
  for (auto &var_name : aux_var_names) {
    // The first collected variable name makes the fused name unique; at()
    // turns a missing entry into an exception instead of undefined behaviour.
    auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
                          aux_var_set.at(var_name).at(0);
    VLOG(10) << fused_var_name;
    fused_vars_name.emplace(var_name, fused_var_name);
    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
    fused_var_set.insert(fused_var_name);
  }

  // Step 3: Get the fused Gradient's name
  // The presence check has to come before the attribute is fetched.
  PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads.");
  auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);

  if (!result.Has(kFusedGrads)) {
    PADDLE_THROW(
        "The alloc_continuous_space_for_grad_pass should be called before this "
        "pass.");
  }
  auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
  // FusedVars is a hash set, so membership is tested with count() rather than
  // a linear std::find.
  PADDLE_ENFORCE(fused_var_set.count(fused_grad) != 0,
                 "Not find the fused_grad.");
  fused_vars_name.emplace("Grad", fused_grad);

  // Step 4: Sort the parameters and auxiliary variables according
  // to parameters' name to make variables' name correspond correctly.
  PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(),
                    "The size of params_grads and aux_var_set are not equal.");
  SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);

  // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
  // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
  InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
                                    aux_var_set, fused_vars_name);

  // Step 6: Fuse optimizer Ops and Scale Ops
  FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);

  // Step 7: Remove optimizer Ops
  for (auto &opt_op : opt_ops) {
    graph->RemoveNode(opt_op);
  }
}
// Creates the fused variables in every local scope and then runs one
// "alloc_continuous_space" op per auxiliary variable name in each scope.
//
// places / local_scopes: local_scopes[i] is run on places[i].
// aux_var_names:         auxiliary variable names to process.
// aux_var_set:           per name, the variables to place in the fused space.
// fused_vars_name:       per name, the name of the fused variable.
//
// Fails if a fused variable already exists in one of the scopes.
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
    const std::vector<platform::Place> &places,
    const std::vector<Scope *> &local_scopes,
    const std::vector<std::string> &aux_var_names,
    const std::unordered_map<std::string, std::vector<std::string>>
        &aux_var_set,
    const std::unordered_map<std::string, std::string> &fused_vars_name) const {
  // places[i] is read for every scope index below.
  PADDLE_ENFORCE(local_scopes.size() <= places.size(),
                 "The number of places (%d) is less than the number of local "
                 "scopes (%d).",
                 places.size(), local_scopes.size());

  VLOG(10) << "Init FusedVars.";
  // Alloc parameters and auxiliary vars in the respective scope.
  // Scopes are visited from last to first; `idx` is the real index of the
  // scope, so the error message names the right one.
  // NOTE(review): the reason for the reverse order is not evident here.
  for (size_t idx = local_scopes.size(); idx-- > 0;) {
    Scope *scope = local_scopes[idx];
    for (auto &var_name : aux_var_names) {
      const auto &fused_var_name = fused_vars_name.at(var_name);
      VLOG(10) << "Init " << fused_var_name;
      PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
                     "%s has exist in scope[%d]", fused_var_name, idx);
      scope->Var(fused_var_name)->GetMutable<LoDTensor>();
    }
  }

  // Describe the allocation ops in a temporary program.
  ProgramDesc program_desc;
  auto *global_block = program_desc.MutableBlock(0);
  for (auto &var_name : aux_var_names) {
    AppendAllocContinuousSpace(aux_var_set.at(var_name),
                               fused_vars_name.at(var_name), true,
                               global_block);
  }

  // Execute them once per scope on the matching place.
  for (size_t i = 0; i < local_scopes.size(); ++i) {
    for (auto &op_desc : global_block->AllOps()) {
      auto op = OpRegistry::CreateOp(*op_desc);
      op->Run(*local_scopes[i], places[i]);
    }
  }
}
// Reorders every vector in `aux_vars_set` and the `ops` vector so that entry i
// corresponds to the parameter params_grads[i].first.
//
// params_grads: (parameter, gradient) name pairs defining the target order.
// aux_vars_set: per auxiliary name, the variable names in op order; must
//               contain the key "Param". Reordered in place.
// ops:          optimizer op nodes in the same order as the vectors above.
//               Reordered in place.
//
// Fails if "Param" is missing, if a parameter of `params_grads` is not among
// the "Param" variables, or if a vector is longer than `params_grads`.
void FuseOptimizerOpPass::SortParametersAndAuxVars(
    const std::vector<std::pair<std::string, std::string>> &params_grads,
    std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
    std::vector<ir::Node *> *ops) const {
  PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast<size_t>(0));
  auto &param_vec = aux_vars_set->at("Param");

  // param_sort_idx[i] is the current position of params_grads[i].first.
  std::vector<size_t> param_sort_idx;
  param_sort_idx.reserve(param_vec.size());
  for (auto &p_g : params_grads) {
    auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first);
    PADDLE_ENFORCE(iter != param_vec.end());
    auto idx = std::distance(param_vec.begin(), iter);
    param_sort_idx.emplace_back(idx);
  }

  for (auto &aux_vars : *aux_vars_set) {
    // param_sort_idx[i] is read for every element, so it must be long enough.
    PADDLE_ENFORCE(aux_vars.second.size() <= param_sort_idx.size(),
                   "The number of auxiliary variables exceeds the number of "
                   "params_grads.");
    std::vector<std::string> sorted_vars;
    sorted_vars.reserve(aux_vars.second.size());
    for (size_t i = 0; i < aux_vars.second.size(); ++i) {
      sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i]));
    }
    std::swap(aux_vars.second, sorted_vars);

    std::stringstream out;
    for (auto &var_name : aux_vars.second) {
      out << var_name << " ";
    }
    VLOG(10) << aux_vars.first << ": " << out.str();
  }

  // Same bound for the op vector.
  PADDLE_ENFORCE(ops->size() <= param_sort_idx.size(),
                 "The number of optimizer ops exceeds the number of "
                 "params_grads.");
  std::vector<ir::Node *> sorted_ops;
  sorted_ops.reserve(ops->size());
  for (size_t i = 0; i < ops->size(); ++i) {
    sorted_ops.emplace_back(ops->at(param_sort_idx[i]));
  }
  std::swap(*ops, sorted_ops);
}
// If `node` is an op of type `op_type`, appends it to `ops` and appends the
// single argument bound to each input named in `aux_vars_name` to the matching
// vector of `aux_args_name`. Other nodes are ignored.
//
// Fails if one of the named inputs does not have exactly one argument.
void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
    const std::string &op_type, const std::vector<std::string> &aux_vars_name,
    ir::Node *node, std::vector<ir::Node *> *ops,
    std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
    const {
  // Skip nodes that carry no OpDesc instead of dereferencing a null pointer.
  auto *op_desc = node->Op();
  if (op_desc == nullptr || op_desc->Type() != op_type) return;

  for (auto &var_n : aux_vars_name) {
    const auto &arg_names = op_desc->Input(var_n);
    PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1));
    (*aux_args_name)[var_n].emplace_back(arg_names[0]);
    VLOG(10) << var_n << ", " << arg_names[0];
  }
  ops->emplace_back(node);
}
// Appends an "alloc_continuous_space" op to `global_block`. The op takes
// `args` as both "Input" and "Output", writes the fused result to `out_arg`,
// forwards `copy_data` as the "copy_data" attribute and sets "check_name" to
// true.
void FuseOptimizerOpPass::AppendAllocContinuousSpace(
    const std::vector<std::string> &args, const std::string &out_arg,
    bool copy_data, BlockDesc *global_block) const {
  auto *alloc_op = global_block->AppendOp();
  alloc_op->SetType("alloc_continuous_space");

  // The same variables are read and written.
  alloc_op->SetInput("Input", args);
  alloc_op->SetOutput("Output", args);
  alloc_op->SetOutput("FusedOutput", {out_arg});

  alloc_op->SetAttr("copy_data", copy_data);
  alloc_op->SetAttr("check_name", true);
}
// Makes `opt_node` take over all graph edges of `opt_ops`: every input and
// output variable node of the old ops is re-pointed at `opt_node`, and the
// de-duplicated union of those variable nodes becomes the inputs/outputs of
// `opt_node`. The old op nodes are left in the graph; the caller removes
// them.
void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
    const std::vector<ir::Node *> &opt_ops, ir::Node *opt_node) const {
  // Sets drop variable nodes shared by several ops.
  std::unordered_set<ir::Node *> inputs;
  std::unordered_set<ir::Node *> outputs;
  for (auto opt_op : opt_ops) {
    // set inputs
    inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end());
    for (auto &input : opt_op->inputs) {
      // Qualified call: does not depend on ADL finding std::replace.
      std::replace(input->outputs.begin(), input->outputs.end(), opt_op,
                   opt_node);
    }
    // set outputs
    outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end());
    for (auto &output : opt_op->outputs) {
      std::replace(output->inputs.begin(), output->inputs.end(), opt_op,
                   opt_node);
    }
  }
  opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(),
                          inputs.end());
  opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(),
                           outputs.end());
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseOptimizerOpPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const override;
protected:
virtual void SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_var_set,
std::vector<ir::Node *> *ops) const;
void InserInputAndOutputForOptOps(const std::vector<ir::Node *> &opt_ops,
ir::Node *opt_node) const;
private:
virtual const std::string GetOpType() const = 0;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const = 0;
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
void GetSpecifiedOpsAndVars(
const std::string &op_type, const std::vector<std::string> &aux_vars_name,
ir::Node *node, std::vector<ir::Node *> *ops,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const;
void AppendAllocContinuousSpace(const std::vector<std::string> &args,
const std::string &out_arg, bool copy_data,
BlockDesc *global_block) const;
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name)
const;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h"
#include <algorithm>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
// Type name of the optimizer op fused by this pass.
const std::string FuseSgdOpPass::GetOpType() const {
  return std::string("sgd");
}
// The only sgd input whose variables are fused is "Param".
const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
  return std::vector<std::string>{"Param"};
}
// Entry point used by the base pass. For sgd nothing besides the optimizer
// ops themselves is fused, so this simply forwards to FuseSgdOps.
void FuseSgdOpPass::FuseOptimizerOps(
    const std::unordered_map<std::string, std::vector<std::string>>
        &aux_var_set,
    const std::unordered_map<std::string, std::string> &fused_vars_name,
    const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
  FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph);
}
// Builds one "sgd" op that works on the fused "Param" and "Grad" variables and
// wires it into the graph in place of the individual sgd ops. The op role and
// the "LearningRate" input are taken from the first op of `sgd_ops`.
// `vars_set` is not read by this function.
void FuseSgdOpPass::FuseSgdOps(
    const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
    const std::unordered_map<std::string, std::string> &fused_vars_name,
    const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
  PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast<size_t>(0));

  // NOTE: the fused variables only exist in the scope, so the graph has no
  // node for them.
  auto *reference_desc = sgd_ops.front()->Op();
  const int op_role = boost::get<int>(
      reference_desc->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
  VLOG(10) << "Insert sgd to graph ";

  // Describe the fused sgd op.
  OpDesc fused_sgd_desc(reference_desc->Block());
  fused_sgd_desc.SetType("sgd");
  fused_sgd_desc.SetInput("Param", {fused_vars_name.at("Param")});
  fused_sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
  fused_sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});

  // TODO(zcd): the LearningRate of all fused ops should be equal; only the
  // first op's value is used here.
  fused_sgd_desc.SetInput("LearningRate",
                          reference_desc->Input("LearningRate"));

  // NOTE: multi_devices_pass requires that every op should have a role.
  fused_sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);

  ir::Node *fused_sgd_node = graph->CreateOpNode(&fused_sgd_desc);
  InserInputAndOutputForOptOps(sgd_ops, fused_sgd_node);
}
} // namespace details
} // namespace framework
} // namespace paddle
// Registers FuseSgdOpPass under the name "fuse_sgd_op_pass" and declares
// kPlaces and kLocalScopes as pass attributes that must be set before the pass
// is applied.
REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::details::FuseSgdOpPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseSgdOpPass : public FuseOptimizerOpPass {
private:
virtual const std::string GetOpType() const;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const;
// Fuse Sgd Ops
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
void FuseSgdOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -24,6 +24,19 @@ namespace paddle { ...@@ -24,6 +24,19 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
// Note(zcd): Addresses should be aligned; otherwise the results may differ.
//
// Rounds `size` up to the next multiple of the minimum chunk size used for
// `place`: 256 B (1 << 8) for a GPU place, 4 KB (1 << 12) otherwise. A size
// that is already a multiple is returned unchanged.
static size_t Alignment(size_t size, const platform::Place &place) {
  const size_t granularity =
      platform::is_gpu_place(place) ? (1 << 8) : (1 << 12);
  const size_t overshoot = size % granularity;
  if (overshoot == 0) {
    return size;
  }
  return size + (granularity - overshoot);
}
typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>> typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor; GradientAndLoDTensor;
...@@ -111,10 +124,11 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -111,10 +124,11 @@ void FusedAllReduceOpHandle::RunImpl() {
return grad1.second->data<void>() < grad2.second->data<void>(); return grad1.second->data<void>() < grad2.second->data<void>();
}); });
size_t size_of_dtype = framework::SizeOfType(dtype);
for (size_t k = 1; k < g_tensor.size(); ++k) { for (size_t k = 1; k < g_tensor.size(); ++k) {
const void *cur_address = g_tensor.at(k - 1).second->data<void>(); const void *cur_address = g_tensor.at(k - 1).second->data<void>();
int64_t len = g_tensor.at(k - 1).second->numel(); int64_t len = g_tensor.at(k - 1).second->numel();
auto offset = len * framework::SizeOfType(dtype); auto offset = Alignment(len * size_of_dtype, places_[0]);
void *infer_next_address = reinterpret_cast<void *>( void *infer_next_address = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(cur_address) + offset); reinterpret_cast<uintptr_t>(cur_address) + offset);
const void *next_address = g_tensor.at(k).second->data<void>(); const void *next_address = g_tensor.at(k).second->data<void>();
...@@ -228,18 +242,21 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( ...@@ -228,18 +242,21 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor, const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
proto::VarType::Type *dtype, int64_t *numel) const { proto::VarType::Type *dtype, int64_t *numel) const {
*numel = 0; *numel = 0;
size_t size_of_dtype = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) { for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
*numel += len;
// Get dtype // Get dtype
auto ele_type = grad_tensor.at(i).second->type(); auto ele_type = grad_tensor.at(i).second->type();
if (i == 0) { if (i == 0) {
*dtype = ele_type; *dtype = ele_type;
size_of_dtype = framework::SizeOfType(ele_type);
} }
PADDLE_ENFORCE_EQ(ele_type, *dtype); PADDLE_ENFORCE_EQ(ele_type, *dtype);
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
// Alignment(len)
*numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
} }
} }
......
...@@ -156,7 +156,6 @@ void InplacePass::ApplyImpl(ir::Graph* graph) const { ...@@ -156,7 +156,6 @@ void InplacePass::ApplyImpl(ir::Graph* graph) const {
continue; continue;
TryInplaceOpInputOutput(op, graph); TryInplaceOpInputOutput(op, graph);
} }
// graph->ResolveHazard(var_nodes_);
} }
void InplacePass::InplaceModifyDesc(const std::string& var, void InplacePass::InplaceModifyDesc(const std::string& var,
...@@ -168,7 +167,7 @@ void InplacePass::InplaceModifyDesc(const std::string& var, ...@@ -168,7 +167,7 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
auto* op_desc = op->Op(); auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var); op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var); op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
op_desc->Flush(); op_desc->Flush();
} }
} }
...@@ -265,8 +264,6 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes, ...@@ -265,8 +264,6 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op, void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const { ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name(); VLOG(4) << "Try to inplace op " << op->Name();
// PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
// "op_desc is nullptr");
// some pre-requirments need to meet if the op want to inplaced. // some pre-requirments need to meet if the op want to inplaced.
PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr"); PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
...@@ -446,19 +443,20 @@ bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const { ...@@ -446,19 +443,20 @@ bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
// check if op2 depends on op1's output // check if op2 depends on op1's output
bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const { bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
auto print_op = [&](ir::Node* op, const char* name) { if (VLOG_IS_ON(4)) {
std::ostringstream os; auto print_op = [&](ir::Node* op, const char* name) {
os << " " << name << " : " << op->Name() << " "; std::ostringstream os;
os << "Input args : "; os << " " << name << " : " << op->Name() << " ";
for (auto& arg : op->inputs) os << arg->Name() << " "; os << "Input args : ";
os << "Output args : "; for (auto& arg : op->inputs) os << arg->Name() << " ";
for (auto& arg : op->outputs) os << arg->Name() << " "; os << "Output args : ";
os << "Level : " << op_level_.at(op); for (auto& arg : op->outputs) os << arg->Name() << " ";
VLOG(4) << os.str(); os << "Level : " << op_level_.at(op);
}; VLOG(4) << os.str();
print_op(op1, "OP1"); };
print_op(op2, "OP2"); print_op(op1, "OP1");
print_op(op2, "OP2");
}
if (op1 == op2) return true; if (op1 == op2) return true;
if (op_level_.at(op1) >= op_level_.at(op2)) return false; if (op_level_.at(op1) >= op_level_.at(op2)) return false;
......
...@@ -142,16 +142,15 @@ TEST(OrderedSet, FindBestFitNode) { ...@@ -142,16 +142,15 @@ TEST(OrderedSet, FindBestFitNode) {
for (auto& node : nodes) { for (auto& node : nodes) {
pool.Insert(node.get()); pool.Insert(node.get());
} }
// FIXME(liuwei1031) this API has changed,
// disable these tests temporarily auto* n = nodes[0].get();
// FindNextBestFitNode auto* cache = pool.FindBestFitNode(n);
// auto* n = nodes[0].get(); ASSERT_TRUE(cache->Name() == "a" || cache->Name() == "c");
// auto* cache = pool.FindBestFitNode(n); auto* cache_b = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "a"); ASSERT_TRUE(cache_b->Name() != cache->Name());
// cache = pool.FindNextBestFitNode(n, cache); ASSERT_TRUE(cache_b->Name() == "a" || cache_b->Name() == "c");
// PADDLE_ENFORCE(cache->Name() == "c"); cache = pool.FindNextBestFitNode(n, cache_b);
// cache = pool.FindNextBestFitNode(n, cache); ASSERT_TRUE(cache == nullptr);
// PADDLE_ENFORCE(cache->Name() == "b");
} }
} // namespace details } // namespace details
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -34,6 +33,10 @@ namespace framework { ...@@ -34,6 +33,10 @@ namespace framework {
class Scope; class Scope;
namespace details { namespace details {
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
class MultiDevSSAGraphBuilderBase : public ir::Pass { class MultiDevSSAGraphBuilderBase : public ir::Pass {
protected: protected:
void ApplyImpl(ir::Graph *graph) const override; void ApplyImpl(ir::Graph *graph) const override;
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/var_handle.h"
...@@ -41,22 +40,25 @@ namespace details { ...@@ -41,22 +40,25 @@ namespace details {
// `std::vector<VarHandle*>` is the version of varaibles. // `std::vector<VarHandle*>` is the version of varaibles.
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>> typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
GraphVars; GraphVars;
const char kGraphVars[] = "vars"; constexpr char kGraphVars[] = "vars";
// aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase *> GraphDepVars;
const char kGraphDepVars[] = "dep_vars";
constexpr char kNCCLCtxs[] = "nccl_ctxs";
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places"; constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes"; constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy"; constexpr char kNCCLCtxs[] = "nccl_ctxs";
constexpr char kNRanks[] = "nranks";
// aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase *> GraphDepVars;
constexpr char kGraphDepVars[] = "dep_vars";
typedef std::unordered_set<std::string> FusedVars; typedef std::unordered_set<std::string> FusedVars;
constexpr char kFusedVars[] = "fused_vars"; constexpr char kFusedVars[] = "fused_vars";
constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
typedef std::string FusedOptType;
constexpr char kFusedOptType[] = "fused_opt_type";
typedef std::string FusedGrads;
constexpr char kFusedGrads[] = "fused_gradients";
typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads; typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
constexpr char kParamsAndGrads[] = "params_grads"; constexpr char kParamsAndGrads[] = "params_grads";
...@@ -65,8 +67,6 @@ typedef std::vector<std::vector<std::pair<std::string, std::string>>> ...@@ -65,8 +67,6 @@ typedef std::vector<std::vector<std::pair<std::string, std::string>>>
GroupGradsAndParams; GroupGradsAndParams;
constexpr char kGroupGradsAndParams[] = "group_grads_params"; constexpr char kGroupGradsAndParams[] = "group_grads_params";
constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -24,13 +24,13 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( ...@@ -24,13 +24,13 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes, const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, ir::Graph *graph) const std::vector<platform::Place> &places, ir::Graph *graph)
: graph_(graph), : graph_(graph),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr),
prepare_pool_(1),
local_scopes_(local_scopes), local_scopes_(local_scopes),
places_(places), places_(places),
fetch_ctxs_(places), fetch_ctxs_(places),
strategy_(strategy) { strategy_(strategy),
prepare_pool_(1),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr) {
PrepareOpDeps(); PrepareOpDeps();
CopyOpDeps(); CopyOpDeps();
} }
......
...@@ -63,13 +63,20 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -63,13 +63,20 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details::OpHandleBase *op); details::OpHandleBase *op);
private: private:
// Note(zcd): the ThreadPool should be placed last so that ThreadPool should
// be destroyed first.
ir::Graph *graph_; ir::Graph *graph_;
std::unique_ptr<::ThreadPool> pool_;
::ThreadPool prepare_pool_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_; platform::DeviceContextPool fetch_ctxs_;
ExceptionHolder exception_holder_; ExceptionHolder exception_holder_;
std::unique_ptr<OpDependentData> op_deps_;
std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
ExecutionStrategy strategy_;
// use std::list because clear(), push_back, and for_each are O(1)
std::list<std::future<void>> run_op_futures_;
::ThreadPool prepare_pool_;
std::unique_ptr<::ThreadPool> pool_;
void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops, void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
OpHandleBase *op_instance) const; OpHandleBase *op_instance) const;
...@@ -88,14 +95,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -88,14 +95,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
void PrepareOpDeps(); void PrepareOpDeps();
void CopyOpDeps(); void CopyOpDeps();
private:
std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
ExecutionStrategy strategy_;
std::unique_ptr<OpDependentData> op_deps_;
// use std::list because clear(), push_back, and for_each are O(1)
std::list<std::future<void>> run_op_futures_;
}; };
} // namespace details } // namespace details
......
...@@ -12,9 +12,14 @@ ...@@ -12,9 +12,14 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <iostream>
#include <iterator> #include <iterator>
#include <memory>
#include <string> #include <string>
#include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/details/inplace_op_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
...@@ -165,118 +170,147 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, ...@@ -165,118 +170,147 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// TEST(InferInplace, SingleOpInplaceInToOut) { void FakeSuccData(ProgramDesc* prog) { // NOLINT
// ProgramDesc prog; prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// auto* op = prog.MutableBlock(0)->AppendOp(); prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
// op->SetType("single_op"); prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// op->SetOutput("Out", {"test2_out"}); prog->MutableBlock(0)->Var("test2_out");
// prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 32, 128, 128});
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); }
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); void FakeNoInplaceData(ProgramDesc* prog) { // NOLINT
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_out"); prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128}); prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; prog->MutableBlock(0)->Var("test2_out");
// auto in_to_outs = infer_inplace(*op); prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 31, 128, 128});
// EXPECT_EQ(in_to_outs.size(), 1ul); }
// auto it = in_to_outs.begin();
// EXPECT_EQ(it->first, "test2_a"); ir::Node* GetNodeFromGraph(ir::Graph* g, std::string name) {
// EXPECT_EQ(it->second, "test2_out"); ir::Node* op_node = nullptr;
// } for (auto& item : g->Nodes()) {
// if (item->Name() == name) {
// TEST(InferInplace, SingleGradOpInplaceInToOut) { op_node = item;
// ProgramDesc prog; break;
// auto* op = prog.MutableBlock(0)->AppendOp(); }
// op->SetType("single_op_grad"); }
// op->SetInput(GradVarName("Out"), {"test2_out"}); return op_node;
// op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); }
//
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); std::unique_ptr<ir::Graph> test_SingleOpInplaceInToOut(
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024}); std::unique_ptr<ir::Graph> g) {
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); ir::Node* op_node = GetNodeFromGraph(g.get(), "single_op");
// prog.MutableBlock(0)->Var("test2_out"); EXPECT_NE(op_node, nullptr);
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024}); pass->Apply(g.get());
// return g;
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; }
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 1ul); TEST(InferInplace, SingleOpInplaceInToOut) {
// auto it = in_to_outs.begin(); ProgramDesc prog;
// EXPECT_EQ(it->first, "test2_out"); auto* op = prog.MutableBlock(0)->AppendOp();
// EXPECT_EQ(it->second, "test2_a"); op->SetType("single_op");
// } op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
// op->SetOutput("Out", {"test2_out"});
// TEST(InferInplace, MultiOutInplaceInToOut) {
// ProgramDesc prog; FakeSuccData(&prog);
// auto* op = prog.MutableBlock(0)->AppendOp(); std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
// op->SetType("multi_out_op"); g = test_SingleOpInplaceInToOut(std::move(g));
// op->SetInput("X", {"a0", "a1"}); auto op_node = GetNodeFromGraph(g.get(), "single_op");
// op->SetInput("Y", {"b0"});
// op->SetInput("Z", {"c0", "c1"}); EXPECT_EQ(op_node->outputs[0]->Name(), "test2_a");
// op->SetOutput("Out", {"o0"}); }
// op->SetOutput("YOut", {"y0"});
// op->SetOutput("ZOut", {"z0"}); TEST(InferInplace, SingleOpInplaceInToOutNoInplace) {
// ProgramDesc prog;
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); auto* op = prog.MutableBlock(0)->AppendOp();
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); op->SetType("single_op");
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); op->SetOutput("Out", {"test2_out"});
// prog.MutableBlock(0)->Var("o0");
// prog.MutableBlock(0)->Var("y0"); FakeNoInplaceData(&prog);
// prog.MutableBlock(0)->Var("z0"); std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); g = test_SingleOpInplaceInToOut(std::move(g));
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); auto op_node = GetNodeFromGraph(g.get(), "single_op");
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); EXPECT_EQ(op_node->outputs[0]->Name(), "test2_out");
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); }
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
// TEST(InferInplace, MultiOutInplaceInToOut) {
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; ProgramDesc prog;
// auto in_to_outs = infer_inplace(*op); auto* op = prog.MutableBlock(0)->AppendOp();
// EXPECT_EQ(in_to_outs.size(), 3ul); op->SetType("multi_out_op");
// std::unordered_map<std::string, std::string> expects = { op->SetInput("X", {"a0", "a1"});
// {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"}, op->SetInput("Y", {"b0"});
// }; op->SetInput("Z", {"c0", "c1"});
// EXPECT_TRUE(expects == in_to_outs); op->SetOutput("Out", {"o0"});
// } op->SetOutput("YOut", {"y0"});
// op->SetOutput("ZOut", {"z0"});
// TEST(InferInplace, MultiGradInplaceInToOut) {
// ProgramDesc prog; prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
// auto* op = prog.MutableBlock(0)->AppendOp(); prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
// op->SetType("multi_out_grad"); prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
// op->SetInput(GradVarName("Out"), {"o0"}); prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// op->SetInput(GradVarName("YOut"), {"y0"}); prog.MutableBlock(0)->Var("o0");
// op->SetInput(GradVarName("ZOut"), {"z0"}); prog.MutableBlock(0)->Var("y0");
// op->SetOutput(GradVarName("X"), {"a0", "a1"}); prog.MutableBlock(0)->Var("z0");
// op->SetOutput(GradVarName("Y"), {"b0"}); prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
// op->SetOutput(GradVarName("Z"), {"c0", "c1"}); prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("o0"); std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
// prog.MutableBlock(0)->Var("y0"); std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
// prog.MutableBlock(0)->Var("z0"); pass->Apply(g.get());
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); auto op_node = GetNodeFromGraph(g.get(), "multi_out_op");
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); ASSERT_TRUE(op_node != nullptr);
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); EXPECT_EQ(op_node->outputs[0]->Name(), "a0");
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); EXPECT_EQ(op_node->outputs[1]->Name(), "b0");
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); EXPECT_EQ(op_node->outputs[2]->Name(), "c0");
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); }
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; TEST(InferInplace, MultiGradInplaceInToOut) {
// auto in_to_outs = infer_inplace(*op); ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// EXPECT_EQ(in_to_outs.size(), 3ul); op->SetType("multi_out_grad");
// std::unordered_map<std::string, std::string> expects = { op->SetInput(GradVarName("Out"), {"o0"});
// {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, op->SetInput(GradVarName("YOut"), {"y0"});
// }; op->SetInput(GradVarName("ZOut"), {"z0"});
// EXPECT_TRUE(expects == in_to_outs); op->SetOutput(GradVarName("X"), {"a0", "a1"});
// } op->SetOutput(GradVarName("Y"), {"b0"});
op->SetOutput(GradVarName("Z"), {"c0", "c1"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024});
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
pass->Apply(g.get());
auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad");
ASSERT_TRUE(op_node != nullptr);
EXPECT_EQ(op_node->outputs[0]->Name(), "o0");
EXPECT_EQ(op_node->outputs[2]->Name(), "y0");
EXPECT_EQ(op_node->outputs[3]->Name(), "c0");
std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
};
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -56,8 +56,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) { ...@@ -56,8 +56,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
} }
} }
static DDim GetDims(const Scope& scope, const std::string& name, static DDim GetDimsDebug(const Scope& scope, const std::string& name,
bool get_actual_dim = false) { bool get_actual_dim = false) {
Variable* var = scope.FindVar(name); Variable* var = scope.FindVar(name);
if (var == nullptr) { if (var == nullptr) {
return DDim({-1}); return DDim({-1});
...@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name, ...@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name,
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>(); const LoDTensor& tensor = var->Get<LoDTensor>();
// if (UNLIKELY(!tensor.IsInitialized())) { if (UNLIKELY(!tensor.IsInitialized())) {
// return DDim({-1}); return DDim({-1});
// } }
return tensor.dims(); return tensor.dims();
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) { if (get_actual_dim) {
...@@ -123,7 +123,7 @@ static int GetRowSize(const Scope& scope, const std::string& name) { ...@@ -123,7 +123,7 @@ static int GetRowSize(const Scope& scope, const std::string& name) {
return -1; return -1;
} }
static LoD GetLoD(const Scope& scope, const std::string& name) { static LoD GetLoDDebug(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name); Variable* var = scope.FindVar(name);
auto default_lod = LoD({{}}); auto default_lod = LoD({{}});
...@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { ...@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>(); const LoDTensor& tensor = var->Get<LoDTensor>();
// if (UNLIKELY(!tensor.IsInitialized())) { if (UNLIKELY(!tensor.IsInitialized())) {
// return default_lod; return default_lod;
// } }
return tensor.lod(); return tensor.lod();
} else { } else {
return default_lod; return default_lod;
...@@ -274,8 +274,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { ...@@ -274,8 +274,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
} }
std::string dtype = GetDtype(*scope, var_name); std::string dtype = GetDtype(*scope, var_name);
ss << ":" << dtype; ss << ":" << dtype;
ss << "[" << GetDims(*scope, var_name, true) << "]"; ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
ss << "(" << GetLoD(*scope, var_name) << ")"; ss << "(" << GetLoDDebug(*scope, var_name) << ")";
} }
} }
if (i != input.second.size() - 1) { if (i != input.second.size() - 1) {
...@@ -305,8 +305,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { ...@@ -305,8 +305,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
} }
std::string dtype = GetDtype(*scope, output.second[i]); std::string dtype = GetDtype(*scope, output.second[i]);
ss << ":" << dtype; ss << ":" << dtype;
ss << "[" << GetDims(*scope, var_name, true) << "]"; ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
ss << "(" << GetLoD(*scope, var_name) << ")"; ss << "(" << GetLoDDebug(*scope, var_name) << ")";
} }
} }
if (i != output.second.size() - 1) { if (i != output.second.size() - 1) {
......
...@@ -365,6 +365,9 @@ class ExecutionContext { ...@@ -365,6 +365,9 @@ class ExecutionContext {
auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>( auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
allocation_ptr, deleter); allocation_ptr, deleter);
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_GE(allocation_ptr->size(), PADDLE_ENFORCE_GE(allocation_ptr->size(),
framework::product(dim) * sizeof(T)); framework::product(dim) * sizeof(T));
......
...@@ -70,7 +70,7 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) { ...@@ -70,7 +70,7 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) {
return *this; return *this;
} }
Tensor Tensor::Slice(int begin_idx, int end_idx) const { Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const {
check_memory_size(); check_memory_size();
PADDLE_ENFORCE_GE(begin_idx, 0, PADDLE_ENFORCE_GE(begin_idx, 0,
"The start row index must be greater than 0."); "The start row index must be greater than 0.");
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <cstring> #include <cstring>
#include <memory> #include <memory>
#include <typeindex> #include <typeindex>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
...@@ -27,10 +28,6 @@ limitations under the License. */ ...@@ -27,10 +28,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_utils.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -41,34 +38,10 @@ class Tensor { ...@@ -41,34 +38,10 @@ class Tensor {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
public: public:
// TODO(jczaja): This is depracted and will be removed inline mkldnn::memory::format format() const { return format_; }
inline mkldnn::memory::format format() const {
if (layout_ == DataLayout::kMKLDNN) {
return static_cast<mkldnn::memory::format>(mem_pd_.desc().data.format);
} else {
return mkldnn::memory::format::format_undef;
}
}
// TODO(jczaja): This is depracted and will be removed inline void set_format(const mkldnn::memory::format format) {
inline void set_format( format_ = format;
const mkldnn::memory::format fmt,
mkldnn::memory::data_type data_type = mkldnn::memory::f32) {
mem_pd_ = paddle::platform::create_prim_desc_from_format(
paddle::framework::vectorize2int(dims()), fmt, data_type);
layout_ = DataLayout::kMKLDNN;
}
inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const {
return mem_pd_;
}
inline void set_mkldnn_prim_desc(
const mkldnn::memory::primitive_desc& mem_pd) {
// Internally MKL-DNN is just copying (increasing reference counter)
// to shared_ptr. So asignment should be quite cheap
mem_pd_ = mem_pd;
layout_ = DataLayout::kMKLDNN;
} }
protected: protected:
...@@ -76,9 +49,12 @@ class Tensor { ...@@ -76,9 +49,12 @@ class Tensor {
* @brief the detail format of memory block which have layout as kMKLDNN * @brief the detail format of memory block which have layout as kMKLDNN
* *
* @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
* nChw16c, etc. For a MKLDNN memory block, we store memory descriptor * nChw16c, etc. For a MKLDNN memory block, layout will be set as
* DataLayout::kMKLDNN meanwhile detail memory format will be kept in
* this field.
*/ */
mutable mkldnn::memory::primitive_desc mem_pd_;
mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
#endif #endif
public: public:
...@@ -157,7 +133,7 @@ class Tensor { ...@@ -157,7 +133,7 @@ class Tensor {
* @param[in] end_idx The index of the end row(exclusive) to slice. * @param[in] end_idx The index of the end row(exclusive) to slice.
* The index number begins from 0. * The index number begins from 0.
*/ */
Tensor Slice(int begin_idx, int end_idx) const; Tensor Slice(int64_t begin_idx, int64_t end_idx) const;
platform::Place place() const { platform::Place place() const {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
......
...@@ -44,11 +44,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -44,11 +44,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
<< dst_place; << dst_place;
return; return;
} }
#ifdef PADDLE_WITH_MKLDNN
if (src.layout() == DataLayout::kMKLDNN) {
dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc());
}
#endif
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size); boost::get<platform::CPUPlace>(src_place), src_ptr, size);
} }
......
...@@ -23,6 +23,12 @@ function(inference_analysis_api_test target install_dir filename) ...@@ -23,6 +23,12 @@ function(inference_analysis_api_test target install_dir filename)
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction() endfunction()
function(inference_analysis_api_int8_test target model_dir data_dir filename)
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
ARGS --infer_model=${model_dir}/model --infer_data=${data_dir}/data.bin --batch_size=100)
endfunction()
function(inference_analysis_api_test_with_fake_data target install_dir filename model_name) function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
download_model(${install_dir} ${model_name}) download_model(${install_dir} ${model_name})
inference_analysis_test(${target} SRCS ${filename} inference_analysis_test(${target} SRCS ${filename}
...@@ -138,6 +144,28 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 ...@@ -138,6 +144,28 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)
# int8 image classification tests
if(WITH_MKLDNN)
set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8")
if (NOT EXISTS ${INT8_DATA_DIR})
inference_download_and_uncompress(${INT8_DATA_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "imagenet_val_100.tar.gz")
endif()
#resnet50 int8
set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR})
inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "resnet50_int8_model.tar.gz" )
endif()
inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
#mobilenet int8
set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet")
if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR})
inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "mobilenetv1_int8_model.tar.gz" )
endif()
inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
endif()
# bert, max_len=20, embedding_dim=128 # bert, max_len=20, embedding_dim=128
set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128")
download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
......
...@@ -53,19 +53,6 @@ void Split(const std::string &line, char sep, std::vector<T> *v) { ...@@ -53,19 +53,6 @@ void Split(const std::string &line, char sep, std::vector<T> *v) {
} }
} }
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string // Parse tensor from string
template <typename T> template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <functional>
#include <iostream>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
// Command-line flag limiting how many batches SetInput() loads from the data
// file. SetInput() only applies it when it is positive and smaller than the
// number of whole batches available; the default 0 leaves the count uncapped.
DEFINE_int32(iterations, 0, "Number of iterations");
namespace paddle {
namespace inference {
namespace analysis {
// Applies the configuration shared by the tests in this file to `*cfg`:
// the model comes from FLAGS_infer_model (program file "__model__"), GPU is
// disabled, IR optimization is switched on, explicit input names are switched
// off, the CPU math thread count comes from FLAGS_paddle_num_threads, and
// MKL-DNN is enabled.
//
// cfg: configuration object to modify in place; must not be null.
void SetConfig(AnalysisConfig *cfg) {
  AnalysisConfig &config = *cfg;

  // Model location.
  config.SetModel(FLAGS_infer_model);
  config.SetProgFile("__model__");

  // Execution options.
  config.DisableGpu();
  config.SwitchIrOptim();
  config.SwitchSpecifyInputNames(false);
  config.SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
  config.EnableMKLDNN();
}
// Reads consecutive fixed-shape batches of elements of type T from a binary
// stream and wraps each batch in a PaddleTensor.
//
// The reader keeps its own byte position and seeks to it before every read,
// so several readers may take turns on the same std::ifstream.
template <typename T>
class TensorReader {
 public:
  // file:             stream to read from. It is stored by reference, so it
  //                   must outlive the reader.
  // beginning_offset: byte offset in `file` of the first batch.
  // shape:            shape given to every tensor returned by NextBatch().
  // name:             name given to every tensor returned by NextBatch().
  TensorReader(std::ifstream &file, size_t beginning_offset,
               std::vector<int> shape, std::string name)
      : file_(file), position(beginning_offset), shape_(shape), name_(name) {
    // Accumulate in size_t. The element count is independent of the element
    // type T, and an `int` accumulator combined with std::multiplies<T>
    // would route the product through float arithmetic for T = float (and
    // truncate back to int on every step).
    numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
                            std::multiplies<size_t>());
  }

  // Reads the next `numel` elements starting at the stored position and
  // returns them as a tensor carrying this reader's name and shape. The
  // stored position is advanced to where the read ended.
  //
  // Logs an error if the stream reports end-of-file after the read.
  // Throws std::runtime_error if the stream is in a failed state after the
  // read.
  PaddleTensor NextBatch() {
    PaddleTensor tensor;
    tensor.name = name_;
    tensor.shape = shape_;
    tensor.dtype = GetPaddleDType<T>();
    tensor.data.Resize(numel * sizeof(T));

    // Always seek first: another reader may have moved the shared stream.
    file_.seekg(position);
    file_.read(static_cast<char *>(tensor.data.data()), numel * sizeof(T));
    position = file_.tellg();

    if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
    if (file_.fail())
      throw std::runtime_error(name_ + ": failed reading file.");

    return tensor;
  }

 protected:
  std::ifstream &file_;     // shared input stream (not owned)
  size_t position;          // byte offset of the next batch in file_
  std::vector<int> shape_;  // shape of every produced tensor
  std::string name_;        // name of every produced tensor
  size_t numel;             // number of elements per batch (product of shape_)
};
std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
const std::vector<std::vector<PaddleTensor>> &test_data, int num_images) {
int test_data_batch_size = test_data[0][0].shape[0];
CHECK_LE(static_cast<size_t>(num_images),
test_data.size() * test_data_batch_size);
PaddleTensor images;
images.name = "input";
images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
PaddleTensor labels;
labels.name = "labels";
labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images);
for (int i = 0; i < num_images; i++) {
auto batch = i / test_data_batch_size;
auto element_in_batch = i % test_data_batch_size;
std::copy_n(static_cast<float *>(test_data[batch][0].data.data()) +
element_in_batch * 3 * 224 * 224,
3 * 224 * 224,
static_cast<float *>(images.data.data()) + i * 3 * 224 * 224);
std::copy_n(static_cast<int64_t *>(test_data[batch][1].data.data()) +
element_in_batch,
1, static_cast<int64_t *>(labels.data.data()) + i);
}
auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(2);
(*warmup_data)[0] = std::move(images);
(*warmup_data)[1] = std::move(labels);
return warmup_data;
}
// Loads batches of images and labels from the binary file FLAGS_infer_data
// and appends them to `*inputs`, one {images, labels} pair per batch.
//
// File layout as read by this function: an int64 image count, followed by
// the float image data (3 * 224 * 224 values per image), followed by the
// int64 labels.
//
// inputs:     destination; each appended element holds the image tensor
//             ("input") and the label tensor ("label") of one batch.
// batch_size: number of samples per batch; must be positive.
//
// Only whole batches are loaded. A positive FLAGS_iterations smaller than
// the number of whole batches caps how many are loaded. Problems opening or
// reading the header are reported through gtest FAIL(), after which the
// function returns without touching `*inputs`.
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
              int32_t batch_size = FLAGS_batch_size) {
  std::ifstream file(FLAGS_infer_data, std::ios::binary);
  if (!file) {
    FAIL() << "Couldn't open file: " << FLAGS_infer_data;
  }
  // batch_size is used as a divisor below.
  if (batch_size <= 0) {
    FAIL() << "Batch size must be positive, got: " << batch_size;
  }

  int64_t total_images{0};
  file.read(reinterpret_cast<char *>(&total_images), sizeof(total_images));
  if (!file) {
    FAIL() << "Couldn't read the image count from file: " << FLAGS_infer_data;
  }
  LOG(INFO) << "Total images in file: " << total_images;

  std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
  std::vector<int> label_batch_shape{batch_size, 1};

  // The image data starts right after the header that was just read, not at
  // offset 0; the label data follows the last image.
  const auto images_offset_in_file = static_cast<size_t>(file.tellg());
  const auto labels_offset_in_file =
      images_offset_in_file +
      sizeof(float) * total_images *
          std::accumulate(image_batch_shape.begin() + 1,
                          image_batch_shape.end(), 1, std::multiplies<int>());

  TensorReader<float> image_reader(file, images_offset_in_file,
                                   image_batch_shape, "input");
  TensorReader<int64_t> label_reader(file, labels_offset_in_file,
                                     label_batch_shape, "label");

  auto iterations = total_images / batch_size;
  if (FLAGS_iterations > 0 && FLAGS_iterations < iterations)
    iterations = FLAGS_iterations;
  for (auto i = 0; i < iterations; i++) {
    auto images = image_reader.NextBatch();
    auto labels = label_reader.NextBatch();
    inputs->emplace_back(
        std::vector<PaddleTensor>{std::move(images), std::move(labels)});
  }
}
// Runs the same inputs through a plain analysis config and a config with the
// MKL-DNN quantizer enabled, then hands both to CompareQuantizedAndAnalysis.
TEST(Analyzer_int8_resnet50, quantization) {
  // One value drives the input batch size, the warm-up sample count and the
  // quantizer's warm-up batch size.
  constexpr int kBatch = 100;

  AnalysisConfig fp32_cfg;
  SetConfig(&fp32_cfg);

  AnalysisConfig int8_cfg;
  SetConfig(&int8_cfg);

  std::vector<std::vector<PaddleTensor>> all_inputs;
  SetInput(&all_inputs, kBatch);

  // Warm-up data is taken from the front of the test inputs.
  auto warmup = GetWarmupData(all_inputs, kBatch);

  int8_cfg.EnableMkldnnQuantizer();
  int8_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup);
  int8_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(kBatch);

  const auto *reference =
      reinterpret_cast<const PaddlePredictor::Config *>(&fp32_cfg);
  const auto *quantized =
      reinterpret_cast<const PaddlePredictor::Config *>(&int8_cfg);
  CompareQuantizedAndAnalysis(reference, quantized, all_inputs);
}
// Profiles prediction with the MKL-DNN quantizer enabled, using the default
// input batch size and FLAGS_num_threads threads.
TEST(Analyzer_int8_resnet50, profile) {
  // Number of samples used to warm up the quantizer.
  constexpr int kWarmupImages = 100;

  AnalysisConfig config;
  SetConfig(&config);

  std::vector<std::vector<PaddleTensor>> all_inputs;
  SetInput(&all_inputs);

  auto warmup = GetWarmupData(all_inputs, kWarmupImages);

  config.EnableMkldnnQuantizer();
  config.mkldnn_quantizer_config()->SetWarmupData(warmup);
  config.mkldnn_quantizer_config()->SetWarmupBatchSize(kWarmupImages);

  std::vector<PaddleTensor> results;
  const auto *base_config =
      reinterpret_cast<const PaddlePredictor::Config *>(&config);
  TestPrediction(base_config, all_inputs, &results, FLAGS_num_threads);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import os
import numpy as np
import time
import sys
import random
import functools
import contextlib
from PIL import Image, ImageEnhance
import math
from paddle.dataset.common import download
# Seed both RNGs so that any random choices made below are reproducible.
random.seed(0)
np.random.seed(0)
# Side length, in pixels, of the square crop fed to the model.
DATA_DIM = 224
# Byte sizes of the element types written to the binary output file.
SIZE_FLOAT32 = 4
SIZE_INT64 = 8
# Per-channel normalization constants, reshaped to (3, 1, 1) so they
# broadcast over channel-first (C, H, W) image arrays.
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
    """Resize `img` so that its shorter side becomes `target_size` pixels.

    Both sides are scaled by the same factor and rounded to whole pixels;
    resampling uses the LANCZOS filter. Returns the resized image.
    """
    width, height = img.size[0], img.size[1]
    scale = float(target_size) / min(width, height)
    new_size = (int(round(width * scale)), int(round(height * scale)))
    return img.resize(new_size, Image.LANCZOS)
def crop_image(img, target_size, center):
    """Crop a `target_size` x `target_size` square out of `img`.

    Args:
        img: image object exposing `size` as (width, height) and `crop(box)`.
        target_size: side length of the square crop, in pixels.
        center: when truthy, crop around the image centre; otherwise pick
            the top-left corner at random with `np.random.randint`.

    Returns:
        Whatever `img.crop` returns for the chosen box.
    """
    width, height = img.size
    size = target_size
    if center:
        # Floor division keeps the offsets integral on Python 3; `/` would
        # produce floats such as 16.5 for odd margins.
        w_start = (width - size) // 2
        h_start = (height - size) // 2
    else:
        w_start = np.random.randint(0, width - size + 1)
        h_start = np.random.randint(0, height - size + 1)
    w_end = w_start + size
    h_end = h_start + size
    return img.crop((w_start, h_start, w_end, h_end))
def process_image(img_path, mode, color_jitter, rotate):
    """Load an image file and turn it into a normalized float32 array.

    The image is resized so its short side is 256, centre-cropped to
    DATA_DIM x DATA_DIM, converted to RGB if needed, scaled to [0, 1],
    transposed to channel-first order and normalized with `img_mean` and
    `img_std`.

    `mode`, `color_jitter` and `rotate` are accepted but not used here.
    """
    picture = crop_image(
        resize_short(Image.open(img_path), target_size=256),
        target_size=DATA_DIM,
        center=True)
    if img_mode_differs(picture) if False else picture.mode != 'RGB':
        picture = picture.convert('RGB')

    # HWC uint8 -> CHW float32 in [0, 1].
    pixels = np.array(picture).astype('float32')
    pixels = pixels.transpose((2, 0, 1)) / 255
    # In-place ops keep the array float32 even though the constants are
    # float64.
    pixels -= img_mean
    pixels /= img_std
    return pixels
def download_unzip():
    """Download the two archive parts, join them and unpack the result.

    The parts are fetched with `download` into
    ~/.cache/paddle/dataset/int8/download, concatenated into
    full_imagenet_val.tar.gz with `cat`, and extracted into the `data`
    sub-folder with `tar`. Joining and extraction are each skipped when
    their output already exists.

    Returns:
        Path of the `ILSVRC2012` directory under the extraction folder.
    """
    int8_download = 'int8/download'
    target_name = 'data'

    cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
                                      int8_download)

    target_folder = os.path.join(cache_folder, target_name)

    data_urls = [
        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa',
        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab',
    ]
    data_md5s = [
        '60f6525b0e1d127f345641d75d41f0a8',
        '1e9f15f64e015e58d6f9ec3210ed18b5',
    ]

    file_names = []
    for url, md5 in zip(data_urls, data_md5s):
        download(url, cache_folder, md5)
        file_names.append(url.split('/')[-1])

    zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz')

    if not os.path.exists(zip_path):
        # Concatenate the downloaded parts, in order, into one archive.
        cat_command = 'cat'
        for file_name in file_names:
            cat_command += ' ' + os.path.join(cache_folder, file_name)
        cat_command += ' > ' + zip_path
        os.system(cat_command)
        # Format the message before printing: on Python 3 print() returns
        # None, so calling .format on its result raises AttributeError.
        print('Data is downloaded at {0}\n'.format(zip_path))

    if not os.path.exists(target_folder):
        cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, zip_path)
        os.system(cmd)
        print('Data is unzipped at {0}\n'.format(target_folder))

    data_dir = os.path.join(target_folder, 'ILSVRC2012')
    print('ILSVRC2012 full val set at {0}\n'.format(data_dir))
    return data_dir
def reader():
    """Preprocess the validation images into one binary file.

    File layout: <num_images: int64><all images: float32><all labels: int64>.
    Image and label `idx` are written at offsets derived from `idx`, so an
    entry whose image file is missing is skipped and nothing is written for
    it. The file is only built when it does not exist yet.
    """
    data_dir = download_unzip()
    file_list = os.path.join(data_dir, 'val_list.txt')
    output_file = os.path.join(data_dir, 'int8_full_val.bin')

    with open(file_list) as flist:
        lines = [entry.strip() for entry in flist]
    num_images = len(lines)

    if not os.path.exists(output_file):
        print(
            'Preprocessing to binary file...<num_images><all images><all labels>...\n'
        )
        # Bytes taken by one image, and the offset where labels start.
        image_bytes = SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
        labels_base = SIZE_INT64 + image_bytes * num_images

        with open(output_file, "w+b") as of:
            # Header: number of images as int64.
            of.seek(0)
            of.write(np.array(int(num_images)).astype('int64').tobytes())

            for idx, line in enumerate(lines):
                rel_path, label = line.split()
                img_path = os.path.join(data_dir, rel_path)
                if not os.path.exists(img_path):
                    continue

                # Image (float32) goes into slot idx of the image section.
                img = process_image(
                    img_path, 'val', color_jitter=False, rotate=False)
                of.seek(SIZE_INT64 + image_bytes * idx)
                of.write(np.array(img).astype('float32').tobytes())

                # Label (int64) goes into slot idx of the label section.
                of.seek(labels_base + idx * SIZE_INT64)
                of.write(np.array(int(label)).astype('int64').tobytes())

    print('The preprocessed binary file path {}\n'.format(output_file))
# Script entry point: build the preprocessed binary file when run directly.
if __name__ == '__main__':
    reader()
...@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true, ...@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true,
DEFINE_bool(record_benchmark, false, DEFINE_bool(record_benchmark, false,
"Record benchmark after profiling the model"); "Record benchmark after profiling the model");
DEFINE_double(accuracy, 1e-3, "Result Accuracy."); DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy.");
DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
DECLARE_bool(profile); DECLARE_bool(profile);
...@@ -58,6 +59,19 @@ DECLARE_int32(paddle_num_threads); ...@@ -58,6 +59,19 @@ DECLARE_int32(paddle_num_threads);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
// Maps a C++ element type T to the matching paddle::PaddleDType value at
// compile time. Only the explicit specializations below are defined.
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
// int64_t -> PaddleDType::INT64
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
// float -> PaddleDType::FLOAT32
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
const auto *analysis_config = const auto *analysis_config =
reinterpret_cast<const AnalysisConfig *>(config); reinterpret_cast<const AnalysisConfig *>(config);
...@@ -392,6 +406,32 @@ void TestPrediction(const PaddlePredictor::Config *config, ...@@ -392,6 +406,32 @@ void TestPrediction(const PaddlePredictor::Config *config,
} }
} }
// Checks that the top-1 accuracy of two prediction runs differs by at most
// FLAGS_quantized_accuracy.
//
// `output_slots1` holds the quantized (INT8) outputs and `output_slots2` the
// reference (FP32) outputs. In both, slot 1 must be a FLOAT32 tensor without
// LoD whose first element is the top-1 accuracy.
//
// Throws std::invalid_argument when either vector is empty, slot 1 has a
// LoD, a non-FLOAT32 dtype, or no data. The accuracy difference itself is
// enforced with CHECK_LE.
void CompareTopAccuracy(const std::vector<PaddleTensor> &output_slots1,
                        const std::vector<PaddleTensor> &output_slots2) {
  // first output: avg_cost
  if (output_slots1.size() == 0 || output_slots2.size() == 0)
    throw std::invalid_argument(
        "CompareTopAccuracy: output_slots vector is empty.");
  PADDLE_ENFORCE(output_slots1.size() >= 2UL);
  PADDLE_ENFORCE(output_slots2.size() >= 2UL);

  // second output: acc_top1
  if (output_slots1[1].lod.size() > 0 || output_slots2[1].lod.size() > 0)
    throw std::invalid_argument(
        "CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
  if (output_slots1[1].dtype != paddle::PaddleDType::FLOAT32 ||
      output_slots2[1].dtype != paddle::PaddleDType::FLOAT32)
    throw std::invalid_argument(
        "CompareTopAccuracy: top1 accuracy output is of a wrong type.");

  const float *top1_quantized =
      static_cast<const float *>(output_slots1[1].data.data());
  const float *top1_reference =
      static_cast<const float *>(output_slots2[1].data.data());
  // Refuse to dereference a missing buffer.
  if (top1_quantized == nullptr || top1_reference == nullptr)
    throw std::invalid_argument(
        "CompareTopAccuracy: top1 accuracy output has no data.");

  LOG(INFO) << "top1 INT8 accuracy: " << *top1_quantized;
  LOG(INFO) << "top1 FP32 accuracy: " << *top1_reference;
  LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy;
  CHECK_LE(std::abs(*top1_quantized - *top1_reference),
           FLAGS_quantized_accuracy);
}
void CompareDeterministic( void CompareDeterministic(
const PaddlePredictor::Config *config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
...@@ -421,6 +461,17 @@ void CompareNativeAndAnalysis( ...@@ -421,6 +461,17 @@ void CompareNativeAndAnalysis(
CompareResult(analysis_outputs, native_outputs); CompareResult(analysis_outputs, native_outputs);
} }
// Runs single-threaded prediction on `inputs` with `config` and with
// `qconfig`, then compares the two results with CompareTopAccuracy. The
// `qconfig` outputs are passed as the quantized side and the `config`
// outputs as the reference side.
void CompareQuantizedAndAnalysis(
    const PaddlePredictor::Config *config,
    const PaddlePredictor::Config *qconfig,
    const std::vector<std::vector<PaddleTensor>> &inputs) {
  PrintConfig(config, true);

  std::vector<PaddleTensor> reference_results;
  TestOneThreadPrediction(config, inputs, &reference_results, true);

  std::vector<PaddleTensor> quantized_results;
  TestOneThreadPrediction(qconfig, inputs, &quantized_results, true);

  CompareTopAccuracy(quantized_results, reference_results);
}
void CompareNativeAndAnalysis( void CompareNativeAndAnalysis(
PaddlePredictor *native_pred, PaddlePredictor *analysis_pred, PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
......
...@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) ...@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler) cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU) if (WITH_GPU)
...@@ -38,20 +37,30 @@ else () ...@@ -38,20 +37,30 @@ else ()
set(AllocatorFacadeDeps) set(AllocatorFacadeDeps)
endif() endif()
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator) cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator) cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) cc_library(allocator_facade SRCS allocator_facade.cc DEPS
${AllocatorFacadeDeps}
cpu_allocator
locked_allocator
best_fit_allocator
aligned_allocator
auto_increment_allocator
zero_size_allocator
conditional_allocator
retry_allocator
buffered_allocator
allocator_strategy
legacy_allocator
)
nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade) nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade) cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
...@@ -94,8 +94,6 @@ class AlignedAllocator : public ThinAlignedAllocator { ...@@ -94,8 +94,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
underlying_allocator_->Allocate(size + kAlignment, attr); underlying_allocator_->Allocate(size + kAlignment, attr);
return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size); return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
} }
void FreeImpl(Allocation* allocation) override { delete allocation; }
}; };
} // namespace allocation } // namespace allocation
......
...@@ -27,24 +27,16 @@ bool Allocator::IsAllocThreadSafe() const { return false; } ...@@ -27,24 +27,16 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) { AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
auto ptr = AllocateImpl(size, attr); auto ptr = AllocateImpl(size, attr);
ptr->RegisterDecoratedAllocator(this); ptr->set_allocator(this);
return AllocationPtr(ptr); return AllocationPtr(ptr);
} }
void Allocator::FreeImpl(Allocation* allocation) { void Allocator::Free(Allocation* allocation) { delete allocation; }
Allocator* allocator = allocation->TopDecoratedAllocator();
allocator->Free(allocation);
}
void Allocator::Free(Allocation* allocation) {
allocation->PopDecoratedAllocator();
FreeImpl(allocation);
}
const char* BadAlloc::what() const noexcept { return msg_.c_str(); } const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
void AllocationDeleter::operator()(Allocation* allocation) const { void AllocationDeleter::operator()(Allocation* allocation) const {
Allocator* allocator = allocation->TopDecoratedAllocator(); auto* allocator = allocation->allocator();
allocator->Free(allocation); allocator->Free(allocation);
} }
......
...@@ -46,56 +46,13 @@ class Allocator; ...@@ -46,56 +46,13 @@ class Allocator;
// NOTE: this is the base class of Allocation. Each allocator can use its own // NOTE: this is the base class of Allocation. Each allocator can use its own
// allocation object. // allocation object.
// NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0 // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0
/**
* Allocation is returned by Allocator::Allocate() method.
*
* An allocator may be decorated by another allocator. For example, we can
* decorate
* a RetryAllocator to any allocator to perform allocation retry when first
* allocation request fails.
*
* The design of Allocator is explained as follows:
*
* Suppose we have an allocator which is decorated by several allocators:
*
* A(1) <- A(2) <- A(3) <- ... <- A(n)
*
* , and the public allocator is A(1).
*
* The allocation process would be:
*
* A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate()
*
* , and the free process would be:
*
* A(1).Free() -> A(2).Free() -> ... -> A(n).Free()
*
* Therefore, we should record the allocator chain when allocating, so
* that we can free the allocation in the reverse order of allocator chain.
* The field `decorated_allocators_` is used to record this chain.
*
* Another example is that we want to add additional fields in Allocation,
* e.g., as is done in AlignedAllocator.
* In this case, we should declare a derived class of Allocation, which
* contains an underlying Allocation allocated by the underlying allocator.
* Therefore, `decorated_allocators_` of the new Allocation object would
* be a new chain, differing from the underlying Allocation object.
*/
class Allocation { class Allocation {
public: public:
Allocation(void* ptr, size_t size, platform::Place place) Allocation(void* ptr, size_t size, platform::Place place)
: ptr_(ptr), size_(size), place_(place) { : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
// NOTE(zjl): Since decorated_allocators_ is usually a small vector
// We reserve a small buffer to it to prevent frequent heap allocation
// Not quite sure whether we need something like gtl vector.
decorated_allocators_.reserve(8);
}
Allocation(const Allocation& o) = delete; Allocation(const Allocation& o) = delete;
Allocation& operator=(const Allocation& o) = delete; Allocation& operator=(const Allocation& o) = delete;
Allocation(Allocation&& o) = delete;
Allocation& operator=(Allocation&& o) = delete;
// Returns the holding pointer. // Returns the holding pointer.
// NOTE: For performance consideration, it is better not to make this method // NOTE: For performance consideration, it is better not to make this method
...@@ -117,31 +74,17 @@ class Allocation { ...@@ -117,31 +74,17 @@ class Allocation {
const platform::Place& place() const { return place_; } const platform::Place& place() const { return place_; }
virtual ~Allocation(); Allocator* allocator() { return allocator_; }
private:
const std::vector<Allocator*>& DecoratedAllocators() const {
return decorated_allocators_;
}
inline void RegisterDecoratedAllocator(Allocator* allocator) {
decorated_allocators_.push_back(allocator);
}
inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); } void set_allocator(Allocator* allocator) { allocator_ = allocator; }
inline Allocator* TopDecoratedAllocator() { virtual ~Allocation();
return decorated_allocators_.back();
}
private: private:
Allocator* allocator_;
void* ptr_; void* ptr_;
size_t size_; size_t size_;
platform::Place place_; platform::Place place_;
std::vector<Allocator*> decorated_allocators_;
friend class Allocator;
friend class AllocationDeleter;
}; };
using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>; using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
...@@ -191,12 +134,9 @@ class Allocator { ...@@ -191,12 +134,9 @@ class Allocator {
// True if the `Allocate` is thread safe. // True if the `Allocate` is thread safe.
virtual bool IsAllocThreadSafe() const; virtual bool IsAllocThreadSafe() const;
// This function should not be called outside
void Free(Allocation* allocation);
protected: protected:
virtual void Free(Allocation* allocation);
virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0; virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
virtual void FreeImpl(Allocation* allocation);
private: private:
friend class AllocationDeleter; friend class AllocationDeleter;
......
...@@ -49,17 +49,6 @@ namespace paddle { ...@@ -49,17 +49,6 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
static inline std::shared_ptr<Allocator> WrapRetryAllocator(
std::shared_ptr<Allocator> allocator, int64_t retry_time) {
if (retry_time > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time);
allocator.reset(retry_allocator);
}
return allocator;
}
// TODO(yy): Dirty code here. This class should be configurable in runtime. // TODO(yy): Dirty code here. This class should be configurable in runtime.
class CPUManagedAllocator : public Allocator { class CPUManagedAllocator : public Allocator {
public: public:
...@@ -123,10 +112,14 @@ class ChunkedAllocator : public Allocator { ...@@ -123,10 +112,14 @@ class ChunkedAllocator : public Allocator {
std::shared_ptr<Allocator> CreateAllocatorWithChunk() { std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_)); chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
auto* allocation = chunks_.back().get(); auto* allocation = chunks_.back().get();
std::shared_ptr<Allocator> allocator(new LockedAllocator( std::unique_ptr<Allocator> allocator(new LockedAllocator(
std::shared_ptr<Allocator>(new BestFitAllocator(allocation)))); std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
allocator = WrapRetryAllocator(allocator, retry_time_); if (retry_time_ > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time_);
allocator.reset(retry_allocator);
}
return std::make_shared<AlignedAllocator<64u>>(std::move(allocator)); return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
} }
...@@ -197,23 +190,13 @@ class AllocatorFacadePrivate { ...@@ -197,23 +190,13 @@ class AllocatorFacadePrivate {
~AllocatorFacadePrivate() = default; ~AllocatorFacadePrivate() = default;
AllocatorFacadePrivate() { AllocatorFacadePrivate() {
auto strategy = GetAllocatorStrategy(); if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
switch (strategy) { InitLegacyAllocator();
case AllocatorStrategy::kLegacy: { } else {
InitLegacyAllocator(); InitCPUAllocator();
break; InitCUDAAllocator();
} InitCUDAPinnedAllocator();
case AllocatorStrategy::kNaiveBestFit: { WrapZeroSizeAllocator();
InitCPUAllocator();
InitCUDAAllocator();
InitCUDAPinnedAllocator();
WrapZeroSizeAllocator();
break;
}
default: {
PADDLE_THROW("Unsupported allocator strategy: %d",
static_cast<int>(strategy));
}
} }
} }
...@@ -271,7 +254,8 @@ AllocatorFacade& AllocatorFacade::Instance() { ...@@ -271,7 +254,8 @@ AllocatorFacade& AllocatorFacade::Instance() {
std::shared_ptr<Allocation> AllocatorFacade::AllocShared( std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, Allocator::Attr attr) { const platform::Place& place, size_t size, Allocator::Attr attr) {
return std::shared_ptr<Allocation>(Alloc(place, size, attr)); return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
AllocationDeleter());
} }
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
......
...@@ -19,22 +19,16 @@ ...@@ -19,22 +19,16 @@
DEFINE_string( DEFINE_string(
allocator_strategy, "legacy", allocator_strategy, "legacy",
"The allocation strategy. Legacy means the original allocator of Fluid." "The allocation strategy. Legacy means the original allocator of Fluid."
"naive_best_fit means the experimental best fit allocator. " "New means the experimental allocators of Fluid. in [legacy, new]");
"allocator. Enum in [legacy, naive_best_fit].");
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
static AllocatorStrategy GetStrategyFromFlag() { static AllocatorStrategy GetStrategyFromFlag() {
if (FLAGS_allocator_strategy == "legacy") { return FLAGS_allocator_strategy == "legacy"
return AllocatorStrategy::kLegacy; ? AllocatorStrategy::kLegacy
} else if (FLAGS_allocator_strategy == "naive_best_fit") { : AllocatorStrategy::kNaiveBestFit;
return AllocatorStrategy::kNaiveBestFit;
} else {
PADDLE_THROW("Unsupported allocator strategy: %s",
FLAGS_allocator_strategy);
}
} }
AllocatorStrategy GetAllocatorStrategy() { AllocatorStrategy GetAllocatorStrategy() {
......
...@@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const { ...@@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const {
} }
return num; return num;
} }
void BestFitAllocator::FreeImpl(Allocation* allocation) { void BestFitAllocator::Free(Allocation* allocation) {
auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation); auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(bf_allocation, PADDLE_ENFORCE_NOT_NULL(bf_allocation,
"The input allocation is not BestFitAllocation."); "The input allocation is not BestFitAllocation.");
......
...@@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator { ...@@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator {
void InsertFreeNode(const ListIt& it); void InsertFreeNode(const ListIt& it);
protected: protected:
void FreeImpl(Allocation* allocation) override; void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private: private:
......
...@@ -22,11 +22,11 @@ namespace paddle { ...@@ -22,11 +22,11 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
BufferedAllocator::BufferedAllocator(std::shared_ptr<Allocator> allocator) BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator> &&allocator)
: underlying_allocator_(std::move(allocator)) { : underlying_allocator_(std::move(allocator)) {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
underlying_allocator_, underlying_allocator_,
"Underlying allocator of BufferedAllocator must not be null"); "Underlying allocator of BufferedAllocator must be unmanaged");
if (underlying_allocator_->IsAllocThreadSafe()) { if (underlying_allocator_->IsAllocThreadSafe()) {
mtx_.reset(new std::mutex()); mtx_.reset(new std::mutex());
} }
...@@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) { ...@@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) {
while (!allocations_.empty()) { // free the largest while (!allocations_.empty()) { // free the largest
auto it = --allocations_.end(); auto it = --allocations_.end();
cur += it->second->size(); cur += it->second->size();
underlying_allocator_->Free(it->second.release()); delete it->second.release();
allocations_.erase(it); allocations_.erase(it);
if (cur >= size) return; if (cur >= size) return;
} }
} }
bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } bool BufferedAllocator::IsAllocThreadSafe() const {
return this->underlying_allocator_->IsAllocThreadSafe();
void BufferedAllocator::FreeImpl(Allocation *allocation) { }
void BufferedAllocator::Free(Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_); platform::LockGuardPtr<std::mutex> guard(mtx_);
allocations_.emplace(allocation->size(), AllocationPtr(allocation)); allocations_.emplace(allocation->size(), AllocationPtr(allocation));
} }
Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
{ {
platform::LockGuardPtr<std::mutex> guard(mtx_); platform::LockGuardPtr<std::mutex> guard(mtx_);
...@@ -61,15 +61,17 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { ...@@ -61,15 +61,17 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (it != allocations_.end() && it->first < size * 2) { if (it != allocations_.end() && it->first < size * 2) {
AllocationPtr result(std::move(it->second)); AllocationPtr result(std::move(it->second));
allocations_.erase(it); allocations_.erase(it);
return result.release(); return new AllocationWithUnderlying(std::move(result));
} }
} }
try { try {
return underlying_allocator_->Allocate(size, attr).release(); return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
} catch (BadAlloc &) { } catch (BadAlloc &) {
FreeCache(size); FreeCache(size);
return underlying_allocator_->Allocate(size, attr).release(); return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
} }
} }
......
...@@ -31,7 +31,7 @@ namespace allocation { ...@@ -31,7 +31,7 @@ namespace allocation {
// underlying_allocator_ // underlying_allocator_
class BufferedAllocator : public Allocator { class BufferedAllocator : public Allocator {
public: public:
explicit BufferedAllocator(std::shared_ptr<Allocator> allocator); explicit BufferedAllocator(std::unique_ptr<Allocator> &&allocator);
~BufferedAllocator(); ~BufferedAllocator();
...@@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator { ...@@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator {
void FreeCache(size_t size); void FreeCache(size_t size);
protected: protected:
void FreeImpl(Allocation *allocation) override; void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
private: private:
std::shared_ptr<Allocator> underlying_allocator_; std::unique_ptr<Allocator> underlying_allocator_;
std::multimap<size_t, AllocationPtr> allocations_; std::multimap<size_t, AllocationPtr> allocations_;
std::unique_ptr<std::mutex> mtx_; std::unique_ptr<std::mutex> mtx_;
}; };
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/memory/allocation/buffered_allocator.h" #include "paddle/fluid/memory/allocation/buffered_allocator.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory>
#include <utility> #include <utility>
#include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h"
...@@ -65,7 +66,7 @@ class StubAllocator : public Allocator { ...@@ -65,7 +66,7 @@ class StubAllocator : public Allocator {
size_t GetFreeCount() const { return destruct_count_; } size_t GetFreeCount() const { return destruct_count_; }
protected: protected:
void FreeImpl(Allocation *allocation) override { void Free(Allocation *allocation) override {
auto *alloc = dynamic_cast<StubAllocation *>(allocation); auto *alloc = dynamic_cast<StubAllocation *>(allocation);
PADDLE_ENFORCE_NOT_NULL(alloc); PADDLE_ENFORCE_NOT_NULL(alloc);
if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr()); if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
......
...@@ -20,27 +20,25 @@ namespace paddle { ...@@ -20,27 +20,25 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
CPUAllocation::CPUAllocation(void *ptr, size_t size)
: Allocation(ptr, size, platform::CPUPlace()) {}
bool CPUAllocator::IsAllocThreadSafe() const { return true; } bool CPUAllocator::IsAllocThreadSafe() const { return true; }
void CPUAllocator::FreeImpl(Allocation *allocation) { void CPUAllocator::Free(Allocation *allocation) {
void *p = allocation->ptr(); PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation *>(allocation));
#ifdef _WIN32 free(allocation->ptr());
_aligned_free(p);
#else
free(p);
#endif
delete allocation; delete allocation;
} }
Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *p; void *ptr;
#ifdef _WIN32 auto status = posix_memalign(&ptr, kAlignment, size);
p = _aligned_malloc(size, kAlignment); if (UNLIKELY(status) != 0) {
#else throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d",
PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!", size, status));
size); }
#endif return new CPUAllocation(ptr, size);
return new Allocation(p, size, platform::CPUPlace());
} }
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
......
...@@ -31,13 +31,19 @@ namespace allocation { ...@@ -31,13 +31,19 @@ namespace allocation {
// //
// NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import // NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import
// an open-sourced allocator into Paddle. // an open-sourced allocator into Paddle.
class CPUAllocator;
// Allocation subtype that takes only a pointer and a size; it declares no
// additional members. The constructor is declared here and defined out of
// line.
class CPUAllocation : public Allocation {
public:
// `ptr` is the start of the memory region and `size` its length.
CPUAllocation(void* ptr, size_t size);
};
class CPUAllocator : public Allocator { class CPUAllocator : public Allocator {
public: public:
constexpr static size_t kAlignment = 4096UL; constexpr static size_t kAlignment = 64u;
bool IsAllocThreadSafe() const override; bool IsAllocThreadSafe() const override;
protected: protected:
void FreeImpl(Allocation* allocation) override; void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
}; };
} // namespace allocation } // namespace allocation
......
...@@ -23,14 +23,15 @@ namespace paddle { ...@@ -23,14 +23,15 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
bool CUDAAllocator::IsAllocThreadSafe() const { return true; } bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
void CUDAAllocator::FreeImpl(Allocation* allocation) { void CUDAAllocator::Free(Allocation* allocation) {
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(allocation->place()), auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(cuda_allocation);
PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()),
place_); place_);
PADDLE_ENFORCE(cudaFree(allocation->ptr())); PADDLE_ENFORCE(cudaFree(allocation->ptr()));
delete allocation; delete allocation;
} }
Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
void* ptr; void* ptr;
...@@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { ...@@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
"Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
status, cudaGetErrorString(status))); status, cudaGetErrorString(status)));
} }
return new Allocation(ptr, size, platform::Place(place_)); return new CUDAAllocation(ptr, size, platform::Place(place_));
} }
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
...@@ -20,6 +20,13 @@ namespace paddle { ...@@ -20,6 +20,13 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
// Allocation type used together with the CUDA system allocator.
// It is purely a marker type: it inherits every Allocation constructor
// through the using-declaration below and adds no members of its own.
class CUDAAllocation : public Allocation {
public:
using Allocation::Allocation;
};
class CUDAAllocator : public Allocator { class CUDAAllocator : public Allocator {
public: public:
explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {} explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {}
...@@ -28,7 +35,7 @@ class CUDAAllocator : public Allocator { ...@@ -28,7 +35,7 @@ class CUDAAllocator : public Allocator {
bool IsAllocThreadSafe() const override; bool IsAllocThreadSafe() const override;
protected: protected:
void FreeImpl(Allocation* allocation) override; void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private: private:
......
...@@ -134,22 +134,26 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) { ...@@ -134,22 +134,26 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList { BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
public: static std::once_flag init_flag;
GPUBuddyAllocatorList() static detail::BuddyAllocator **a_arr = nullptr;
: allocators_(platform::GetCUDADeviceCount()), static std::vector<int> devices;
flags_(platform::GetCUDADeviceCount()) {
allocation::GPUMemMonitor.Initialize(allocators_.size()); std::call_once(init_flag, [gpu_id]() {
} devices = platform::GetSelectedDevices();
int gpu_num = devices.size();
BuddyAllocator *Get(size_t dev_id) { allocation::GPUMemMonitor.Initialize(devices.size());
PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
std::call_once(flags_[dev_id], [this, dev_id] { a_arr = new BuddyAllocator *[gpu_num];
for (size_t i = 0; i < devices.size(); ++i) {
int dev_id = devices[i];
a_arr[i] = nullptr;
platform::SetDeviceId(dev_id); platform::SetDeviceId(dev_id);
allocators_[dev_id] = new BuddyAllocator( a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
std::unique_ptr<detail::SystemAllocator>( new detail::GPUAllocator(dev_id)),
new detail::GPUAllocator(dev_id)), platform::GpuMinChunkSize(),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE:\n" VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable " << "You can set GFlags environment variable "
...@@ -163,19 +167,13 @@ class GPUBuddyAllocatorList { ...@@ -163,19 +167,13 @@ class GPUBuddyAllocatorList {
<< FLAGS_initial_gpu_memory_in_mb << FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
}); }
return allocators_[dev_id]; });
}
private:
std::vector<BuddyAllocator *> allocators_;
std::vector<std::once_flag> flags_;
};
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static GPUBuddyAllocatorList allocators;
platform::SetDeviceId(gpu_id); platform::SetDeviceId(gpu_id);
return allocators.Get(gpu_id); auto pos = std::distance(devices.begin(),
std::find(devices.begin(), devices.end(), gpu_id));
return a_arr[pos];
} }
#endif #endif
...@@ -194,7 +192,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, ...@@ -194,7 +192,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size); auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr && size > 0) { if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId(); int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device); platform::SetDeviceId(place.device);
size_t avail, total; size_t avail, total;
...@@ -349,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { ...@@ -349,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
return tmp_alloc; return tmp_alloc;
} }
void LegacyAllocator::FreeImpl(Allocation *allocation) { void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor( boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()), legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place()); allocation->place());
......
...@@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator { ...@@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator {
protected: protected:
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
void FreeImpl(Allocation *allocation) override; void Free(Allocation *allocation) override;
private: private:
platform::Place place_; platform::Place place_;
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include <utility> #include <utility>
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h" #include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
#include "paddle/fluid/platform/lock_guard_ptr.h" #include "paddle/fluid/platform/lock_guard_ptr.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
...@@ -25,24 +24,26 @@ namespace allocation { ...@@ -25,24 +24,26 @@ namespace allocation {
bool LockedAllocator::IsAllocThreadSafe() const { return true; } bool LockedAllocator::IsAllocThreadSafe() const { return true; }
LockedAllocator::LockedAllocator( LockedAllocator::LockedAllocator(
std::shared_ptr<Allocator> underlying_allocator) std::unique_ptr<Allocator> &&underlying_allocator)
: underlying_allocator_(std::move(underlying_allocator)) { : underlying_allocator_(std::move(underlying_allocator)) {
PADDLE_ENFORCE_NOT_NULL(underlying_allocator_); PADDLE_ENFORCE_NOT_NULL(underlying_allocator_);
if (!underlying_allocator_->IsAllocThreadSafe()) { if (!underlying_allocator_->IsAllocThreadSafe()) {
mtx_.reset(new std::mutex()); mtx_.reset(new std::mutex());
} }
} }
void LockedAllocator::Free(Allocation *allocation) {
void LockedAllocator::FreeImpl(Allocation *allocation) { {
platform::LockGuardPtr<std::mutex> guard(mtx_); platform::LockGuardPtr<std::mutex> guard(mtx_);
underlying_allocator_->Free(allocation); reinterpret_cast<AllocationWithUnderlying *>(allocation)
->allocation_.reset(); // Destroy inner allocation
}
delete allocation;
} }
Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
platform::LockGuardPtr<std::mutex> guard(mtx_); platform::LockGuardPtr<std::mutex> guard(mtx_);
return underlying_allocator_->Allocate(size, attr).release(); return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
} }
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
...@@ -24,15 +24,15 @@ namespace allocation { ...@@ -24,15 +24,15 @@ namespace allocation {
// A allocator to make underlying allocator thread safe. // A allocator to make underlying allocator thread safe.
class LockedAllocator : public Allocator { class LockedAllocator : public Allocator {
public: public:
explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator); explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator);
bool IsAllocThreadSafe() const override; bool IsAllocThreadSafe() const override;
protected: protected:
void FreeImpl(Allocation *allocation) override; void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
private: private:
std::shared_ptr<Allocator> underlying_allocator_; std::unique_ptr<Allocator> underlying_allocator_;
std::unique_ptr<std::mutex> mtx_; std::unique_ptr<std::mutex> mtx_;
}; };
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
// Smoke test of AllocatorFacade with the "naive_best_fit" strategy.
// For each place available in this build it requests memory and checks that
// the returned allocation is non-null, has a non-null pointer, reports the
// requested place, and reports the requested size (exactly on CPU, at least
// that much on the CUDA places).
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
  FLAGS_gpu_allocator_retry_time = 500;
  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
  FLAGS_allocator_strategy = "naive_best_fit";

  auto &facade = AllocatorFacade::Instance();

  {
    // Host memory: 1 KB, size must match the request exactly.
    const platform::Place cpu_place = platform::CPUPlace();
    const size_t bytes = 1024;
    auto allocation = facade.Alloc(cpu_place, bytes);
    ASSERT_NE(allocation, nullptr);
    ASSERT_NE(allocation->ptr(), nullptr);
    ASSERT_EQ(allocation->place(), cpu_place);
    ASSERT_EQ(allocation->size(), bytes);
  }

#ifdef PADDLE_WITH_CUDA
  {
    // Small request on GPU 0.
    const platform::Place gpu_place = platform::CUDAPlace(0);
    const size_t bytes = 1024;
    auto allocation = facade.Alloc(gpu_place, bytes);
    ASSERT_NE(allocation, nullptr);
    ASSERT_NE(allocation->ptr(), nullptr);
    ASSERT_EQ(allocation->place(), gpu_place);
    ASSERT_GE(allocation->size(), bytes);
  }

  {
    // Large request on GPU 0: 2 GB.
    const platform::Place gpu_place = platform::CUDAPlace(0);
    const size_t bytes = static_cast<size_t>(1) << 31;
    auto allocation = facade.Alloc(gpu_place, bytes);
    ASSERT_NE(allocation, nullptr);
    ASSERT_NE(allocation->ptr(), nullptr);
    ASSERT_EQ(allocation->place(), gpu_place);
    ASSERT_GE(allocation->size(), bytes);
  }

  {
    // CUDA pinned host memory: 1 MB.
    const platform::Place pinned_place = platform::CUDAPinnedPlace();
    const size_t bytes = (1 << 20);
    auto allocation = facade.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
    ASSERT_NE(allocation, nullptr);
    ASSERT_NE(allocation->ptr(), nullptr);
    ASSERT_EQ(allocation->place(), pinned_place);
    ASSERT_GE(allocation->size(), bytes);
  }
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
...@@ -20,15 +20,20 @@ namespace paddle { ...@@ -20,15 +20,20 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { void CPUPinnedAllocator::Free(Allocation *allocation) {
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation *>(allocation));
PADDLE_ENFORCE(cudaFreeHost(allocation->ptr())); PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
delete allocation; delete allocation;
} }
Allocation *CPUPinnedAllocator::AllocateImpl(size_t size, Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
Allocator::Attr attr) { Allocator::Attr attr) {
// PADDLE_ENFORCE_EQ(
// attr, kCrossDevice,
// "CPUPinnedAllocator should be used for Cross-Device Communication");
void *ptr; void *ptr;
PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
return new Allocation(ptr, size, platform::CUDAPinnedPlace()); return new CPUPinnedAllocation(ptr, size);
} }
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
......
...@@ -20,12 +20,18 @@ namespace memory { ...@@ -20,12 +20,18 @@ namespace memory {
namespace allocation { namespace allocation {
// Allocator uses `cudaHostAlloc` // Allocator uses `cudaHostAlloc`
// Allocation subtype whose place is always platform::CUDAPinnedPlace().
// It adds no members beyond those of Allocation.
class CPUPinnedAllocation : public Allocation {
public:
// Records `size` bytes starting at `ptr`.
CPUPinnedAllocation(void *ptr, size_t size)
: Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
};
class CPUPinnedAllocator : public Allocator { class CPUPinnedAllocator : public Allocator {
public: public:
bool IsAllocThreadSafe() const override; bool IsAllocThreadSafe() const override;
protected: protected:
void FreeImpl(Allocation *allocation) override; void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
}; };
......
...@@ -18,15 +18,25 @@ namespace paddle { ...@@ -18,15 +18,25 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
void RetryAllocator::FreeImpl(Allocation* allocation) { bool RetryAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
void RetryAllocator::Free(Allocation* allocation) {
// Delete underlying allocation first. // Delete underlying allocation first.
underlying_allocator_->Free(allocation); reinterpret_cast<AllocationWithUnderlying*>(allocation)->allocation_.reset();
cv_.notify_all(); {
// notify all waited allocators, they can try to allocate memory after free.
std::lock_guard<std::mutex> lock(mutex_);
cv_.notify_all();
}
delete allocation;
} }
Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
auto alloc_func = [&, this]() { auto alloc_func = [&, this]() {
return underlying_allocator_->Allocate(size, attr).release(); return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
}; };
// In fact, we can unify the code of allocation success and failure // In fact, we can unify the code of allocation success and failure
// But it would add lock even when allocation success at the first time // But it would add lock even when allocation success at the first time
......
...@@ -25,25 +25,32 @@ namespace paddle { ...@@ -25,25 +25,32 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
class RetryAllocator;
class RetryAllocator : public Allocator { class RetryAllocator : public Allocator {
public: public:
RetryAllocator(std::shared_ptr<Allocator> allocator, size_t retry_ms) RetryAllocator(std::unique_ptr<Allocator>&& allocator, size_t retry_ms)
: underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) { : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) {
EnforceCheck();
}
bool IsAllocThreadSafe() const override;
private:
void EnforceCheck() {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
underlying_allocator_, underlying_allocator_.get(),
"UnderlyingAllocator of RetryAllocator must not be null"); "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator");
PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(), PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(),
"UnderlyingAllocator of RetryAllocator must be thread-safe"); "UnderlyingAllocator of RetryAllocator must be thread-safe");
} }
bool IsAllocThreadSafe() const override { return true; }
protected: protected:
void FreeImpl(Allocation* allocation) override; void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private: private:
std::shared_ptr<Allocator> underlying_allocator_; std::unique_ptr<Allocator> underlying_allocator_;
std::chrono::milliseconds retry_time_; std::chrono::milliseconds retry_time_;
std::mutex mutex_; std::mutex mutex_;
std::condition_variable cv_; std::condition_variable cv_;
...@@ -51,6 +58,8 @@ class RetryAllocator : public Allocator { ...@@ -51,6 +58,8 @@ class RetryAllocator : public Allocator {
// For debug, We can add an atomic integer to record how many memory sizes are // For debug, We can add an atomic integer to record how many memory sizes are
// waited to allocate // waited to allocate
// std::atomic<size_t> waited_allocate_size_{0}; // std::atomic<size_t> waited_allocate_size_{0};
friend class RetryAllocation;
}; };
} // namespace allocation } // namespace allocation
......
...@@ -24,20 +24,11 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const { ...@@ -24,20 +24,11 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) { if (size == 0) {
return new Allocation(nullptr, 0, place_); return new ZeroSizeAllocation(place_);
} else { } else {
return underlying_allocator_->Allocate(size, attr).release(); return underlying_allocator_->Allocate(size, attr).release();
} }
} }
// Releases `allocation`. An allocation with a non-zero size is passed to the
// underlying allocator's Free(); a zero-size allocation is deleted directly.
void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
  if (allocation->size() != 0) {
    underlying_allocator_->Free(allocation);
    return;
  }
  delete allocation;
}
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
...@@ -24,6 +24,12 @@ namespace allocation { ...@@ -24,6 +24,12 @@ namespace allocation {
// The allocator handles the request's size is zero. Allocator will always // The allocator handles the request's size is zero. Allocator will always
// return an allocation even the request size is zero. However, the // return an allocation even the request size is zero. However, the
// allocation.ptr() is nullptr // allocation.ptr() is nullptr
// Allocation that represents an empty request: its pointer is nullptr and
// its size is 0; only the place `p` is supplied by the caller.
class ZeroSizeAllocation : public Allocation {
public:
explicit ZeroSizeAllocation(const platform::Place& p)
: Allocation(nullptr, 0, p) {}
};
class ZeroSizeAllocator : public Allocator { class ZeroSizeAllocator : public Allocator {
public: public:
ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator, ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
...@@ -34,7 +40,6 @@ class ZeroSizeAllocator : public Allocator { ...@@ -34,7 +40,6 @@ class ZeroSizeAllocator : public Allocator {
protected: protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
void FreeImpl(Allocation* allocation) override;
private: private:
std::shared_ptr<Allocator> underlying_allocator_; std::shared_ptr<Allocator> underlying_allocator_;
......
...@@ -65,7 +65,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { ...@@ -65,7 +65,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
// Get numel and dtype // Get numel and dtype
size_t numel = 0; size_t numel = 0;
auto dtype = kDefaultDtype; auto dtype = kDefaultDtype;
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype); GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype,
context.GetPlace());
// Alloc the continuous space // Alloc the continuous space
auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput"); auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
...@@ -74,14 +75,18 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { ...@@ -74,14 +75,18 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
// Init the continuous space // Init the continuous space
auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output"); auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output");
int64_t offset = 0; size_t offset = 0;
size_t size_of_dtype = framework::SizeOfType(dtype);
if (context.Attr<bool>("copy_data")) { if (context.Attr<bool>("copy_data")) {
for (size_t i = 0; i < in_var_names.size(); ++i) { for (size_t i = 0; i < in_var_names.size(); ++i) {
int64_t len = out_tensors[i]->numel(); size_t len = static_cast<size_t>(in_tensors[i]->numel());
auto sub_tensor = fused_tensor->Slice(offset, offset + len); auto sub_tensor = fused_tensor->Slice(
offset += len; static_cast<int64_t>(offset), static_cast<int64_t>(offset + len));
framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx, framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor); &sub_tensor);
offset +=
Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
} }
} else if (context.Attr<bool>("set_constant")) { } else if (context.Attr<bool>("set_constant")) {
math::SetConstant<DeviceContext, T> set_constant; math::SetConstant<DeviceContext, T> set_constant;
...@@ -92,11 +97,13 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { ...@@ -92,11 +97,13 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
// Make the outputs point to the continuous space. // Make the outputs point to the continuous space.
offset = 0; offset = 0;
for (size_t i = 0; i < out_tensors.size(); ++i) { for (size_t i = 0; i < out_tensors.size(); ++i) {
int64_t len = out_tensors[i]->numel(); size_t len = static_cast<size_t>(out_tensors[i]->numel());
auto dim = out_tensors[i]->dims(); auto dim = out_tensors[i]->dims();
out_tensors[i] out_tensors[i]
->ShareDataWith(fused_tensor->Slice(offset, offset + len)) ->ShareDataWith(fused_tensor->Slice(
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
.Resize(dim); .Resize(dim);
len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
offset += len; offset += len;
VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i] VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i]
<< ") ,dim:(" << dim << ")" << ") ,dim:(" << dim << ")"
...@@ -104,12 +111,28 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { ...@@ -104,12 +111,28 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
} }
} }
private:
// Note(zcd): Addresses should be aligned, otherwise, the results may have
// diff.
// Rounds `size` up to the nearest multiple of the alignment chosen for
// `place`: 256 (1 << 8) for GPU places and 4096 (1 << 12) for every other
// place. A `size` that is already a multiple is returned unchanged.
size_t Alignment(size_t size, const platform::Place &place) const {
  const size_t alignment =
      platform::is_gpu_place(place) ? (1 << 8) : (1 << 12);
  // Distance to the next multiple; the outer modulo maps a full `alignment`
  // step to zero when `size` is already aligned.
  const size_t padding = (alignment - size % alignment) % alignment;
  return size + padding;
}
void GetMemSizeAndDtype( void GetMemSizeAndDtype(
const std::vector<const framework::LoDTensor *> &lod_tensors, const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel, const std::vector<std::string> var_names, size_t *numel,
framework::proto::VarType::Type *dtype) const { framework::proto::VarType::Type *dtype,
const platform::Place &place) const {
PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
*numel = 0; *numel = 0;
size_t size_of_dtype = 0;
for (size_t i = 0; i < var_names.size(); ++i) { for (size_t i = 0; i < var_names.size(); ++i) {
PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.", PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
var_names[i]); var_names[i]);
...@@ -119,6 +142,7 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { ...@@ -119,6 +142,7 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.", PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.",
var_names[i], kDefaultDtype); var_names[i], kDefaultDtype);
*dtype = p_dtype; *dtype = p_dtype;
size_of_dtype = framework::SizeOfType(p_dtype);
} }
PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal."); PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal.");
...@@ -126,7 +150,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { ...@@ -126,7 +150,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_GT(size, 0); PADDLE_ENFORCE_GT(size, 0);
VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:(" VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:("
<< lod_tensors[i]->dims() << ")"; << lod_tensors[i]->dims() << ")";
*numel += size; *numel += Alignment(static_cast<size_t>(size) * size_of_dtype, place) /
size_of_dtype;
} }
} }
}; };
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/bpr_loss_op.h" #include "paddle/fluid/operators/bpr_loss_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -127,6 +128,23 @@ neural networks>(https://arxiv.org/abs/1511.06939) ...@@ -127,6 +128,23 @@ neural networks>(https://arxiv.org/abs/1511.06939)
)DOC"); )DOC");
} }
}; };
// Builds the description of the `bpr_loss_grad` op.
// Inputs: X, Label and the gradient of output Y. Output: the gradient of X.
// All attributes of the forward op are copied over.
class BprLossGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
    grad_op->SetType("bpr_loss_grad");

    // Forward-pass inputs that the gradient op also reads.
    grad_op->SetInput("X", Input("X"));
    grad_op->SetInput("Label", Input("Label"));

    // Incoming gradient of Y and outgoing gradient of X.
    const auto y_grad_name = framework::GradVarName("Y");
    grad_op->SetInput(y_grad_name, OutputGrad("Y"));
    const auto x_grad_name = framework::GradVarName("X");
    grad_op->SetOutput(x_grad_name, InputGrad("X"));

    grad_op->SetAttrMap(Attrs());
    return grad_op;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -134,7 +152,7 @@ namespace ops = paddle::operators; ...@@ -134,7 +152,7 @@ namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext; using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker, REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::BprLossGradDescMaker);
REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp); REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>, REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>,
ops::BprLossOpKernel<CPUCtx, double>); ops::BprLossOpKernel<CPUCtx, double>);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -568,13 +569,31 @@ class ROIPerspectiveTransformOpMaker ...@@ -568,13 +569,31 @@ class ROIPerspectiveTransformOpMaker
} }
}; };
// Builds the description of the `roi_perspective_transform_grad` op.
// Inputs: X, ROIs and the gradient of output Out. Output: the gradient of X.
// All attributes of the forward op are copied over.
class ROIPerspectiveTransformGradDescMaker
    : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
    grad_op->SetType("roi_perspective_transform_grad");

    // Forward-pass inputs that the gradient op also reads.
    grad_op->SetInput("X", Input("X"));
    grad_op->SetInput("ROIs", Input("ROIs"));

    // Incoming gradient of Out and outgoing gradient of X.
    const auto out_grad_name = framework::GradVarName("Out");
    grad_op->SetInput(out_grad_name, OutputGrad("Out"));
    const auto x_grad_name = framework::GradVarName("X");
    grad_op->SetOutput(x_grad_name, InputGrad("X"));

    grad_op->SetAttrMap(Attrs());
    return grad_op;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp, REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp,
ops::ROIPerspectiveTransformOpMaker, ops::ROIPerspectiveTransformOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::ROIPerspectiveTransformGradDescMaker);
REGISTER_OPERATOR(roi_perspective_transform_grad, REGISTER_OPERATOR(roi_perspective_transform_grad,
ops::ROIPerspectiveTransformGradOp); ops::ROIPerspectiveTransformGradOp);
REGISTER_OP_CPU_KERNEL(roi_perspective_transform, REGISTER_OP_CPU_KERNEL(roi_perspective_transform,
......
...@@ -77,7 +77,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> { ...@@ -77,7 +77,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
} else { } else {
functor.RunMidWise(n, pre, post); functor.RunMidWise(n, pre, post);
} }
z->set_mkldnn_prim_desc(x->get_mkldnn_prim_desc()); z->set_layout(DataLayout::kMKLDNN);
z->set_format(x->format());
} else { } else {
PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
x->format() != memory::format::format_undef, x->format() != memory::format::format_undef,
...@@ -115,8 +116,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> { ...@@ -115,8 +116,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd); auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
// create mkldnn memory for dst // create mkldnn memory for dst
auto dst_mem_pd = sum_pd.dst_primitive_desc(); memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
memory dst_memory = memory(dst_mem_pd, z_data);
std::vector<primitive::at> inputs; std::vector<primitive::at> inputs;
inputs.push_back(srcs[0]); inputs.push_back(srcs[0]);
...@@ -129,7 +129,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> { ...@@ -129,7 +129,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
pipeline.push_back(sum_prim); pipeline.push_back(sum_prim);
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
z->set_mkldnn_prim_desc(dst_mem_pd); z->set_layout(DataLayout::kMKLDNN);
z->set_format(
(memory::format)dst_memory.get_primitive_desc().desc().data.format);
} }
} }
}; };
...@@ -150,19 +152,24 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> { ...@@ -150,19 +152,24 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
auto* out = dout; auto* out = dout;
auto *x = dout, *y = dout; auto *x = dout, *y = dout;
auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
in->set_layout(DataLayout::kMKLDNN);
in->set_format(out->format());
};
if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) { if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
if (dx->dims() == dy->dims()) { if (dx->dims() == dy->dims()) {
auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx); auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
if (dx) { if (dx) {
blas.VCOPY(dout->numel(), dout->data<T>(), blas.VCOPY(dout->numel(), dout->data<T>(),
dx->mutable_data<T>(ctx.GetPlace())); dx->mutable_data<T>(ctx.GetPlace()));
dx->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); set_mkldnn_format(dx, dout);
} }
if (dy) { if (dy) {
blas.VCOPY(dout->numel(), dout->data<T>(), blas.VCOPY(dout->numel(), dout->data<T>(),
dy->mutable_data<T>(ctx.GetPlace())); dy->mutable_data<T>(ctx.GetPlace()));
dy->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); set_mkldnn_format(dy, dout);
} }
} }
} else { } else {
......
...@@ -65,11 +65,17 @@ by input arguments. ...@@ -65,11 +65,17 @@ by input arguments.
} }
}; };
// Declares GaussianRandomBatchSizeLikeNoNeedBufferVarsInference for the input
// slot "Input"; the class is passed to the operator registration of
// gaussian_random_batch_size_like.
// NOTE(review): presumably this tells the framework that only the metadata of
// "Input" is needed, not its data buffer — confirm against the macro's
// definition.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT( REGISTER_OPERATOR(
gaussian_random_batch_size_like, gaussian_random_batch_size_like,
paddle::operators::GaussianRandomBatchSizeLikeOp, paddle::operators::GaussianRandomBatchSizeLikeOp,
paddle::operators::GaussianRandomBatchSizeLikeOpMaker); paddle::operators::GaussianRandomBatchSizeLikeOpMaker,
paddle::framework::EmptyGradOpMaker,
paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference);
// Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/im2sequence_op.h" #include "paddle/fluid/operators/im2sequence_op.h"
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -146,12 +147,28 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel { ...@@ -146,12 +147,28 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel {
} }
}; };
// Grad-op description maker for im2sequence.
//
// Produces one "im2sequence_grad" op description whose inputs are the forward
// input "X" and the gradient of "Out", and whose only output is the gradient
// of "X". The forward op's attributes are copied as-is.
class Im2SequenceGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("im2sequence_grad");
    grad_desc->SetAttrMap(Attrs());

    // Inputs consumed by the backward op.
    grad_desc->SetInput("X", Input("X"));
    const auto out_grad_name = framework::GradVarName("Out");
    grad_desc->SetInput(out_grad_name, OutputGrad("Out"));

    // Output produced by the backward op.
    const auto x_grad_name = framework::GradVarName("X");
    grad_desc->SetOutput(x_grad_name, InputGrad("X"));

    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::Im2SequenceGradDescMaker);
REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp); REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
im2sequence, im2sequence,
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/operators/interpolate_op.h"
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -194,21 +195,46 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { ...@@ -194,21 +195,46 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), return framework::OpKernelType(
ctx.GetPlace()); ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
ctx.GetPlace());
}
};
// Grad-op description maker used by the interpolate operators.
//
// The grad op type is derived from the forward op type by appending "_grad",
// so the same maker can serve every forward op it is registered with.
// Inputs:  "X", the gradient of "Out", and "OutSize" when the forward op has it.
// Output:  the gradient of "X".
// The forward op's attributes are copied unchanged.
class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
// e.g. a forward type "foo" yields the grad type "foo_grad".
op->SetType(ForwardOp().Type() + "_grad");
op->SetInput("X", Input("X"));
// "OutSize" is forwarded only if the forward op description contains it.
if (ForwardOp().Inputs().count("OutSize") > 0) {
op->SetInput("OutSize", Input("OutSize"));
}
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
} }
}; };
// Declares InterpolateGradNoNeedBufferVarsInference for the input slot "X";
// the class is attached to the bilinear_interp_grad and nearest_interp_grad
// registrations.
// NOTE(review): presumably this marks "X" as needed for metadata only (not its
// data buffer) in the grad op — confirm against the macro's definition.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(InterpolateGradNoNeedBufferVarsInference,
"X");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::InterpolateGradDescMaker);
REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad); REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad,
ops::InterpolateGradNoNeedBufferVarsInference);
REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker, REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::InterpolateGradDescMaker);
REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad); REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad,
ops::InterpolateGradNoNeedBufferVarsInference);
REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel<float>, REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel<float>,
ops::InterpolateKernel<double>, ops::InterpolateKernel<double>,
ops::InterpolateKernel<uint8_t>); ops::InterpolateKernel<uint8_t>);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/l1_norm_op.h" #include "paddle/fluid/operators/l1_norm_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -62,12 +63,28 @@ $$Out = \sum{|X|}$$ ...@@ -62,12 +63,28 @@ $$Out = \sum{|X|}$$
} }
}; };
// Grad-op description maker for l1_norm.
//
// Produces one "l1_norm_grad" op description: it reads the forward input "X"
// and the gradient of "Out", writes the gradient of "X", and reuses the
// forward op's attributes.
class L1NormGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("l1_norm_grad");
    grad_desc->SetAttrMap(Attrs());

    const auto out_grad_name = framework::GradVarName("Out");
    const auto x_grad_name = framework::GradVarName("X");

    // Inputs first, then the single gradient output.
    grad_desc->SetInput("X", Input("X"));
    grad_desc->SetInput(out_grad_name, OutputGrad("Out"));
    grad_desc->SetOutput(x_grad_name, InputGrad("X"));

    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::L1NormGradDescMaker);
REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp); REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>); l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/label_smooth_op.h" #include "paddle/fluid/operators/label_smooth_op.h"
#include <memory>
#include <string> #include <string>
namespace paddle { namespace paddle {
...@@ -105,10 +106,23 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel { ...@@ -105,10 +106,23 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); ctx->SetOutputDim(framework::GradVarName("X"),
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), ctx->GetInputDim(framework::GradVarName("Out")));
"Input(Out@GRAD) shouldn't be null."); }
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); };
// Grad-op description maker for label_smooth.
//
// Builds a "label_smooth_grad" op description whose only input is the gradient
// of "Out" (the forward input "X" is not forwarded) and whose only output is
// the gradient of "X". The forward op's attributes are copied unchanged.
class LabelSmoothGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("label_smooth_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
} }
}; };
...@@ -117,7 +131,7 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel { ...@@ -117,7 +131,7 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::LabelSmoothGradDescMaker);
REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp); REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
label_smooth, label_smooth,
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/linear_chain_crf_op.h" #include "paddle/fluid/operators/linear_chain_crf_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -250,14 +251,46 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { ...@@ -250,14 +251,46 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
} }
}; };
// Grad-op description maker for linear_chain_crf.
//
// Produces one "linear_chain_crf_grad" op description that
//   - reads the forward inputs "Emission", "Transition" and "Label",
//   - reads the forward outputs "Alpha", "EmissionExps" and "TransitionExps",
//   - reads the gradient of "LogLikelihood",
//   - writes the gradients of "Emission" and "Transition",
//   - carries the forward op's attributes unchanged.
class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("linear_chain_crf_grad");
    grad_desc->SetAttrMap(Attrs());

    // Forward inputs.
    grad_desc->SetInput("Emission", Input("Emission"));
    grad_desc->SetInput("Transition", Input("Transition"));
    grad_desc->SetInput("Label", Input("Label"));

    // Forward outputs reused by the backward op.
    grad_desc->SetInput("Alpha", Output("Alpha"));
    grad_desc->SetInput("EmissionExps", Output("EmissionExps"));
    grad_desc->SetInput("TransitionExps", Output("TransitionExps"));

    // Incoming gradient.
    const auto log_likelihood_grad = framework::GradVarName("LogLikelihood");
    grad_desc->SetInput(log_likelihood_grad, OutputGrad("LogLikelihood"));

    // Gradients produced by the backward op.
    const auto emission_grad = framework::GradVarName("Emission");
    grad_desc->SetOutput(emission_grad, InputGrad("Emission"));
    const auto transition_grad = framework::GradVarName("Transition");
    grad_desc->SetOutput(transition_grad, InputGrad("Transition"));

    return grad_desc;
  }
};
// Declares LinearChainCRFGradNoNeedBufferVarsInference for the input slots
// "Transition" and "Emission"; the class is attached to the
// linear_chain_crf_grad registration.
// NOTE(review): presumably this marks both inputs as needed for metadata only
// (not their data buffers) in the grad op — confirm against the macro's
// definition and the grad kernel.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
LinearChainCRFGradNoNeedBufferVarsInference, "Transition", "Emission");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp, REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp,
ops::LinearChainCRFOpMaker, ops::LinearChainCRFOpMaker, ops::LinearChainCRFGradDescMaker);
paddle::framework::DefaultGradOpDescMaker<true>); REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp,
REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp); ops::LinearChainCRFGradNoNeedBufferVarsInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
linear_chain_crf, linear_chain_crf,
ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>, ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/log_loss_op.h" #include "paddle/fluid/operators/log_loss_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -100,12 +101,29 @@ class LogLossGradOp : public framework::OperatorWithKernel { ...@@ -100,12 +101,29 @@ class LogLossGradOp : public framework::OperatorWithKernel {
} }
}; };
// Grad-op description maker for log_loss.
//
// Produces one "log_loss_grad" op description: it reads the forward inputs
// "Predicted" and "Labels" plus the gradient of "Loss", and writes the
// gradient of "Predicted" only (no gradient output is declared for "Labels").
// The forward op's attributes are copied unchanged.
class LogLossGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("log_loss_grad");
    grad_desc->SetAttrMap(Attrs());

    // Inputs consumed by the backward op.
    grad_desc->SetInput("Predicted", Input("Predicted"));
    grad_desc->SetInput("Labels", Input("Labels"));
    const auto loss_grad_name = framework::GradVarName("Loss");
    grad_desc->SetInput(loss_grad_name, OutputGrad("Loss"));

    // Output produced by the backward op.
    const auto predicted_grad_name = framework::GradVarName("Predicted");
    grad_desc->SetOutput(predicted_grad_name, InputGrad("Predicted"));

    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>); ops::LogLossGradDescMaker);
REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>); log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/lstm_op.h" #include "paddle/fluid/operators/lstm_op.h"
#include <memory>
#include <string> #include <string>
namespace paddle { namespace paddle {
...@@ -264,12 +265,51 @@ class LSTMGradOp : public framework::OperatorWithKernel { ...@@ -264,12 +265,51 @@ class LSTMGradOp : public framework::OperatorWithKernel {
} }
}; };
// Grad-op description maker for lstm.
//
// Produces one "lstm_grad" op description. For each of "Input", "Weight" and
// "Bias" the forward variable is passed in and its gradient is declared as an
// output; the same is done for "H0" and "C0", but only when the forward op
// description contains that input. The forward outputs "Cell", "Hidden",
// "BatchGate" and "BatchCellPreAct" and the gradient of "Hidden" are passed in
// as additional inputs. The forward op's attributes are copied unchanged.
class LSTMGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("lstm_grad");
    grad_desc->SetAttrMap(Attrs());

    // Passes the forward input `slot` to the grad op and declares the
    // matching gradient as one of the grad op's outputs.
    auto wire_input_and_grad = [&](const std::string& slot) {
      grad_desc->SetInput(slot, Input(slot));
      grad_desc->SetOutput(framework::GradVarName(slot), InputGrad(slot));
    };
    // True when the forward op description has an input named `slot`.
    auto forward_has_input = [&](const std::string& slot) {
      return ForwardOp().Inputs().count(slot) > 0;
    };

    wire_input_and_grad("Input");
    // The initial states are optional inputs of the forward op.
    if (forward_has_input("H0")) {
      wire_input_and_grad("H0");
    }
    if (forward_has_input("C0")) {
      wire_input_and_grad("C0");
    }
    wire_input_and_grad("Weight");
    wire_input_and_grad("Bias");

    // Forward outputs and the incoming gradient used by the backward op.
    grad_desc->SetInput("Cell", Output("Cell"));
    grad_desc->SetInput("Hidden", Output("Hidden"));
    grad_desc->SetInput(framework::GradVarName("Hidden"),
                        OutputGrad("Hidden"));
    grad_desc->SetInput("BatchGate", Output("BatchGate"));
    grad_desc->SetInput("BatchCellPreAct", Output("BatchCellPreAct"));

    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker, REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::LSTMGradOpDescMaker);
REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>, lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/margin_rank_loss_op.h" #include "paddle/fluid/operators/margin_rank_loss_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -94,8 +95,6 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel { ...@@ -94,8 +95,6 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null."); "Input(Out@GRAD) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("Activated"), PADDLE_ENFORCE(ctx->HasInput("Activated"),
...@@ -106,13 +105,31 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel { ...@@ -106,13 +105,31 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
} }
}; };
// Grad-op description maker for margin_rank_loss.
//
// Produces one "margin_rank_loss_grad" op description. Its inputs are the
// forward output "Activated", the forward input "Label" and the gradient of
// "Out"; the forward inputs "X1" and "X2" are not passed in. Its outputs are
// the gradients of "X1" and "X2". The forward op's attributes are copied
// unchanged.
class MarginRankLossGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("margin_rank_loss_grad");
    grad_desc->SetAttrMap(Attrs());

    // Inputs consumed by the backward op.
    grad_desc->SetInput("Label", Input("Label"));
    grad_desc->SetInput("Activated", Output("Activated"));
    const auto out_grad_name = framework::GradVarName("Out");
    grad_desc->SetInput(out_grad_name, OutputGrad("Out"));

    // Gradients produced by the backward op.
    const auto x1_grad_name = framework::GradVarName("X1");
    grad_desc->SetOutput(x1_grad_name, InputGrad("X1"));
    const auto x2_grad_name = framework::GradVarName("X2");
    grad_desc->SetOutput(x2_grad_name, InputGrad("X2"));

    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp, REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp,
ops::MarginRankLossOpMaker<float>, ops::MarginRankLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>); ops::MarginRankLossGradDescMaker);
REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp); REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
margin_rank_loss, margin_rank_loss,
......
...@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/mean_op.h" #include "paddle/fluid/operators/mean_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -61,7 +64,8 @@ class MeanGradOp : public framework::OperatorWithKernel { ...@@ -61,7 +64,8 @@ class MeanGradOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
auto input_data_type = ctx.Input<Tensor>("X")->type(); auto input_data_type =
ctx.Input<Tensor>(framework::GradVarName("Out"))->type();
return framework::OpKernelType(input_data_type, ctx.GetPlace()); return framework::OpKernelType(input_data_type, ctx.GetPlace());
} }
}; };
...@@ -81,13 +85,16 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { ...@@ -81,13 +85,16 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
} }
}; };
// Declares MeanGradNoNeedBufferVarsInference for the input slot "X"; the class
// is attached to the mean_grad registration.
// NOTE(review): presumably this marks "X" as needed for metadata only (not its
// data buffer) in the grad op — confirm against the macro's definition.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MeanGradNoNeedBufferVarsInference, "X");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType, REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
ops::MeanGradMaker); ops::MeanGradMaker);
REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); REGISTER_OPERATOR(mean_grad, ops::MeanGradOp,
ops::MeanGradNoNeedBufferVarsInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>, mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
ops::MeanKernel<paddle::platform::CPUDeviceContext, double>); ops::MeanKernel<paddle::platform::CPUDeviceContext, double>);
......
...@@ -96,7 +96,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -96,7 +96,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
std::vector<int> src_tz = framework::vectorize2int(x->dims()); std::vector<int> src_tz = framework::vectorize2int(x->dims());
auto src_format = x->format(); auto src_format =
src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
const std::string key = gethash(src_tz, algorithm); const std::string key = gethash(src_tz, algorithm);
const std::string key_src_data = const std::string key_src_data =
...@@ -126,8 +127,10 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -126,8 +127,10 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
if (p_fwd == nullptr) { if (p_fwd == nullptr) {
// create mkldnn memory for input X // create mkldnn memory for input X
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), src_format);
auto src_memory = std::shared_ptr<memory>( auto src_memory = std::shared_ptr<memory>(
new memory(x->get_mkldnn_prim_desc(), to_void_cast(x_data))); new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
// save src_memory to be referred in backward path // save src_memory to be referred in backward path
dev_ctx.SetBlob(key_src_mem, src_memory); dev_ctx.SetBlob(key_src_mem, src_memory);
...@@ -174,7 +177,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -174,7 +177,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
pipeline.push_back(*p_fwd); pipeline.push_back(*p_fwd);
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); y->set_layout(DataLayout::kMKLDNN);
y->set_format(GetMKLDNNFormat(*dst_memory));
} }
template <typename T> template <typename T>
...@@ -192,6 +196,9 @@ void eltwise_grad(const framework::ExecutionContext &ctx, ...@@ -192,6 +196,9 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims()); std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());
auto diff_y_format =
diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
const std::string key = gethash(diff_dst_tz, algorithm); const std::string key = gethash(diff_dst_tz, algorithm);
const std::string key_src_data = const std::string key_src_data =
key + ctx.op().Input("Out") + "@eltwise_fwd_src_data"; key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
...@@ -203,8 +210,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, ...@@ -203,8 +210,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem"; key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
const std::string key_fwd_pd = const std::string key_fwd_pd =
key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd"; key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
const std::string key_with_layouts = key + std::to_string(*p_src_layout) + const std::string key_with_layouts =
"-" + std::to_string(diff_y->format()); key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
const std::string key_diff_src_mem = const std::string key_diff_src_mem =
key_with_layouts + "@eltwise_diff_src_mem"; key_with_layouts + "@eltwise_diff_src_mem";
const std::string key_diff_dst_mem = const std::string key_diff_dst_mem =
...@@ -227,8 +234,10 @@ void eltwise_grad(const framework::ExecutionContext &ctx, ...@@ -227,8 +234,10 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
if (p_grad == nullptr) { if (p_grad == nullptr) {
// create mkldnn memory for input diff_y // create mkldnn memory for input diff_y
auto diff_dst_md = platform::MKLDNNMemDesc(
diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
auto diff_dst_memory = std::shared_ptr<memory>( auto diff_dst_memory = std::shared_ptr<memory>(
new memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data))); new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory); dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
// retrieve eltwise primitive desc from device context // retrieve eltwise primitive desc from device context
...@@ -272,7 +281,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, ...@@ -272,7 +281,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
pipeline.push_back(*p_grad); pipeline.push_back(*p_grad);
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
} }
template <typename T, mkldnn::algorithm algorithm> template <typename T, mkldnn::algorithm algorithm>
......
...@@ -206,14 +206,17 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -206,14 +206,17 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor // create mkldnn memory from input x tensor
mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format());
// keys for backward pass // keys for backward pass
const std::string key = BatchNormMKLDNNHandler::GetHash( const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, global_stats, x->format(), src_tz, epsilon, flags, global_stats, input_format,
ctx.op().Output("SavedMean")); ctx.op().Output("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
auto user_src_md = x->get_mkldnn_prim_desc().desc(); auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input_format);
// create primitive descriptor for batch norm forward // create primitive descriptor for batch norm forward
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>; using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
...@@ -227,8 +230,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -227,8 +230,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine,
key); key);
auto src_memory = handler.AcquireSrcMemory(x->get_mkldnn_prim_desc(), auto src_memory =
to_void_cast(x_data)); handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data));
// crate mkldnn memory for weights(scale/shift) // crate mkldnn memory for weights(scale/shift)
auto scaleshift_memory = auto scaleshift_memory =
...@@ -262,7 +265,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -262,7 +265,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
variance_memory, false); variance_memory, false);
} }
y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); y->set_layout(DataLayout::kMKLDNN);
y->set_format(platform::GetMKLDNNFormat(*dst_memory));
std::vector<mkldnn::primitive> pipeline; std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*batch_norm_p); pipeline.push_back(*batch_norm_p);
...@@ -332,6 +336,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -332,6 +336,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>; using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
mkldnn::memory::format dst_format =
platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
mkldnn::memory::format input_format = mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format()); platform::MKLDNNFormatForSize(src_tz.size(), x->format());
...@@ -339,14 +346,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -339,14 +346,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// keys from forward pass // keys from forward pass
const std::string key = BatchNormMKLDNNHandler::GetHash( const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, false, x->format(), src_tz, epsilon, flags, false, input_format,
ctx.op().Input("SavedMean")); ctx.op().Input("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
// keys for primitives reuse // keys for primitives reuse
const std::string key_with_hash = const std::string key_with_hash =
key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false, key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false,
x->format()); input_format);
const std::string key_batch_norm_bwd_p = const std::string key_batch_norm_bwd_p =
key_with_hash + "@batch_norm_bwd_p"; key_with_hash + "@batch_norm_bwd_p";
const std::string key_batch_norm_src_mem_p = const std::string key_batch_norm_src_mem_p =
...@@ -366,8 +373,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -366,8 +373,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
primitive reorder_diff_dst; primitive reorder_diff_dst;
bool is_diff_dst_reordered = false; bool is_diff_dst_reordered = false;
auto user_diff_dst_memory = auto user_diff_dst_memory = memory(
memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data)); {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
to_void_cast(diff_y_data));
// MKLDNN requires a single piece of memory for scale and shift/bias data // MKLDNN requires a single piece of memory for scale and shift/bias data
const size_t scaleshift_size = 2 * ic; const size_t scaleshift_size = 2 * ic;
...@@ -451,7 +459,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -451,7 +459,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory); dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory);
// set layout/format of output tensors // set layout/format of output tensors
diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format);
} else { } else {
// primitives already exist // primitives already exist
UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data)); UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data));
...@@ -476,7 +487,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -476,7 +487,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
} }
// set layout/format of output tensors // set layout/format of output tensors
diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format);
} }
// execute optional reorder and batch_norm backward primitive // execute optional reorder and batch_norm backward primitive
......
...@@ -210,7 +210,8 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -210,7 +210,8 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
stream(stream::kind::eager).submit({*concat_p}).wait(); stream(stream::kind::eager).submit({*concat_p}).wait();
output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc()); output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetDstMemFormat(*concat_pd));
} }
}; };
} // namespace operators } // namespace operators
......
...@@ -96,8 +96,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -96,8 +96,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr; auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
auto* output = ctx.Output<Tensor>("Output"); auto* output = ctx.Output<Tensor>("Output");
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN); PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN); input->format() != memory::format::format_undef,
"Wrong layout/format set for Input tensor");
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != memory::format::format_undef,
"Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
...@@ -144,19 +148,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -144,19 +148,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<primitive> pipeline; std::vector<primitive> pipeline;
// For convolution with groups we need to recreate primitive descriptor auto src_format = input->format();
// as Paddle tensor is not having group dims while mkldnn treats mkldnn::memory::format weights_format =
// group as another dimensions GetWeightsFormat(filter->format(), g, is_conv3d);
mkldnn::memory::primitive_desc user_weights_mpd =
filter->get_mkldnn_prim_desc(); auto user_src_md = platform::MKLDNNMemDesc(
if (g > 1) { {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
mkldnn::memory::format weights_format = auto user_weights_md = platform::MKLDNNMemDesc(
GetWeightsFormat(filter->format(), g, is_conv3d); {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
user_weights_mpd =
mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine);
}
/* create memory descriptor for convolution without specified format /* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose * ('any') which lets a primitive (convolution in this case) choose
...@@ -166,7 +165,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -166,7 +165,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format = auto chosen_memory_format =
platform::data_format_to_memory_format(data_format); platform::data_format_to_memory_format(data_format);
mkldnn::memory::format weights_format = mkldnn::memory::format::any; weights_format = mkldnn::memory::format::any;
// Check the format for user's special output // Check the format for user's special output
if (chosen_memory_format != mkldnn::memory::format::any) { if (chosen_memory_format != mkldnn::memory::format::any) {
if (is_conv3d) { if (is_conv3d) {
...@@ -206,10 +205,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -206,10 +205,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
// create mkldnn memory from input tensors (data/weights) // create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p = handler.AcquireSrcMemory( auto user_src_memory_p =
input->get_mkldnn_prim_desc(), to_void_cast<T>(input_data)); handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler.AcquireWeightsMemory( auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_mpd, to_void_cast<T>(filter_data)); user_weights_md, to_void_cast<T>(filter_data));
// create reorder primitive if the input format is not the preferred one // create reorder primitive if the input format is not the preferred one
auto src_memory_p = auto src_memory_p =
...@@ -282,7 +281,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -282,7 +281,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_p); pipeline.push_back(*conv_p);
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p));
} }
void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
const bool is_test = ctx.Attr<bool>("is_test"); const bool is_test = ctx.Attr<bool>("is_test");
...@@ -948,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -948,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// push primitive to stream and wait until it's executed // push primitive to stream and wait until it's executed
pipeline.push_back(*conv_bwd_weights_p); pipeline.push_back(*conv_bwd_weights_p);
auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc(); filter_grad->set_layout(DataLayout::kMKLDNN);
filter_grad->set_mkldnn_prim_desc(filter_grad_mpd); filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
} }
if (input_grad) { if (input_grad) {
...@@ -972,7 +972,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -972,7 +972,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_bwd_data_p); pipeline.push_back(*conv_bwd_data_p);
input_grad->set_mkldnn_prim_desc(diff_src_memory_p->get_primitive_desc()); input_grad->set_layout(DataLayout::kMKLDNN);
input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
} }
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
} }
......
...@@ -221,7 +221,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -221,7 +221,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_p); pipeline.push_back(*conv_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); output->set_layout(DataLayout::kMKLDNN);
output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
} }
private: private:
......
...@@ -42,12 +42,8 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> { ...@@ -42,12 +42,8 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
// The format of output is set as the mkldnn's format // The format of output is set as the mkldnn's format
// TODO(@mozga-intel) The format of matrix sets inside the another layers. // TODO(@mozga-intel) The format of matrix sets inside the another layers.
// TODO(jczaja): Remove this hack after checking performance on block layout tensor->set_layout(DataLayout::kMKLDNN);
tensor->set_format(mkldnn::memory::format::oihw);
auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(tensor->dims()),
mkldnn::memory::format::oihw);
tensor->set_mkldnn_prim_desc(tensor_mem_pd);
} }
}; };
} // namespace operators } // namespace operators
......
...@@ -81,7 +81,10 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -81,7 +81,10 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto e_mid = framework::EigenTensor<T, 4>::From(*mid); auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
e_mid = e_mid.constant(k); e_mid = e_mid.constant(k);
auto src_md = x->get_mkldnn_prim_desc().desc(); auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, x->format());
auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
mkldnn::lrn_across_channels, mkldnn::lrn_across_channels,
...@@ -91,7 +94,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -91,7 +94,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
beta, beta,
k}; k};
auto src_memory_pd = x->get_mkldnn_prim_desc(); auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
if (!is_test) { if (!is_test) {
const std::string key = ctx.op().Output("Out"); const std::string key = ctx.op().Output("Out");
...@@ -108,15 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -108,15 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory->set_data_handle( src_memory->set_data_handle(
static_cast<void*>(const_cast<T*>(input_data))); static_cast<void*>(const_cast<T*>(input_data)));
auto dst_memory_pd = forward_pd->dst_primitive_desc(); auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(),
auto dst_memory = static_cast<void*>(output_data));
mkldnn::memory(dst_memory_pd, static_cast<void*>(output_data));
auto workspace_memory = insert_to_context<mkldnn::memory>( auto workspace_memory = insert_to_context<mkldnn::memory>(
key_workspace_memory, dev_ctx, key_workspace_memory, dev_ctx,
forward_pd->workspace_primitive_desc()); forward_pd->workspace_primitive_desc());
run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
out->set_mkldnn_prim_desc(dst_memory_pd);
out->set_layout(framework::DataLayout::kMKLDNN);
out->set_format(platform::GetMKLDNNFormat(dst_memory));
} else { } else {
auto forward_pd = auto forward_pd =
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
...@@ -124,12 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -124,12 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))}; src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
auto workspace_memory = auto workspace_memory =
mkldnn::memory{forward_pd.workspace_primitive_desc()}; mkldnn::memory{forward_pd.workspace_primitive_desc()};
auto dst_memory_pd = forward_pd.dst_primitive_desc();
auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(),
static_cast<void*>(output_data)); static_cast<void*>(output_data));
run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
out->set_mkldnn_prim_desc(dst_memory_pd);
out->set_layout(framework::DataLayout::kMKLDNN);
out->set_format(platform::GetMKLDNNFormat(dst_memory));
} }
} }
}; };
......
...@@ -158,14 +158,6 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> { ...@@ -158,14 +158,6 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
auto softmax_p = auto softmax_p =
handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p); handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p);
// We cannot use softmax_dst_memory_p to get prim desc as
// it contains flattened dims (2D) while output tensor can
// have 2,3,4+ dims
auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(output->dims()),
mkldnn::memory::format::blocked);
output->set_mkldnn_prim_desc(output_mem_pd);
std::vector<primitive> pipeline{ std::vector<primitive> pipeline{
*(static_cast<softmax_forward::primitive*>(softmax_p.get()))}; *(static_cast<softmax_forward::primitive*>(softmax_p.get()))};
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
......
...@@ -106,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -106,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
memory::desc(dst_tz, memory::data_type::f32, memory::format::any); memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd); auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
auto dst_mem_pd = sum_pd.dst_primitive_desc();
std::shared_ptr<memory> dst_mem; std::shared_ptr<memory> dst_mem;
if (in_place) { if (in_place) {
dst_mem.reset(new memory(dst_mem_pd)); dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
} else { } else {
dst_mem.reset(new memory(dst_mem_pd, output_data)); dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
} }
std::vector<mkldnn::primitive::at> inputs; std::vector<mkldnn::primitive::at> inputs;
for (size_t i = 0; i < srcs_mem.size(); ++i) { for (size_t i = 0; i < srcs_mem.size(); ++i) {
...@@ -136,7 +136,8 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -136,7 +136,8 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if (in_place) pipeline.push_back(reorder_prim); if (in_place) pipeline.push_back(reorder_prim);
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
output->set_mkldnn_prim_desc(dst_mem_pd); output->set_layout(DataLayout::kMKLDNN);
output->set_format(output_format);
} else { // Fallback to naive version } else { // Fallback to naive version
// TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support // TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support
SumKernel<CPUDeviceContext, T> reference_kernel; SumKernel<CPUDeviceContext, T> reference_kernel;
......
...@@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn_engine, key); mkldnn_engine, key);
auto transpose_src_memory_p = handler.AcquireSrcMemory( auto transpose_src_memory_p = handler.AcquireSrcMemory(
input->get_mkldnn_prim_desc(), platform::to_void_cast<T>(input_data)); input->format(), platform::to_void_cast<T>(input_data));
auto transpose_dst_memory_p = auto transpose_dst_memory_p =
handler.AcquireDstMemory(output, ctx.GetPlace()); handler.AcquireDstMemory(output, ctx.GetPlace());
auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
...@@ -62,14 +62,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -62,14 +62,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*transpose_p); pipeline.push_back(*transpose_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
// Transpose did change logical dimensions of Tensor, but reorder does not. output->set_layout(DataLayout::kNCHW);
// Reorder does change only physical layout eg. format , strides output->set_format(mkldnn::memory::format::format_undef);
// so we need to create new primitive descriptor with changed logical layout
// so it match output shape
auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(output->dims()),
mkldnn::memory::format::blocked);
output->set_mkldnn_prim_desc(output_mem_pd);
} }
}; };
...@@ -134,9 +128,8 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -134,9 +128,8 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx,
mkldnn_engine, key); mkldnn_engine, key);
auto transpose_src_memory_p = auto transpose_src_memory_p = handler.AcquireSrcMemory(
handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(), out_grad->format(), platform::to_void_cast<T>(out_grad_data));
platform::to_void_cast<T>(out_grad_data));
auto transpose_dst_memory_p = auto transpose_dst_memory_p =
handler.AcquireDstMemory(x_grad, ctx.GetPlace()); handler.AcquireDstMemory(x_grad, ctx.GetPlace());
auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
...@@ -145,15 +138,6 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -145,15 +138,6 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std::vector<mkldnn::primitive> pipeline; std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*transpose_p); pipeline.push_back(*transpose_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
// Transpose did change logical dimensions of Tensor, but reorder does not.
// Reorder does change only physical layout eg. format , strides
// so we need to create new primitive descriptor with changed logical layout
// so it match output shape
auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(x_grad->dims()),
mkldnn::memory::format::blocked);
x_grad->set_mkldnn_prim_desc(x_grad_mem_pd);
} }
}; };
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/multiplex_op.h" #include "paddle/fluid/operators/multiplex_op.h"
#include <memory>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -111,28 +113,47 @@ class MultiplexGradOp : public framework::OperatorWithKernel { ...@@ -111,28 +113,47 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null."); auto& dxs = ctx->Outputs(framework::GradVarName("X"));
PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(), PADDLE_ENFORCE(!dxs.empty(), "Output(X@Grad) should not be null.");
"Output(X@Grad) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null."); "Input(Out@GRAD) should not be null.");
ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out"));
ctx->SetOutputsDim(framework::GradVarName("X"),
std::vector<framework::DDim>(dxs.size(), dout_dim));
} }
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.MultiInput<Tensor>("X")[0]->type(), return framework::OpKernelType(
ctx.device_context()); ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
ctx.device_context());
}
};
class MultiplexGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("multiplex_grad");
op->SetInput("Ids", Input("Ids"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
op->SetAttrMap(Attrs());
return op;
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
paddle::framework::DefaultGradOpDescMaker<false>); ops::MultiplexGradDescMaker);
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
multiplex, multiplex,
......
...@@ -53,20 +53,25 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> { ...@@ -53,20 +53,25 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto ins = ctx.MultiInput<Tensor>("X");
auto* ids = ctx.Input<Tensor>("Ids"); auto* ids = ctx.Input<Tensor>("Ids");
auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X")); auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) { for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) { if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace()); d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]); auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<Place>().eigen_device()) = t.device(*ctx.template device_context<Place>().eigen_device()) =
t.constant(static_cast<T>(0)); t.constant(static_cast<T>(0));
idx = i;
} }
} }
auto rows = ins[0]->dims()[0]; if (idx == -1UL) return;
auto cols = ins[0]->numel() / rows;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
// copy index to cpu // copy index to cpu
Tensor index_t_cpu; Tensor index_t_cpu;
TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
......
...@@ -52,20 +52,25 @@ class MultiplexGradCPUKernel : public framework::OpKernel<T> { ...@@ -52,20 +52,25 @@ class MultiplexGradCPUKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out")); auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* ids = ctx.Input<framework::Tensor>("Ids"); auto* ids = ctx.Input<framework::Tensor>("Ids");
auto ins = ctx.MultiInput<framework::Tensor>("X");
auto d_ins = auto d_ins =
ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X")); ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) { for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) { if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace()); d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]); auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<DeviceContext>().eigen_device()) = t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
t.constant(static_cast<T>(0)); t.constant(static_cast<T>(0));
idx = i;
} }
} }
auto rows = ins[0]->dims()[0]; if (idx == -1UL) return;
auto cols = ins[0]->numel() / rows;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
auto* index = ids->data<int32_t>(); auto* index = ids->data<int32_t>();
platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace()); platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
for (auto i = 0; i < rows; i++) { for (auto i = 0; i < rows; i++) {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/pad_op.h" #include "paddle/fluid/operators/pad_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -29,7 +30,7 @@ class PadOp : public framework::OperatorWithKernel { ...@@ -29,7 +30,7 @@ class PadOp : public framework::OperatorWithKernel {
"Output(Out) of PadOp should not be null."); "Output(Out) of PadOp should not be null.");
auto x_dim = ctx->GetInputDim("X"); auto x_dim = ctx->GetInputDim("X");
auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()), PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
"Size of paddings should be equal to 2 * dimension size " "Size of paddings should be equal to 2 * dimension size "
"of input tensor."); "of input tensor.");
...@@ -99,13 +100,20 @@ class PadOpGrad : public framework::OperatorWithKernel { ...@@ -99,13 +100,20 @@ class PadOpGrad : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
"Input(Out@GRAD) should not be null"); for (int i = 0; i < dout_dims.size(); ++i) {
auto x_dims = ctx->GetInputDim("X"); dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
}
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) { if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
for (int i = 0; i < dout_dims.size(); ++i) {
dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
}
ctx->SetOutputDim(x_grad_name, dout_dims);
} }
} }
}; };
...@@ -117,7 +125,6 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker { ...@@ -117,7 +125,6 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker {
protected: protected:
std::unique_ptr<framework::OpDesc> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto* bind = new framework::OpDesc(); auto* bind = new framework::OpDesc();
bind->SetInput("X", Input("X"));
bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
bind->SetAttrMap(Attrs()); bind->SetAttrMap(Attrs());
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/psroi_pool_op.h" #include "paddle/fluid/operators/psroi_pool_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -154,12 +155,29 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { ...@@ -154,12 +155,29 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel {
} }
}; };
// Grad-op description maker for psroi_pool. Apply() emits one
// "psroi_pool_grad" op that takes the forward inputs X and ROIs plus the
// gradient of Out, and writes the gradient of X. The attribute map returned
// by Attrs() is copied onto the grad op.
class PSROIPoolGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    const auto out_grad_name = framework::GradVarName("Out");
    const auto x_grad_name = framework::GradVarName("X");

    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("psroi_pool_grad");
    // Inputs: forward X and ROIs, then the incoming gradient.
    grad_desc->SetInput("X", Input("X"));
    grad_desc->SetInput("ROIs", Input("ROIs"));
    grad_desc->SetInput(out_grad_name, OutputGrad("Out"));
    // Output: gradient with respect to X only.
    grad_desc->SetOutput(x_grad_name, InputGrad("X"));
    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::PSROIPoolGradDescMaker);
REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
psroi_pool, psroi_pool,
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/rank_loss_op.h" #include "paddle/fluid/operators/rank_loss_op.h"
#include <memory>
#include <string> #include <string>
namespace paddle { namespace paddle {
...@@ -116,6 +117,25 @@ class RankLossGradOp : public framework::OperatorWithKernel { ...@@ -116,6 +117,25 @@ class RankLossGradOp : public framework::OperatorWithKernel {
} }
}; };
// Grad-op description maker for rank_loss. Apply() emits one
// "rank_loss_grad" op that takes the forward inputs Label, Left and Right
// plus the gradient of Out, and writes the gradients of Left and Right.
// The attribute map returned by Attrs() is copied onto the grad op.
class RankLossGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    const auto out_grad_name = framework::GradVarName("Out");
    const auto left_grad_name = framework::GradVarName("Left");
    const auto right_grad_name = framework::GradVarName("Right");

    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("rank_loss_grad");
    // Inputs: the three forward inputs, then the incoming gradient.
    grad_desc->SetInput("Label", Input("Label"));
    grad_desc->SetInput("Left", Input("Left"));
    grad_desc->SetInput("Right", Input("Right"));
    grad_desc->SetInput(out_grad_name, OutputGrad("Out"));
    // Outputs: gradients for Left and Right (none is emitted for Label).
    grad_desc->SetOutput(left_grad_name, InputGrad("Left"));
    grad_desc->SetOutput(right_grad_name, InputGrad("Right"));
    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/roi_align_op.h" #include "paddle/fluid/operators/roi_align_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -147,12 +148,29 @@ Thus avoid the misaligned problem. ...@@ -147,12 +148,29 @@ Thus avoid the misaligned problem.
} }
}; };
// Grad-op description maker for roi_align. Apply() emits one
// "roi_align_grad" op that takes the forward inputs X and ROIs plus the
// gradient of Out, and writes the gradient of X. The attribute map returned
// by Attrs() is copied onto the grad op.
class ROIAlignGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    const auto out_grad_name = framework::GradVarName("Out");
    const auto x_grad_name = framework::GradVarName("X");

    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("roi_align_grad");
    // Inputs: forward X and ROIs, then the incoming gradient.
    grad_desc->SetInput("X", Input("X"));
    grad_desc->SetInput("ROIs", Input("ROIs"));
    grad_desc->SetInput(out_grad_name, OutputGrad("Out"));
    // Output: gradient with respect to X only.
    grad_desc->SetOutput(x_grad_name, InputGrad("X"));
    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::ROIAlignGradDescMaker);
REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
roi_align, roi_align,
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/roi_pool_op.h" #include "paddle/fluid/operators/roi_pool_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -158,12 +159,30 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn ...@@ -158,12 +159,30 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
} }
}; };
// Grad-op description maker for roi_pool. Apply() emits one
// "roi_pool_grad" op that takes the forward inputs X and ROIs, the forward
// output Argmax, and the gradient of Out; it writes the gradient of X.
// The attribute map returned by Attrs() is copied onto the grad op.
class ROIPoolGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    const auto out_grad_name = framework::GradVarName("Out");
    const auto x_grad_name = framework::GradVarName("X");

    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("roi_pool_grad");
    // Inputs taken from the forward op's inputs.
    grad_desc->SetInput("X", Input("X"));
    grad_desc->SetInput("ROIs", Input("ROIs"));
    // Argmax is a forward *output* that is fed back in as a grad-op input.
    grad_desc->SetInput("Argmax", Output("Argmax"));
    grad_desc->SetInput(out_grad_name, OutputGrad("Out"));
    // Output: gradient with respect to X only.
    grad_desc->SetOutput(x_grad_name, InputGrad("X"));
    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::ROIPoolGradDescMaker);
REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
roi_pool, roi_pool,
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/scatter_op.h" #include "paddle/fluid/operators/scatter_op.h"
#include <memory>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
namespace paddle { namespace paddle {
...@@ -63,14 +64,16 @@ class ScatterGradOp : public framework::OperatorWithKernel { ...@@ -63,14 +64,16 @@ class ScatterGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("Updates"), ctx->SetOutputDim(framework::GradVarName("Updates"),
ctx->GetInputDim("Updates")); ctx->GetInputDim("Updates"));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"),
ctx->GetInputDim(framework::GradVarName("Out")));
} }
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), return framework::OpKernelType(
ctx.device_context()); ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
ctx.device_context());
} }
}; };
...@@ -95,12 +98,34 @@ $$ ...@@ -95,12 +98,34 @@ $$
} }
}; };
// Grad-op description maker for scatter. Apply() emits one "scatter_grad"
// op that takes the forward inputs Ids and Updates plus the gradient of Out,
// and writes the gradients of X and Updates. The forward input X itself is
// not passed to the grad op. The attribute map returned by Attrs() is copied
// onto the grad op.
class ScatterGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    const auto out_grad_name = framework::GradVarName("Out");
    const auto x_grad_name = framework::GradVarName("X");
    const auto updates_grad_name = framework::GradVarName("Updates");

    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("scatter_grad");
    // Inputs: forward Ids and Updates, then the incoming gradient.
    grad_desc->SetInput("Ids", Input("Ids"));
    grad_desc->SetInput("Updates", Input("Updates"));
    grad_desc->SetInput(out_grad_name, OutputGrad("Out"));
    // Outputs: gradients for X and for Updates.
    grad_desc->SetOutput(x_grad_name, InputGrad("X"));
    grad_desc->SetOutput(updates_grad_name, InputGrad("Updates"));
    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ScatterGradNoNeedBufferVarsInference,
"Updates");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::ScatterGradDescMaker);
REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp); REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp,
ops::ScatterGradNoNeedBufferVarsInference);
REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>); REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>); REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h" #include "paddle/fluid/operators/shuffle_channel_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -91,13 +92,28 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { ...@@ -91,13 +92,28 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel {
} }
}; };
// Grad-op description maker for shuffle_channel. Apply() emits one
// "shuffle_channel_grad" op that takes the forward input X plus the gradient
// of Out, and writes the gradient of X. The attribute map returned by
// Attrs() is copied onto the grad op.
class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    const auto out_grad_name = framework::GradVarName("Out");
    const auto x_grad_name = framework::GradVarName("X");

    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("shuffle_channel_grad");
    // Inputs: forward X, then the incoming gradient.
    grad_desc->SetInput("X", Input("X"));
    grad_desc->SetInput(out_grad_name, OutputGrad("Out"));
    // Output: gradient with respect to X.
    grad_desc->SetOutput(x_grad_name, InputGrad("X"));
    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp,
ops::ShuffleChannelOpMaker, ops::ShuffleChannelOpMaker, ops::ShuffleChannelGradDescMaker);
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_layout_transform.h"
...@@ -39,45 +40,6 @@ class MKLDNNHandler { ...@@ -39,45 +40,6 @@ class MKLDNNHandler {
return this->AcquireMemory(md, ptr, "@user_src_mem_p"); return this->AcquireMemory(md, ptr, "@user_src_mem_p");
} }
// TODO(jczaja): extract common part and make AcquireMemory
// Returns the memory primitive stored in the device context under
// key_ + "@user_src_mem_p". If no blob is stored yet, a new mkldnn::memory is
// built from `mpd` and `ptr` and stored under that key; otherwise the stored
// primitive is pointed at `ptr` and is_reusing_ is set.
std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
    const mkldnn::memory::primitive_desc& mpd, void* ptr) {
  const auto blob_key = key_ + "@user_src_mem_p";
  auto cached = std::static_pointer_cast<mkldnn::memory>(
      dev_ctx_.GetBlob(blob_key));
  // A missing blob is only acceptable while nothing has been reused yet.
  PADDLE_ENFORCE((cached != nullptr) || (is_reusing_ == false),
                 " find mem primitive in device context");
  if (cached != nullptr) {
    cached->set_data_handle(ptr);
    // Mark that reusing happened. All primitives from operator instance
    // should be reused or none of them. So we check consistency
    is_reusing_ = true;
    return cached;
  }
  cached = std::make_shared<mkldnn::memory>(mpd, ptr);
  dev_ctx_.SetBlob(blob_key, cached);
  return cached;
}
// Returns the memory primitive stored in the device context under
// key_ + "@user_weights_mem_p". If no blob is stored yet, a new
// mkldnn::memory is built from `mpd` and `ptr` and stored under that key;
// otherwise the stored primitive is pointed at `ptr` and is_reusing_ is set.
std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
    const mkldnn::memory::primitive_desc& mpd, void* ptr) {
  const auto blob_key = key_ + "@user_weights_mem_p";
  auto cached = std::static_pointer_cast<mkldnn::memory>(
      dev_ctx_.GetBlob(blob_key));
  // A missing blob is only acceptable while nothing has been reused yet.
  PADDLE_ENFORCE((cached != nullptr) || (is_reusing_ == false),
                 " find mem primitive in device context");
  if (cached != nullptr) {
    cached->set_data_handle(ptr);
    // Mark that reusing happened. All primitives from operator instance
    // should be reused or none of them. So we check consistency
    is_reusing_ = true;
    return cached;
  }
  cached = std::make_shared<mkldnn::memory>(mpd, ptr);
  dev_ctx_.SetBlob(blob_key, cached);
  return cached;
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemory( std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
const mkldnn::memory::desc& md, void* ptr, const mkldnn::memory::desc& md, void* ptr,
user_function custom_func = {}) { user_function custom_func = {}) {
...@@ -315,7 +277,37 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { ...@@ -315,7 +277,37 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
mkldnn::engine engine, const std::string& base_key) mkldnn::engine engine, const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key), : platform::MKLDNNHandler(dev_ctx, engine, base_key),
dims_(dims), dims_(dims),
axis_(axis) {} axis_(axis),
logical_axis_(dims.size(), 0) {}
// Returns the source memory primitive stored in the device context under
// key_ + "@user_src_mem_p". If no blob is stored yet, a memory descriptor is
// built for dims_ and a new primitive wrapping `ptr` is stored; otherwise
// the stored primitive is pointed at `ptr` and is_reusing_ is set.
std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
    const mkldnn::memory::format& fmt, void* ptr) {
  const auto blob_key = key_ + "@user_src_mem_p";
  auto cached = std::static_pointer_cast<mkldnn::memory>(
      dev_ctx_.GetBlob(blob_key));
  // A missing blob is only acceptable while nothing has been reused yet.
  PADDLE_ENFORCE((cached != nullptr) || (is_reusing_ == false),
                 " find mem primitive in device context");
  if (cached != nullptr) {
    cached->set_data_handle(ptr);
    // Mark that reusing happened. All primitives from operator instance
    // should be reused or none of them. So we check consistency
    is_reusing_ = true;
    return cached;
  }

  // Make memory descriptor using input format, unless it
  // cannot be trusted (nchw) then make up memory fmt manually
  for (size_t pos = 0; pos < logical_axis_.size(); ++pos) {
    logical_axis_[pos] = pos;  // identity permutation: 0, 1, 2, ...
  }
  auto src_md = fmt != mkldnn::memory::format::nchw
                    ? platform::MKLDNNMemDesc(
                          dims_, platform::MKLDNNGetDataType<float>(), fmt)
                    : Axis2MemoryDesc(dims_, logical_axis_);
  cached = std::make_shared<mkldnn::memory>(
      mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
  dev_ctx_.SetBlob(blob_key, cached);
  return cached;
}
std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output, std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output,
platform::Place place) { platform::Place place) {
...@@ -400,6 +392,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { ...@@ -400,6 +392,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
private: private:
std::vector<int> dims_; std::vector<int> dims_;
std::vector<int> axis_; std::vector<int> axis_;
std::vector<int> logical_axis_;
}; };
template <class forward_t, class backward_data_t, class backward_weights_t> template <class forward_t, class backward_data_t, class backward_weights_t>
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkldnn.h>
#include <string>
namespace paddle {
namespace platform {
// Builds an MKL-DNN memory primitive descriptor for a tensor with logical
// dimensions `ltz` by filling a C-level mkldnn_memory_desc_t by hand.
//
//   ltz        logical dimensions of the tensor
//   fmt        value written to the descriptor's `format` field
//   data_type  element type written to `data_type` (f32 by default)
//
// Strides are filled from the last dimension backwards, so the last
// dimension gets stride 1 and every earlier one the product of the sizes
// after it. Block sizes are 1 and all padding offsets are 0.
//
// The descriptor is bound to the engine of the device context that the
// global DeviceContextPool returns for CPUPlace. That context must be an
// MKLDNNDeviceContext; otherwise the enforce below fails instead of
// dereferencing a null pointer.
inline mkldnn::memory::primitive_desc create_prim_desc_from_dims(
    const std::vector<int>& ltz, mkldnn::memory::format fmt,
    mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) {
  mkldnn_memory_desc_t mem_fmt;

  mem_fmt.primitive_kind = mkldnn_memory;
  mem_fmt.ndims = ltz.size();
  for (unsigned int i = 0; i < ltz.size(); ++i) {
    mem_fmt.dims[i] = ltz[i];  // logical dimensions (nchw format,
                               // regardless physical layout)
  }
  mem_fmt.data_type = static_cast<mkldnn_data_type_t>(data_type);
  mem_fmt.format = static_cast<mkldnn_memory_format_t>(fmt);

  unsigned int total_stride = 1;
  for (int i = ltz.size() - 1; i >= 0; --i) {
    mem_fmt.layout_desc.blocking.padding_dims[i] =
        ltz[i];  // logical dimensions (nchw format, regardless physical
                 // layout)
    mem_fmt.layout_desc.blocking.block_dims[i] = 1;
    mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
    mem_fmt.layout_desc.blocking.strides[0][i] = total_stride;
    mem_fmt.layout_desc.blocking.strides[1][i] = 1;
    total_stride *= ltz[i];
  }
  mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset

  auto& pool = platform::DeviceContextPool::Instance();
  auto place = paddle::platform::CPUPlace();
  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
  // dynamic_cast yields nullptr when the pooled context is not an
  // MKLDNNDeviceContext; fail loudly, as create_prim_desc_from_format does.
  PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device");
  auto& cpu_engine = dev_ctx->GetEngine();
  return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine);
}
// Builds an MKL-DNN memory primitive descriptor from the dimensions `ltz`,
// the element type `data_type` and the memory format `format`, using the
// mkldnn::memory::desc constructor rather than a hand-filled C struct.
//
// The descriptor is bound to the engine of the device context that the
// global DeviceContextPool returns for CPUPlace; an enforce fails if that
// context is not an MKLDNNDeviceContext.
inline mkldnn::memory::primitive_desc create_prim_desc_from_format(
    const std::vector<int>& ltz, const mkldnn::memory::format format,
    const mkldnn::memory::data_type data_type) {
  mkldnn::memory::desc mem_desc({ltz}, data_type, format);

  auto& ctx_pool = platform::DeviceContextPool::Instance();
  auto* mkldnn_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
      ctx_pool.Get(paddle::platform::CPUPlace()));
  PADDLE_ENFORCE_NOT_NULL(mkldnn_ctx, "Could not get valid device");

  return mkldnn::memory::primitive_desc(mem_desc, mkldnn_ctx->GetEngine());
}
} // namespace platform
} // namespace paddle
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/platform/temporary_allocator.h" #include "paddle/fluid/platform/temporary_allocator.h"
#include <memory> #include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
DEFINE_int64(limit_of_tmp_allocation, -1, DEFINE_int64(limit_of_tmp_allocation, -1,
...@@ -30,31 +31,38 @@ namespace paddle { ...@@ -30,31 +31,38 @@ namespace paddle {
namespace platform { namespace platform {
namespace alloc = memory::allocation; namespace alloc = memory::allocation;
// Wraps `underlying_allocation`: the Allocation base is initialized with the
// wrapped allocation's ptr(), size() and place(), and the pointer itself is
// then moved into underlying_allocation_.
// The base-class initializer has to read through `underlying_allocation`
// before the member initializer moves from it; base classes are initialized
// before members, so the order written here is the order executed.
TemporaryAllocation::TemporaryAllocation(
alloc::AllocationPtr &&underlying_allocation)
: Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)) {}
TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>()); temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
} }
bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
void TemporaryAllocator::Release(const std::function<void()> &callback) { void TemporaryAllocator::Release(const std::function<void()> &callback) {
std::unique_ptr<std::multimap<size_t, alloc::Allocation *>> t_allocations; std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> t_allocations;
{ {
std::unique_lock<std::mutex> lock(mtx_); std::unique_lock<std::mutex> lock(mtx_);
callback(); callback();
t_allocations.swap(temp_mem_map_); t_allocations.swap(temp_mem_map_);
temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>()); temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
wait_delete_mem_ = 0; wait_delete_mem_ = 0;
} }
alloc::AllocationDeleter deleter;
for (auto tmp : *t_allocations) { for (auto tmp : *t_allocations) {
VLOG(10) << "Delete temporary allocation " << tmp.second->ptr() VLOG(10) << "Delete temporary allocation " << tmp.second->ptr()
<< " size: " << tmp.second->size(); << " size: " << tmp.second->size();
deleter(tmp.second); delete tmp.second;
} }
} }
void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { void TemporaryAllocator::Free(alloc::Allocation *allocation) {
auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
PADDLE_ENFORCE_NOT_NULL(temp_allocation);
if (platform::is_gpu_place(temp_allocation->place())) { if (platform::is_gpu_place(temp_allocation->place())) {
PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_),
"The place should be the same."); "The place should be the same.");
...@@ -78,7 +86,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { ...@@ -78,7 +86,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
} }
VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
<< " size: " << temp_allocation->size(); << " size: " << temp_allocation->size();
alloc::AllocationDeleter()(temp_allocation); delete temp_allocation;
} }
size_t TemporaryAllocator::TemporaryAllocationQueueSize() { size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
...@@ -113,9 +121,11 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl( ...@@ -113,9 +121,11 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl(
} }
// If not find the the available allocation, get allocation from // If not find the the available allocation, get allocation from
// AllocatorFacadeInstance. // AllocatorFacadeInstance.
auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); auto raw_allocation =
alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size; VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size;
return temp_mem.release(); return temp_mem;
} }
} // namespace platform } // namespace platform
......
...@@ -23,6 +23,14 @@ ...@@ -23,6 +23,14 @@
namespace paddle { namespace paddle {
namespace platform { namespace platform {
// An Allocation that is constructed from, and carries, another allocation
// held as a memory::allocation::AllocationPtr.
class TemporaryAllocation : public memory::allocation::Allocation {
public:
// Takes the wrapped allocation by rvalue reference; explicit, so an
// AllocationPtr is never converted implicitly.
explicit TemporaryAllocation(
memory::allocation::AllocationPtr &&underlying_allocation);
// The wrapped allocation. Public data member.
// NOTE(review): presumably this keeps the wrapped memory alive for the
// lifetime of this object - confirm against the constructor definition.
memory::allocation::AllocationPtr underlying_allocation_;
};
/*! \brief the TemporaryAllocator is used to alloc the temporary allocation /*! \brief the TemporaryAllocator is used to alloc the temporary allocation
* which used by CUDA's async operation. * which used by CUDA's async operation.
* *
...@@ -49,7 +57,7 @@ class TemporaryAllocator : public memory::allocation::Allocator { ...@@ -49,7 +57,7 @@ class TemporaryAllocator : public memory::allocation::Allocator {
void SetCallback(const std::function<void()> &callback); void SetCallback(const std::function<void()> &callback);
protected: protected:
void FreeImpl(memory::allocation::Allocation *allocation) override; void Free(memory::allocation::Allocation *allocation) override;
memory::allocation::Allocation *AllocateImpl( memory::allocation::Allocation *AllocateImpl(
size_t size, memory::allocation::Allocator::Attr attr) override; size_t size, memory::allocation::Allocator::Attr attr) override;
...@@ -58,8 +66,8 @@ class TemporaryAllocator : public memory::allocation::Allocator { ...@@ -58,8 +66,8 @@ class TemporaryAllocator : public memory::allocation::Allocator {
platform::Place place_; platform::Place place_;
// When the allocation is not held by any variable, it should be placed // When the allocation is not held by any variable, it should be placed
// to temp_mem_map immediately. // to temp_mem_map immediately.
std::unique_ptr<std::multimap<size_t, memory::allocation::Allocation *>> std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{
temp_mem_map_{nullptr}; nullptr};
std::mutex mtx_; std::mutex mtx_;
size_t wait_delete_mem_{0}; size_t wait_delete_mem_{0};
std::function<void()> callback_; std::function<void()> callback_;
......
...@@ -324,7 +324,6 @@ PYBIND11_MODULE(core, m) { ...@@ -324,7 +324,6 @@ PYBIND11_MODULE(core, m) {
[](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<float>(place); self.mutable_data<float>(place);
}) })
.def("_clear", &Tensor::clear)
.def("set", PyCPUTensorSetFromArray<float>) .def("set", PyCPUTensorSetFromArray<float>)
.def("set", PyCPUTensorSetFromArray<int>) .def("set", PyCPUTensorSetFromArray<int>)
.def("set", PyCPUTensorSetFromArray<double>) .def("set", PyCPUTensorSetFromArray<double>)
...@@ -1283,6 +1282,15 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1283,6 +1282,15 @@ All parameter, weight, gradient are variables in Paddle.
it will save GPU memory and may make the execution faster. it will save GPU memory and may make the execution faster.
This options is only available in GPU devices. This options is only available in GPU devices.
Default False)DOC") Default False)DOC")
.def_property("fuse_all_optimizer_ops",
[](const BuildStrategy &self) {
return self.fuse_all_optimizer_ops_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE(!self.IsFinalized(),
"BuildStrategy is finlaized.");
self.fuse_all_optimizer_ops_ = b;
})
.def_property( .def_property(
"sync_batch_norm", "sync_batch_norm",
[](const BuildStrategy &self) { return self.sync_batch_norm_; }, [](const BuildStrategy &self) { return self.sync_batch_norm_; },
......
...@@ -105,12 +105,14 @@ void Printf(const char* fmt, const Args&... args) { ...@@ -105,12 +105,14 @@ void Printf(const char* fmt, const Args&... args) {
Fprintf(std::cout, fmt, args...); Fprintf(std::cout, fmt, args...);
} }
inline std::string HumanReadableSize(double f_size) { template <typename T>
std::string HumanReadableSize(T size) {
size_t i = 0; size_t i = 0;
double f_size = static_cast<double>(size);
double orig = f_size; double orig = f_size;
const std::vector<std::string> units( const std::vector<std::string> units(
{"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
while (f_size >= 1024) { while (f_size > 1024) {
f_size /= 1024; f_size /= 1024;
i++; i++;
} }
......
...@@ -34,7 +34,7 @@ from . import io ...@@ -34,7 +34,7 @@ from . import io
from . import evaluator from . import evaluator
from . import initializer from . import initializer
from . import layers from . import layers
from . import imperative from . import dygraph
from . import contrib from . import contrib
from . import nets from . import nets
from . import optimizer from . import optimizer
...@@ -71,7 +71,7 @@ __all__ = framework.__all__ + executor.__all__ + \ ...@@ -71,7 +71,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'initializer', 'initializer',
'layers', 'layers',
'contrib', 'contrib',
'imperative', 'dygraph',
'transpiler', 'transpiler',
'nets', 'nets',
'optimizer', 'optimizer',
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Example:
>>from paddle.fluid.contrib.model_stat import summary
>>main_program = ...
>>summary(main_program)
+-----+------------+----------------+----------------+---------+------------+
| No. | TYPE | INPUT | OUTPUT | PARAMs | FLOPs |
+-----+------------+----------------+----------------+---------+------------+
| 0 | conv2d | (3, 200, 200) | (64, 100, 100) | 9408 | 188160000 |
| 1 | batch_norm | (64, 100, 100) | (64, 100, 100) | 256 | 640000 |
| 2 | relu | (64, 100, 100) | (64, 100, 100) | 0 | 640000 |
| 3 | pool2d | (64, 100, 100) | (64, 50, 50) | 0 | 1440000 |
...
| 176 | conv2d | (512, 7, 7) | (512, 7, 7) | 2359296 | 231211008 |
| 177 | relu | (512, 7, 7) | (512, 7, 7) | 0 | 25088 |
| 178 | conv2d | (512, 7, 7) | (2048, 7, 7) | 1048576 | 102760448 |
| 179 | relu | (2048, 7, 7) | (2048, 7, 7) | 0 | 100352 |
| 180 | pool2d | (2048, 7, 7) | (2048, 1, 1) | 0 | 100352 |
+-----+------------+----------------+----------------+---------+------------+
Total PARAMs: 48017344(0.0480G)
Total FLOPs: 11692747751(11.69G)
'''
from collections import OrderedDict
from prettytable import PrettyTable
def summary(main_prog):
    '''
    Collect per-operator PARAMs and FLOPs for a program and print them.

    Every operator in every block of ``main_prog`` is passed to
    ``_summary_model`` together with that block's variables. Operators for
    which it returns None are left out of the summary. The collected rows
    are formatted by ``_format_summary`` and handed to ``_print_summary``.

    Args:
        main_prog: main program

    Returns:
        None. The summary is printed rather than returned.
    '''
    collected_ops_list = []
    for block in main_prog.blocks:
        var_table = block.vars
        for op in block.ops:
            stats = _summary_model(var_table, op)
            if stats is None:
                continue
            # TODO: get the operator name
            # stats is indexed as (input shape, output shape, PARAMs, FLOPs);
            # the first entry of each shape is dropped (presumably the batch
            # dimension -- confirm against _summary_model).
            collected_ops_list.append(
                OrderedDict([
                    ('type', op.type),
                    ('input_shape', stats[0][1:]),
                    ('out_shape', stats[1][1:]),
                    ('PARAMs', stats[2]),
                    ('FLOPs', stats[3]),
                ]))

    summary_table, total = _format_summary(collected_ops_list)
    _print_summary(summary_table, total)
def _summary_model(block_vars, one_op):
'''
Compute operator's params and flops.
Args:
block_vars: all vars of one block
one_op: one operator to count
Returns:
in_data_shape: one operator's input data shape
out_data_shape: one operator's output data shape
params: one operator's PARAMs
flops: : one operator's FLOPs
'''
if one_op.type in ['conv2d', 'depthwise_conv2d']:
k_arg_shape = block_vars[one_op.input("Filter")[0]].shape
in_data_shape = block_vars[one_op.input("Input")[0]].shape
out_data_shape = block_vars[one_op.output("Output")[0]].shape
c_out, c_in, k_h, k_w = k_arg_shape
_, c_out_, h_out, w_out = out_data_shape
assert c_out == c_out_, 'shape error!'
k_groups = one_op.attr("groups")
kernel_ops = k_h * k_w * (c_in / k_groups)
bias_ops = 0 if one_op.input("Bias") == [] else 1
params = c_out * (kernel_ops + bias_ops)
flops = h_out * w_out * c_out * (kernel_ops + bias_ops)
# base nvidia paper, include mul and add
flops = 2 * flops
elif one_op.type == 'pool2d':
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Out")[0]].shape
_, c_out, h_out, w_out = out_data_shape
k_size = one_op.attr("ksize")
params = 0
flops = h_out * w_out * c_out * (k_size[0] * k_size[1])
elif one_op.type == 'mul':
k_arg_shape = block_vars[one_op.input("Y")[0]].shape
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Out")[0]].shape
# TODO: fc has mul ops
# add attr to mul op, tell us whether it belongs to 'fc'
# this's not the best way
if 'fc' not in one_op.output("Out")[0]:
return None
k_in, k_out = k_arg_shape
# bias in sum op
params = k_in * k_out + 1
flops = k_in * k_out
elif one_op.type in ['sigmoid', 'tanh', 'relu', 'leaky_relu', 'prelu']:
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Out")[0]].shape
params = 0
if one_op.type == 'prelu':
params = 1
flops = 1
for one_dim in in_data_shape:
flops *= one_dim
elif one_op.type == 'batch_norm':
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Y")[0]].shape
_, c_in, h_out, w_out = in_data_shape
# gamma, beta
params = c_in * 2
# compute mean and std
flops = h_out * w_out * c_in * 2
else:
return None
return in_data_shape, out_data_shape, params, flops
def _format_summary(collected_ops_list):
    '''
    Build the printable report table and gather the per-operator counts.

    Args:
        collected_ops_list: the collected operators; each entry provides
            'type', 'input_shape', 'out_shape', 'PARAMs' and 'FLOPs'
    Returns:
        summary_table: a PrettyTable with one right-aligned row per operator
        total: dict with keys 'params' and 'flops', each holding the list
            of per-operator integer counts (not summed here)
    '''
    header = ["No.", "TYPE", "INPUT", "OUTPUT", "PARAMs", "FLOPs"]
    summary_table = PrettyTable(header)
    summary_table.align = 'r'

    param_counts = []
    flop_counts = []
    for row_no, op_info in enumerate(collected_ops_list):
        # cells must follow the column order of the header
        row = [
            row_no,
            op_info['type'],
            op_info['input_shape'],
            op_info['out_shape'],
        ]
        row.append(int(op_info['PARAMs']))
        row.append(int(op_info['FLOPs']))
        summary_table.add_row(row)
        param_counts.append(row[4])
        flop_counts.append(row[5])

    return summary_table, {'params': param_counts, 'flops': flop_counts}
def _print_summary(summary_table, total):
'''
Print all the summary on terminal.
Args:
summary_table: summary report format
total: sum param and flops
'''
parmas = total['params']
flops = total['flops']
print(summary_table)
print('Total PARAMs: {}({:.4f}M)'.format(
sum(parmas), sum(parmas) / (10**6)))
print('Total FLOPs: {}({:.2f}G)'.format(sum(flops), sum(flops) / 10**9))
print(
"Notice: \n now supported ops include [Conv, DepthwiseConv, FC(mul), BatchNorm, Pool, Activation(sigmoid, tanh, relu, leaky_relu, prelu)]"
)
...@@ -204,6 +204,10 @@ class GraphWrapper(object): ...@@ -204,6 +204,10 @@ class GraphWrapper(object):
""" """
super(GraphWrapper, self).__init__() super(GraphWrapper, self).__init__()
self.program = Program() if program is None else program self.program = Program() if program is None else program
self.persistables = {}
for var in self.program.list_vars():
if var.persistable:
self.persistables[var.name] = var
self.compiled_graph = None self.compiled_graph = None
self.in_nodes = OrderedDict(in_nodes) self.in_nodes = OrderedDict(in_nodes)
self.out_nodes = OrderedDict(out_nodes) self.out_nodes = OrderedDict(out_nodes)
...@@ -467,7 +471,12 @@ class GraphWrapper(object): ...@@ -467,7 +471,12 @@ class GraphWrapper(object):
path(str): The path to save the persistables. path(str): The path to save the persistables.
exe(framework.Executor): The executor used to save the persistables. exe(framework.Executor): The executor used to save the persistables.
""" """
io.save_persistables(exe.exe, path, main_program=self.program) # update persistables from program
for var in self.program.list_vars():
if var.persistable and var.name not in self.persistables:
self.persistables[var.name] = var
io.save_vars(exe.exe, path, vars=self.persistables.values())
def load_persistables(self, path, exe): def load_persistables(self, path, exe):
""" """
...@@ -481,7 +490,7 @@ class GraphWrapper(object): ...@@ -481,7 +490,7 @@ class GraphWrapper(object):
return os.path.exists(os.path.join(path, var.name)) return os.path.exists(os.path.join(path, var.name))
io.load_vars( io.load_vars(
exe.exe, path, main_program=self.program, predicate=if_exist) exe.exe, path, vars=self.persistables.values(), predicate=if_exist)
def update_param_shape(self, scope): def update_param_shape(self, scope):
""" """
......
...@@ -26,6 +26,17 @@ __all__ = [ ...@@ -26,6 +26,17 @@ __all__ = [
] ]
def _init_var_node(var_node, value, scope, place):
assert isinstance(value,
np.ndarray), 'The type of value should be numpy array.'
assert scope is not None, \
'The scope cannot be set None.'
assert place is not None, \
'The place cannot be set None.'
tensor = scope.var(var_node.name()).get_tensor()
tensor.set(value, place)
class QuantizationTransformPass(object): class QuantizationTransformPass(object):
def __init__(self, def __init__(self,
scope=None, scope=None,
...@@ -88,14 +99,14 @@ class QuantizationTransformPass(object): ...@@ -88,14 +99,14 @@ class QuantizationTransformPass(object):
assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'." assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'."
if activation_quantize_type not in quant_type: if activation_quantize_type not in quant_type:
raise ValueError( raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be ", "Unknown activation_quantize_type : '%s'. It can only be "
"'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." %
str(activation_quantize_type)) (str(activation_quantize_type)))
if weight_quantize_type not in quant_type: if weight_quantize_type not in quant_type:
raise ValueError( raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be ", "Unknown weight_quantize_type: '%s'. It can only be "
"'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'."
str(weight_quantize_type)) % (str(weight_quantize_type)))
self._activation_quantize_type = activation_quantize_type self._activation_quantize_type = activation_quantize_type
self._weight_quantize_type = weight_quantize_type self._weight_quantize_type = weight_quantize_type
...@@ -121,8 +132,6 @@ class QuantizationTransformPass(object): ...@@ -121,8 +132,6 @@ class QuantizationTransformPass(object):
""" """
assert isinstance(graph, assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.' IrGraph), 'graph must be the instance of IrGraph.'
#sequential_execution = core.get_pass('sequential_execution_pass')
#sequential_execution.apply(graph.graph)
self._is_test = graph.is_test() self._is_test = graph.is_test()
# marked the variable which has been dequantized. # marked the variable which has been dequantized.
dequantized_vars = collections.OrderedDict() dequantized_vars = collections.OrderedDict()
...@@ -203,9 +212,12 @@ class QuantizationTransformPass(object): ...@@ -203,9 +212,12 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1], shape=[1],
var_dtype=core.VarDesc.VarType.INT64) var_dtype=core.VarDesc.VarType.INT64)
self._init_var_node( _init_var_node(
global_step_in, np.zeros( global_step_in,
[1], dtype='int64')) np.zeros(
[1], dtype='int64'),
self._scope,
self._place)
global_step_out = graph.create_var_node_from_desc( global_step_out = graph.create_var_node_from_desc(
global_step_in.var()) global_step_in.var())
# The attribute of `op_role` is needed by ParallelExecutor. # The attribute of `op_role` is needed by ParallelExecutor.
...@@ -284,7 +296,12 @@ class QuantizationTransformPass(object): ...@@ -284,7 +296,12 @@ class QuantizationTransformPass(object):
var_dtype=var_node.dtype()) var_dtype=var_node.dtype())
data_type = 'float64' if var_node.dtype( data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32' ) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type)) _init_var_node(
scale_in_node,
np.array(
[0.001], dtype=data_type),
self._scope,
self._place)
scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
inputs = {'X': var_node, 'InScale': scale_in_node} inputs = {'X': var_node, 'InScale': scale_in_node}
...@@ -299,9 +316,13 @@ class QuantizationTransformPass(object): ...@@ -299,9 +316,13 @@ class QuantizationTransformPass(object):
var_dtype=var_node.dtype()) var_dtype=var_node.dtype())
data_type = 'float64' if var_node.dtype( data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32' ) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node( _init_var_node(
scales_node, np.zeros( scales_node,
[self._window_size], dtype=data_type)) np.zeros(
[self._window_size], dtype=data_type),
self._scope,
self._place)
inputs['Iter'] = self._global_step inputs['Iter'] = self._global_step
outputs['OutScales'] = scales_node outputs['OutScales'] = scales_node
attrs = { attrs = {
...@@ -343,7 +364,12 @@ class QuantizationTransformPass(object): ...@@ -343,7 +364,12 @@ class QuantizationTransformPass(object):
var_dtype=var_node.dtype()) var_dtype=var_node.dtype())
data_type = 'float64' if var_node.dtype( data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32' ) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type)) _init_var_node(
scale_in_node,
np.array(
[0.001], dtype=data_type),
self._scope,
self._place)
scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
ins = {'X': var_node, 'InScale': scale_in_node} ins = {'X': var_node, 'InScale': scale_in_node}
...@@ -356,13 +382,23 @@ class QuantizationTransformPass(object): ...@@ -356,13 +382,23 @@ class QuantizationTransformPass(object):
shape=[1]) shape=[1])
data_type = 'float64' if var_node.dtype( data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32' ) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(scale_in_node, np.ones([1], dtype=data_type)) _init_var_node(
scale_in_node,
np.ones(
[1], dtype=data_type),
self._scope,
self._place)
accum_in_node = graph.create_persistable_node( accum_in_node = graph.create_persistable_node(
name=unique_name.generate('accum'), name=unique_name.generate('accum'),
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
var_dtype=var_node.dtype(), var_dtype=var_node.dtype(),
shape=[1]) shape=[1])
self._init_var_node(accum_in_node, np.ones([1], dtype=data_type)) _init_var_node(
accum_in_node,
np.ones(
[1], dtype=data_type),
self._scope,
self._place)
state_out_node = graph.create_var_node_from_desc(state_in_node.var( state_out_node = graph.create_var_node_from_desc(state_in_node.var(
)) ))
accum_out_node = graph.create_var_node_from_desc(accum_in_node.var( accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
...@@ -482,16 +518,6 @@ class QuantizationTransformPass(object): ...@@ -482,16 +518,6 @@ class QuantizationTransformPass(object):
graph.link_to(dequant_op_node, dequant_var_node) graph.link_to(dequant_op_node, dequant_var_node)
return dequant_var_node return dequant_var_node
def _init_var_node(self, var_node, value):
assert isinstance(
value, np.ndarray), 'The type of value should be numpy array.'
assert self._scope is not None, \
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._place is not None, \
'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
tensor = self._scope.var(var_node.name()).get_tensor()
tensor.set(value, self._place)
def _quantized_var_name(self, var_name): def _quantized_var_name(self, var_name):
""" """
Return quantized variable name for the input `var_name`. Return quantized variable name for the input `var_name`.
...@@ -594,8 +620,8 @@ class QuantizationFreezePass(object): ...@@ -594,8 +620,8 @@ class QuantizationFreezePass(object):
self._weight_bits) self._weight_bits)
self._restore_var(input_arg_name, quantized_param_v) self._restore_var(input_arg_name, quantized_param_v)
else: else:
scale_v = self._to_node(op_node.outputs, scale_v = graph._find_node_by_name(
op_node.output('OutScale')[0]) op_node.outputs, op_node.output('OutScale')[0])
self._var_scale_map[input_arg_name] = scale_v self._var_scale_map[input_arg_name] = scale_v
ops = graph.all_op_nodes() ops = graph.all_op_nodes()
...@@ -627,8 +653,8 @@ class QuantizationFreezePass(object): ...@@ -627,8 +653,8 @@ class QuantizationFreezePass(object):
return graph return graph
def _remove_fake_quant_and_dequant_op(self, graph, op_node): def _remove_fake_quant_and_dequant_op(self, graph, op_node):
k = self._to_node(op_node.outputs, op_node.output('Out')[0]) k = graph._find_node_by_name(op_node.outputs, op_node.output('Out')[0])
v = self._to_node(op_node.inputs, op_node.input('X')[0]) v = graph._find_node_by_name(op_node.inputs, op_node.input('X')[0])
if v.node not in self._op_input_rename_map: if v.node not in self._op_input_rename_map:
self._op_input_rename_map[k.node] = v self._op_input_rename_map[k.node] = v
else: else:
...@@ -663,8 +689,8 @@ class QuantizationFreezePass(object): ...@@ -663,8 +689,8 @@ class QuantizationFreezePass(object):
raise ValueError("Only support one output, but op %s has" raise ValueError("Only support one output, but op %s has"
" more than one output." % (op_node.name())) " more than one output." % (op_node.name()))
output_var_node = self._to_node(op_node.outputs, output_var_node = graph._find_node_by_name(
op_node.output_arg_names()[0]) op_node.outputs, op_node.output_arg_names()[0])
weight_scale_node = graph.create_persistable_node( weight_scale_node = graph.create_persistable_node(
name=unique_name.generate('channel_scale'), name=unique_name.generate('channel_scale'),
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
...@@ -672,7 +698,9 @@ class QuantizationFreezePass(object): ...@@ -672,7 +698,9 @@ class QuantizationFreezePass(object):
var_dtype=output_var_node.dtype()) var_dtype=output_var_node.dtype())
data_type = 'float64' if output_var_node.dtype( data_type = 'float64' if output_var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32' ) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(weight_scale_node, channel_scale.astype(data_type)) _init_var_node(weight_scale_node,
channel_scale.astype(data_type), self._scope,
self._place)
dequant_var_node = graph.create_var_node( dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(output_var_node.name()), name=self._dequantized_var_name(output_var_node.name()),
var_type=output_var_node.type(), var_type=output_var_node.type(),
...@@ -724,8 +752,8 @@ class QuantizationFreezePass(object): ...@@ -724,8 +752,8 @@ class QuantizationFreezePass(object):
raise ValueError("Only support one output, but op %s has" raise ValueError("Only support one output, but op %s has"
" more than one output." % (op_node.name())) " more than one output." % (op_node.name()))
output_var_node = self._to_node(op_node.outputs, output_var_node = graph._find_node_by_name(
op_node.output_arg_names()[0]) op_node.outputs, op_node.output_arg_names()[0])
dequant_var_node = graph.create_var_node( dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(output_var_node.name()), name=self._dequantized_var_name(output_var_node.name()),
var_type=output_var_node.type(), var_type=output_var_node.type(),
...@@ -746,24 +774,6 @@ class QuantizationFreezePass(object): ...@@ -746,24 +774,6 @@ class QuantizationFreezePass(object):
self._op_output_rename_map[output_var_node.node] = dequant_var_node self._op_output_rename_map[output_var_node.node] = dequant_var_node
return dequant_var_node return dequant_var_node
def _init_var_node(self, var_node, value):
assert isinstance(
value, np.ndarray), 'The type of value should be numpy array.'
assert self._scope is not None, \
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._place is not None, \
'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
tensor = self._scope.var(var_node.name()).get_tensor()
tensor.set(value, self._place)
def _to_node(self, nodes, node_name):
target_node = None
for n in nodes:
if n.name() == node_name:
target_node = n
assert target_node is not None, "Cannot find the target node in the giving set."
return target_node
def _load_var(self, name): def _load_var(self, name):
return np.array(self._scope.find_var(name).get_tensor()) return np.array(self._scope.find_var(name).get_tensor())
......
...@@ -20,7 +20,7 @@ from .... import io ...@@ -20,7 +20,7 @@ from .... import io
from .... import core from .... import core
from ....compiler import CompiledProgram from ....compiler import CompiledProgram
from ....compiler import BuildStrategy from ....compiler import BuildStrategy
from ....framework import IrGraph from ....framework import IrGraph, Variable, Program
from ..core.strategy import Strategy from ..core.strategy import Strategy
from .quantization_pass import * from .quantization_pass import *
...@@ -45,13 +45,14 @@ class QuantizationStrategy(Strategy): ...@@ -45,13 +45,14 @@ class QuantizationStrategy(Strategy):
activation_bits=8, activation_bits=8,
weight_bits=8, weight_bits=8,
activation_quantize_type='abs_max', activation_quantize_type='abs_max',
weight_quantize_type='abs_max',
save_in_nodes=None, save_in_nodes=None,
save_out_nodes=None): save_out_nodes=None):
""" """
Args: Args:
start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0 start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0 end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
float_model_save_path(str): The path to save model with float weights. float_model_save_path(str): The path to save model with float weights.
None means it doesn't save float model. defalut: None. None means it doesn't save float model. defalut: None.
mobile_model_save_path(str): The path to save model for paddle-mobile execution. mobile_model_save_path(str): The path to save model for paddle-mobile execution.
None means it doesn't save mobile model. defalut: None. None means it doesn't save mobile model. defalut: None.
...@@ -66,9 +67,11 @@ class QuantizationStrategy(Strategy): ...@@ -66,9 +67,11 @@ class QuantizationStrategy(Strategy):
dynamically each step in both training and testing period. If use dynamically each step in both training and testing period. If use
'range_abs_max', a static quantization scale will be calculated 'range_abs_max', a static quantization scale will be calculated
during training and used in inference. during training and used in inference.
save_in_nodes(list<str>): A list of variable names used to prune graph weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'.
The 'range_abs_max' usually is not used for weight, since weights are fixed once the model is well trained.
save_in_nodes(list<str>): A list of variable names used to prune graph
for saving inference model. for saving inference model.
save_out_nodes(list<str>): A list of variable names used to prune graph save_out_nodes(list<str>): A list of variable names used to prune graph
for saving inference model. for saving inference model.
""" """
...@@ -81,43 +84,80 @@ class QuantizationStrategy(Strategy): ...@@ -81,43 +84,80 @@ class QuantizationStrategy(Strategy):
self.activation_bits = activation_bits self.activation_bits = activation_bits
self.weight_bits = weight_bits self.weight_bits = weight_bits
self.activation_quantize_type = activation_quantize_type self.activation_quantize_type = activation_quantize_type
self.weight_quantize_type = weight_quantize_type
self.save_out_nodes = save_out_nodes self.save_out_nodes = save_out_nodes
self.save_in_nodes = save_in_nodes self.save_in_nodes = save_in_nodes
def on_compression_begin(self, context):
    """
    Restore the graph when the compression task is initialized from a
    checkpoint.

    The graph is modified for quantization only when ``context.epoch_id``
    is non-zero and greater than ``self.start_epoch``; otherwise nothing
    is done here.
    """
    epoch = context.epoch_id
    # It is inited from checkpoint and has missed start epoch.
    missed_start_epoch = epoch != 0 and epoch > self.start_epoch
    if not missed_start_epoch:
        return
    _logger.info("Restore quantization task from checkpoint")
    self._modify_graph_for_quantization(context)
    _logger.info("Finish restoring quantization task from checkpoint")
def _modify_graph_for_quantization(self, context):
    """
    Insert fake_quantize_op and fake_dequantize_op before training and testing.

    Works on IR graphs built from clones of the programs held by
    ``context.optimize_graph`` and ``context.eval_graph``, applies one
    ``QuantizationTransformPass`` to both, registers the persistable
    nodes that are not already persistable variables of the training
    program, and stores the compiled graphs back on the context.
    """
    train_desc = context.optimize_graph.program.clone().desc
    train_ir_graph = IrGraph(core.Graph(train_desc), for_test=False)
    test_desc = context.eval_graph.program.clone().desc
    test_ir_graph = IrGraph(core.Graph(test_desc), for_test=True)

    quant_pass = QuantizationTransformPass(
        scope=context.scope,
        place=context.place,
        weight_bits=self.weight_bits,
        activation_bits=self.activation_bits,
        activation_quantize_type=self.activation_quantize_type,
        weight_quantize_type=self.weight_quantize_type)
    for ir_graph in (train_ir_graph, test_ir_graph):
        quant_pass.apply(ir_graph)

    # Put persistables created by transform_pass into context.optimize_graph.persistables
    # for saving checkpoint.
    known_persistables = set(
        var.name for var in context.optimize_graph.program.list_vars()
        if var.persistable)
    holder_program = Program()
    for var_node in train_ir_graph.all_persistable_nodes():
        if var_node.name() in known_persistables:
            continue
        desc = var_node.var()
        new_var = holder_program.global_block().create_var(
            name=var_node.name(),
            shape=desc.shape(),
            dtype=desc.dtype(),
            type=desc.type(),
            lod_level=desc.lod_level())
        context.optimize_graph.persistables[new_var.name] = new_var

    build_strategy = BuildStrategy()
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False

    # for quantization training
    train_compiled = CompiledProgram(
        train_ir_graph.graph).with_data_parallel(
            loss_name=context.optimize_graph.out_nodes['loss'],
            build_strategy=build_strategy)
    context.optimize_graph.compiled_graph = train_compiled

    # for evaluation. And program compiled from ir graph must be with data parallel.
    eval_compiled = CompiledProgram(
        test_ir_graph.graph).with_data_parallel(
            build_strategy=build_strategy)
    context.eval_graph.compiled_graph = eval_compiled

    # for saving inference model after training
    context.put('quantization_test_ir_graph_backup', test_ir_graph)
def on_epoch_begin(self, context): def on_epoch_begin(self, context):
""" """
Insert fake_quantize_op and fake_dequantize_op before trainging and testing. Insert fake_quantize_op and fake_dequantize_op before trainging and testing.
""" """
super(QuantizationStrategy, self).on_compression_begin(context) super(QuantizationStrategy, self).on_epoch_begin(context)
if self.start_epoch == context.epoch_id: if self.start_epoch == context.epoch_id:
_logger.info('QuantizationStrategy::on_epoch_begin') _logger.info('QuantizationStrategy::on_epoch_begin')
train_ir_graph = IrGraph( self._modify_graph_for_quantization(context)
core.Graph(context.optimize_graph.program.desc), for_test=False)
test_ir_graph = IrGraph(
core.Graph(context.eval_graph.program.desc), for_test=True)
transform_pass = QuantizationTransformPass(
scope=context.scope,
place=context.place,
weight_bits=self.weight_bits,
activation_bits=self.activation_bits,
activation_quantize_type=self.activation_quantize_type)
transform_pass.apply(train_ir_graph)
transform_pass.apply(test_ir_graph)
build_strategy = BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
# for quantization training
context.optimize_graph.compiled_graph = CompiledProgram(
train_ir_graph.graph).with_data_parallel(
loss_name=context.optimize_graph.out_nodes['loss'],
build_strategy=build_strategy)
# for evaluation. And program compiled from ir graph must be with data parallel.
context.eval_graph.compiled_graph = CompiledProgram(
test_ir_graph.graph).with_data_parallel(
build_strategy=build_strategy)
# for saving inference model after training
context.put('quantization_test_ir_graph_backup', test_ir_graph)
_logger.info('Finish QuantizationStrategy::on_epoch_begin') _logger.info('Finish QuantizationStrategy::on_epoch_begin')
def on_epoch_end(self, context): def on_epoch_end(self, context):
...@@ -134,7 +174,8 @@ class QuantizationStrategy(Strategy): ...@@ -134,7 +174,8 @@ class QuantizationStrategy(Strategy):
scope=context.scope, scope=context.scope,
place=context.place, place=context.place,
weight_bits=self.weight_bits, weight_bits=self.weight_bits,
activation_bits=self.activation_bits) activation_bits=self.activation_bits,
weight_quantize_type=self.weight_quantize_type)
freeze_pass.apply(test_ir_graph) freeze_pass.apply(test_ir_graph)
# for other strategies # for other strategies
......
...@@ -35,6 +35,8 @@ strategies: ...@@ -35,6 +35,8 @@ strategies:
start_epoch: 0 start_epoch: 0
end_epoch: 0 end_epoch: 0
float_model_save_path: './output/float' float_model_save_path: './output/float'
mobile_model_save_path: './output/mobile'
int8_model_save_path: './output/int8'
weight_bits: 8 weight_bits: 8
activation_bits: 8 activation_bits: 8
weight_quantize_type: 'abs_max' weight_quantize_type: 'abs_max'
......
...@@ -256,8 +256,6 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -256,8 +256,6 @@ class TestQuantizationFreezePass(unittest.TestCase):
place=place, place=place,
activation_quantize_type=activation_quant_type, activation_quantize_type=activation_quant_type,
weight_quantize_type=weight_quant_type) weight_quantize_type=weight_quant_type)
#transform_pass = QuantizationTransformPass(
# scope=scope, place=place, activation_quantize_type=activation_quant_type)
transform_pass.apply(main_graph) transform_pass.apply(main_graph)
transform_pass.apply(test_graph) transform_pass.apply(test_graph)
dev_name = '_gpu_' if use_cuda else '_cpu_' dev_name = '_gpu_' if use_cuda else '_cpu_'
...@@ -315,7 +313,6 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -315,7 +313,6 @@ class TestQuantizationFreezePass(unittest.TestCase):
# Freeze graph for inference, but the weight of fc/conv is still float type. # Freeze graph for inference, but the weight of fc/conv is still float type.
freeze_pass = QuantizationFreezePass( freeze_pass = QuantizationFreezePass(
scope=scope, place=place, weight_quantize_type=weight_quant_type) scope=scope, place=place, weight_quantize_type=weight_quant_type)
#freeze_pass = QuantizationFreezePass(scope=scope, place=place)
freeze_pass.apply(test_graph) freeze_pass.apply(test_graph)
if not for_ci: if not for_ci:
marked_nodes = set() marked_nodes = set()
......
...@@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable'] ...@@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable']
def enabled(): def enabled():
return framework._in_imperative_mode() return framework._in_dygraph_mode()
@signature_safe_contextmanager @signature_safe_contextmanager
...@@ -39,14 +39,14 @@ def guard(place=None): ...@@ -39,14 +39,14 @@ def guard(place=None):
with framework.program_guard(train, startup): with framework.program_guard(train, startup):
with framework.unique_name.guard(): with framework.unique_name.guard():
with framework._imperative_guard(tracer): with framework._dygraph_guard(tracer):
with framework._imperative_place_guard(place): with framework._dygraph_place_guard(place):
yield yield
def to_variable(value, block=None, name=None): def to_variable(value, block=None, name=None):
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
assert enabled(), "to_variable could only be called in imperative mode" assert enabled(), "to_variable could only be called in dygraph mode"
if not block: if not block:
block = framework.default_main_program().current_block() block = framework.default_main_program().current_block()
......
...@@ -68,7 +68,7 @@ def save_persistables(vardict, dirname, filename=None): ...@@ -68,7 +68,7 @@ def save_persistables(vardict, dirname, filename=None):
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell) init_cell)
param_path = "./my_paddle_model" param_path = "./my_paddle_model"
fluid.imperative.checkpoint.save_persistables(ptb_model.state_dict(), dirname=param_path, fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path,
layer=ptb_model) layer=ptb_model)
""" """
if isinstance(vardict, collections.OrderedDict): if isinstance(vardict, collections.OrderedDict):
...@@ -97,17 +97,17 @@ def load_persistables(vardict, dirname, filename=None): ...@@ -97,17 +97,17 @@ def load_persistables(vardict, dirname, filename=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
my_layer = layer(fluid.imperative.Layer) my_layer = layer(fluid.dygraph.Layer)
param_path = "./my_paddle_model" param_path = "./my_paddle_model"
param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.parameters(), param_path) param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path)
param_1 = param_dict['PtbModel_0.w_1'] param_1 = param_dict['PtbModel_0.w_1']
or: or:
my_layer = layer(fluid.imperative.Layer) my_layer = layer(fluid.dygraph.Layer)
param_path = "./my_paddle_model" param_path = "./my_paddle_model"
filename = "model.file" filename = "model.file"
param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path, param_dict = fluid.dygraph.load_persistables(my_layer.state_dict(), param_path,
filename=filename) filename=filename)
param_1 = param_dict['PtbModel_0.w_1'] param_1 = param_dict['PtbModel_0.w_1']
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import copy import copy
import six import six
from ..framework import Parameter, _in_imperative_mode from ..framework import Parameter, _in_dygraph_mode
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from .. import core from .. import core
from six.moves import zip from six.moves import zip
......
...@@ -283,7 +283,7 @@ class PyLayer(core.PyLayer): ...@@ -283,7 +283,7 @@ class PyLayer(core.PyLayer):
@classmethod @classmethod
def __call__(cls, *inputs): def __call__(cls, *inputs):
tracer = framework._imperative_tracer() tracer = framework._dygraph_tracer()
block = framework.default_main_program().current_block() block = framework.default_main_program().current_block()
ivar_inputs = [x._ivar for x in inputs] ivar_inputs = [x._ivar for x in inputs]
......
...@@ -133,7 +133,7 @@ class Conv2D(layers.Layer): ...@@ -133,7 +133,7 @@ class Conv2D(layers.Layer):
outputs={'Out': [pre_act]}, outputs={'Out': [pre_act]},
attrs={'axis': 1}) attrs={'axis': 1})
# Currently, we don't support inplace in imperative mode # Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(pre_act, act=self._act) return self._helper.append_activation(pre_act, act=self._act)
...@@ -265,7 +265,7 @@ class FC(layers.Layer): ...@@ -265,7 +265,7 @@ class FC(layers.Layer):
attrs={'axis': self._num_flatten_dims}) attrs={'axis': self._num_flatten_dims})
else: else:
pre_activation = pre_bias pre_activation = pre_bias
# Currently, we don't support inplace in imperative mode # Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(pre_activation, act=self._act) return self._helper.append_activation(pre_activation, act=self._act)
...@@ -387,7 +387,7 @@ class BatchNorm(layers.Layer): ...@@ -387,7 +387,7 @@ class BatchNorm(layers.Layer):
"use_global_stats": self._use_global_stats "use_global_stats": self._use_global_stats
}) })
# Currently, we don't support inplace in imperative mode # Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(batch_norm_out, self._act) return self._helper.append_activation(batch_norm_out, self._act)
...@@ -426,7 +426,7 @@ class Embedding(layers.Layer): ...@@ -426,7 +426,7 @@ class Embedding(layers.Layer):
dict_size = len(dataset.ids) dict_size = len(dataset.ids)
input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
embedding = fluid.imperative.Embedding(size=[dict_size, 16]) embedding = fluid.dygraph.Embedding(size=[dict_size, 16])
fc = embedding(input) fc = embedding(input)
""" """
......
...@@ -24,12 +24,12 @@ __all__ = ['Tracer'] ...@@ -24,12 +24,12 @@ __all__ = ['Tracer']
def release_op(op): def release_op(op):
del framework._imperative_tracer()._ops[op._trace_id] del framework._dygraph_tracer()._ops[op._trace_id]
class Tracer(core.Tracer): class Tracer(core.Tracer):
""" """
Python wrapper of imperative tracer Python wrapper of dygraph tracer
""" """
def __init__(self, block): def __init__(self, block):
......
...@@ -75,20 +75,20 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ...@@ -75,20 +75,20 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix()
ZERO_VAR_SUFFIX = core.kZeroVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
_imperative_tracer_ = None _dygraph_tracer_ = None
_imperative_current_expected_place_ = None _dygraph_current_expected_place_ = None
def _in_imperative_mode(): def _in_dygraph_mode():
return _imperative_tracer_ is not None return _dygraph_tracer_ is not None
def _imperative_tracer(): def _dygraph_tracer():
return _imperative_tracer_ return _dygraph_tracer_
def _current_expected_place(): def _current_expected_place():
return _imperative_current_expected_place_ return _dygraph_current_expected_place_
def _cpu_num(): def _cpu_num():
...@@ -396,7 +396,7 @@ class Variable(object): ...@@ -396,7 +396,7 @@ class Variable(object):
if not isinstance(dtype, core.VarDesc.VarType): if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype) dtype = convert_np_dtype_to_dtype_(dtype)
if _in_imperative_mode(): if _in_dygraph_mode():
# record vars in tracer rather than blocks # record vars in tracer rather than blocks
self._ivar = kwargs.get("ivar", None) self._ivar = kwargs.get("ivar", None)
if not self._ivar: if not self._ivar:
...@@ -406,7 +406,7 @@ class Variable(object): ...@@ -406,7 +406,7 @@ class Variable(object):
_current_expected_place(), stop_gradient, True _current_expected_place(), stop_gradient, True
if persistable else False) if persistable else False)
if persistable: if persistable:
_imperative_tracer().trace_var(name, self) _dygraph_tracer().trace_var(name, self)
else: else:
self.error_clip = error_clip self.error_clip = error_clip
...@@ -515,8 +515,8 @@ class Variable(object): ...@@ -515,8 +515,8 @@ class Variable(object):
Returns: Returns:
str: The debug string. str: The debug string.
""" """
if _in_imperative_mode(): if _in_dygraph_mode():
# TODO(panyx0718): add more imperative debug info. # TODO(panyx0718): add more dygraph debug info.
return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype, return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype,
self.shape) self.shape)
...@@ -548,42 +548,42 @@ class Variable(object): ...@@ -548,42 +548,42 @@ class Variable(object):
@property @property
def _stop_gradient(self): def _stop_gradient(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.stop_gradient return self._ivar.stop_gradient
else: else:
return self.stop_gradient return self.stop_gradient
@_stop_gradient.setter @_stop_gradient.setter
def _stop_gradient(self, s): def _stop_gradient(self, s):
if _in_imperative_mode(): if _in_dygraph_mode():
self._ivar.stop_gradient = s self._ivar.stop_gradient = s
else: else:
self.stop_gradient = s self.stop_gradient = s
@property @property
def persistable(self): def persistable(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.persistable return self._ivar.persistable
else: else:
return self.desc.persistable() return self.desc.persistable()
@persistable.setter @persistable.setter
def persistable(self, p): def persistable(self, p):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.persistable return self._ivar.persistable
else: else:
self.desc.set_persistable(p) self.desc.set_persistable(p)
@property @property
def name(self): def name(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.name return self._ivar.name
else: else:
return cpt.to_text(self.desc.name()) return cpt.to_text(self.desc.name())
@name.setter @name.setter
def name(self, new_name): def name(self, new_name):
if _in_imperative_mode(): if _in_dygraph_mode():
self._ivar.name = new_name self._ivar.name = new_name
else: else:
self.desc.set_name(new_name) self.desc.set_name(new_name)
...@@ -591,26 +591,26 @@ class Variable(object): ...@@ -591,26 +591,26 @@ class Variable(object):
@property @property
def shape(self): def shape(self):
# convert to tuple, make it as same as numpy API. # convert to tuple, make it as same as numpy API.
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.shape return self._ivar.shape
else: else:
return tuple(self.desc.shape()) return tuple(self.desc.shape())
@property @property
def dtype(self): def dtype(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.dtype return self._ivar.dtype
else: else:
return self.desc.dtype() return self.desc.dtype()
@property @property
def lod_level(self): def lod_level(self):
# TODO(minqiyang): Support lod_level in imperative mode # TODO(minqiyang): Support lod_level in dygraph mode
return self.desc.lod_level() return self.desc.lod_level()
@property @property
def type(self): def type(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.dtype return self._ivar.dtype
else: else:
return self.desc.type() return self.desc.type()
...@@ -918,7 +918,7 @@ class Operator(object): ...@@ -918,7 +918,7 @@ class Operator(object):
inputs=None, inputs=None,
outputs=None, outputs=None,
attrs=None): attrs=None):
if _in_imperative_mode(): if _in_dygraph_mode():
if type is None: if type is None:
raise ValueError( raise ValueError(
"`type` to initialized an Operator can not be None.") "`type` to initialized an Operator can not be None.")
...@@ -1037,7 +1037,7 @@ class Operator(object): ...@@ -1037,7 +1037,7 @@ class Operator(object):
for arg in out_args: for arg in out_args:
out_arg_names.append(cpt.to_text(arg.name)) out_arg_names.append(cpt.to_text(arg.name))
# TODO(minqiyang): could we remove variable's op in static mode? # TODO(minqiyang): could we remove variable's op in static mode?
if not _in_imperative_mode(): if not _in_dygraph_mode():
arg.op = self arg.op = self
self.desc.set_output(out_proto.name, out_arg_names) self.desc.set_output(out_proto.name, out_arg_names)
...@@ -1083,7 +1083,7 @@ class Operator(object): ...@@ -1083,7 +1083,7 @@ class Operator(object):
@property @property
def type(self): def type(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self.iop.type return self.iop.type
else: else:
return self.desc.type() return self.desc.type()
...@@ -1626,7 +1626,7 @@ class Block(object): ...@@ -1626,7 +1626,7 @@ class Block(object):
Returns: Returns:
Operator: the append Operator. Operator: the append Operator.
""" """
if _in_imperative_mode(): if _in_dygraph_mode():
op = Operator( op = Operator(
block=self, block=self,
desc=None, desc=None,
...@@ -1638,9 +1638,8 @@ class Block(object): ...@@ -1638,9 +1638,8 @@ class Block(object):
# record ops in tracer rather than blocks # record ops in tracer rather than blocks
# #
# TODO(minqiyang): add op stop_gradient support in static mode too. # TODO(minqiyang): add op stop_gradient support in static mode too.
# currently, we only support stop_gradient in imperative mode. # currently, we only support stop_gradient in dygraph mode.
_imperative_tracer().trace_op(op, _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc.append_op() op_desc = self.desc.append_op()
op = Operator( op = Operator(
...@@ -1699,7 +1698,7 @@ class Block(object): ...@@ -1699,7 +1698,7 @@ class Block(object):
return self.ops[start:end] return self.ops[start:end]
def _prepend_op(self, *args, **kwargs): def _prepend_op(self, *args, **kwargs):
if _in_imperative_mode(): if _in_dygraph_mode():
op = Operator( op = Operator(
self, self,
None, None,
...@@ -1707,8 +1706,7 @@ class Block(object): ...@@ -1707,8 +1706,7 @@ class Block(object):
inputs=kwargs.get("inputs", None), inputs=kwargs.get("inputs", None),
outputs=kwargs.get("outputs", None), outputs=kwargs.get("outputs", None),
attrs=kwargs.get("attrs", None)) attrs=kwargs.get("attrs", None))
_imperative_tracer().trace_op(op, _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc._prepend_op() op_desc = self.desc._prepend_op()
op = Operator( op = Operator(
...@@ -2347,40 +2345,6 @@ class IrGraph(object): ...@@ -2347,40 +2345,6 @@ class IrGraph(object):
""" """
return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()} return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()}
def _find_var_node(self, key):
    """
    Look up a variable node of this graph by its name or by its id.

    Note:
        Several nodes may share one name. Every variable node is
        scanned and the last one that matches is returned, so be careful
        when searching by name.

    Args:
        key(str|int): A string is compared against each node's ``name()``;
            an int is compared against each node's ``id()``.

    Raises:
        ValueError: If no variable node matches ``key`` (this includes a
            ``key`` that is neither a string nor an int).

    Returns:
        IrVarNode: the matching variable node.
    """
    candidates = self.all_var_nodes()

    # Choose which attribute of a node is compared against the key.
    if isinstance(key, six.string_types):
        read_key = lambda node: node.name()
    elif isinstance(key, int):
        read_key = lambda node: node.id()
    else:
        read_key = None

    found = None
    if read_key is not None:
        # No early exit on purpose: the last matching node wins.
        for candidate in candidates:
            if read_key(candidate) == key:
                found = candidate

    if found is None:
        raise ValueError("var_node %s not in this graph" % key)
    return found
def create_persistable_node(self, name, var_type, shape, var_dtype): def create_persistable_node(self, name, var_type, shape, var_dtype):
""" """
Create a persistable variable node in the graph. In IrGraph, Create a persistable variable node in the graph. In IrGraph,
...@@ -2525,14 +2489,6 @@ class IrGraph(object): ...@@ -2525,14 +2489,6 @@ class IrGraph(object):
core.graph_safe_remove_nodes(self.graph, original_nodes) core.graph_safe_remove_nodes(self.graph, original_nodes)
def resolve_hazard(self): def resolve_hazard(self):
def _to_node(nodes, node_name):
target_node = None
for n in nodes:
if n.name() == node_name:
target_node = n
assert target_node is not None, "Cannot find the target node in the giving set."
return target_node
ordered_nodes = core.topology_sort(self.graph) ordered_nodes = core.topology_sort(self.graph)
var_nodes = dict() var_nodes = dict()
for node in ordered_nodes: for node in ordered_nodes:
...@@ -2540,16 +2496,17 @@ class IrGraph(object): ...@@ -2540,16 +2496,17 @@ class IrGraph(object):
for each_var_name in node.op().input_arg_names(): for each_var_name in node.op().input_arg_names():
if each_var_name not in var_nodes: if each_var_name not in var_nodes:
var_nodes[each_var_name] = [ var_nodes[each_var_name] = [
_to_node(node.inputs, each_var_name) self._find_node_by_name(node.inputs, each_var_name)
] ]
for each_var_name in node.op().output_arg_names(): for each_var_name in node.op().output_arg_names():
if each_var_name not in var_nodes: if each_var_name not in var_nodes:
var_nodes[each_var_name] = [ var_nodes[each_var_name] = [
_to_node(node.outputs, each_var_name) self._find_node_by_name(node.outputs, each_var_name)
] ]
else: else:
var_nodes[each_var_name].append( var_nodes[each_var_name].append(
_to_node(node.outputs, each_var_name)) self._find_node_by_name(node.outputs,
each_var_name))
self.graph.resolve_hazard(var_nodes) self.graph.resolve_hazard(var_nodes)
def has_circle(self): def has_circle(self):
...@@ -2662,6 +2619,17 @@ class IrGraph(object): ...@@ -2662,6 +2619,17 @@ class IrGraph(object):
program = Program._construct_from_desc(desc) program = Program._construct_from_desc(desc)
return program return program
def _find_node_by_name(self, nodes, node_name):
"""
Find a node in the giving nodes set by the name.
"""
target_node = None
for n in nodes:
if n.name() == node_name:
target_node = n
assert target_node is not None, "Cannot find the target node in the giving set."
return target_node
def _update_desc_attr(self, desc, name, val): def _update_desc_attr(self, desc, name, val):
""" """
Update the value of desc's attribute by attribute's name. Update the value of desc's attribute by attribute's name.
...@@ -3541,22 +3509,22 @@ def _get_var(name, program=None): ...@@ -3541,22 +3509,22 @@ def _get_var(name, program=None):
@signature_safe_contextmanager @signature_safe_contextmanager
def _imperative_guard(tracer): def _dygraph_guard(tracer):
global _imperative_tracer_ global _dygraph_tracer_
tmp_trace = _imperative_tracer_ tmp_trace = _dygraph_tracer_
_imperative_tracer_ = tracer _dygraph_tracer_ = tracer
yield yield
_imperative_tracer_ = tmp_trace _dygraph_tracer_ = tmp_trace
@signature_safe_contextmanager @signature_safe_contextmanager
def _imperative_place_guard(place): def _dygraph_place_guard(place):
global _imperative_current_expected_place_ global _dygraph_current_expected_place_
tmp_place = _imperative_current_expected_place_ tmp_place = _dygraph_current_expected_place_
_imperative_current_expected_place_ = place _dygraph_current_expected_place_ = place
yield yield
_imperative_current_expected_place_ = tmp_place _dygraph_current_expected_place_ = tmp_place
...@@ -165,7 +165,7 @@ class ConstantInitializer(Initializer): ...@@ -165,7 +165,7 @@ class ConstantInitializer(Initializer):
'force_cpu': self._force_cpu or force_init_on_cpu() 'force_cpu': self._force_cpu or force_init_on_cpu()
}, },
stop_gradient=True) stop_gradient=True)
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -245,7 +245,7 @@ class UniformInitializer(Initializer): ...@@ -245,7 +245,7 @@ class UniformInitializer(Initializer):
attrs={"in_dtype": out_var.dtype, attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype}) "out_dtype": var.dtype})
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -324,7 +324,7 @@ class NormalInitializer(Initializer): ...@@ -324,7 +324,7 @@ class NormalInitializer(Initializer):
outputs={"Out": var}, outputs={"Out": var},
attrs={"in_dtype": out_var.dtype, attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype}) "out_dtype": var.dtype})
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -403,7 +403,7 @@ class TruncatedNormalInitializer(Initializer): ...@@ -403,7 +403,7 @@ class TruncatedNormalInitializer(Initializer):
outputs={"Out": var}, outputs={"Out": var},
attrs={"in_dtype": out_var.dtype, attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype}) "out_dtype": var.dtype})
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -509,7 +509,7 @@ class XavierInitializer(Initializer): ...@@ -509,7 +509,7 @@ class XavierInitializer(Initializer):
"seed": self._seed "seed": self._seed
}, },
stop_gradient=True) stop_gradient=True)
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -610,7 +610,7 @@ class MSRAInitializer(Initializer): ...@@ -610,7 +610,7 @@ class MSRAInitializer(Initializer):
"seed": self._seed "seed": self._seed
}, },
stop_gradient=True) stop_gradient=True)
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -709,7 +709,7 @@ class BilinearInitializer(Initializer): ...@@ -709,7 +709,7 @@ class BilinearInitializer(Initializer):
'shape': list(shape), 'shape': list(shape),
value_name: values value_name: values
}) })
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -768,7 +768,7 @@ class NumpyArrayInitializer(Initializer): ...@@ -768,7 +768,7 @@ class NumpyArrayInitializer(Initializer):
value_name: values value_name: values
}, },
stop_gradient=True) stop_gradient=True)
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
......
...@@ -17,7 +17,7 @@ from .param_attr import ParamAttr ...@@ -17,7 +17,7 @@ from .param_attr import ParamAttr
from .initializer import Constant from .initializer import Constant
from . import layers from . import layers
from . import backward from . import backward
from .imperative import Layer, nn from .dygraph import Layer, nn
from . import executor from . import executor
from . import core from . import core
......
...@@ -17,7 +17,7 @@ from __future__ import print_function ...@@ -17,7 +17,7 @@ from __future__ import print_function
import copy import copy
import six import six
from .framework import Parameter, dtype_is_floating, _in_imperative_mode from .framework import Parameter, dtype_is_floating, _in_dygraph_mode
from . import unique_name from . import unique_name
from paddle.fluid.initializer import Constant, Xavier from paddle.fluid.initializer import Constant, Xavier
from .param_attr import ParamAttr from .param_attr import ParamAttr
...@@ -30,9 +30,9 @@ class LayerHelper(LayerHelperBase): ...@@ -30,9 +30,9 @@ class LayerHelper(LayerHelperBase):
def __init__(self, layer_type, **kwargs): def __init__(self, layer_type, **kwargs):
self.kwargs = kwargs self.kwargs = kwargs
name = self.kwargs.get('name', None) name = self.kwargs.get('name', None)
# TODO(panyx0718, minqiyang): imperative mode # TODO(panyx0718, minqiyang): dygraph mode
# can not use both `layer_type` and `name`. Deprecate LayerHelper # can not use both `layer_type` and `name`. Deprecate LayerHelper
# and write a Helper for imperative mode. # and write a Helper for dygraph mode.
if name is None: if name is None:
self.kwargs['name'] = unique_name.generate(layer_type) self.kwargs['name'] = unique_name.generate(layer_type)
......
...@@ -17,7 +17,7 @@ from __future__ import print_function ...@@ -17,7 +17,7 @@ from __future__ import print_function
import copy import copy
import numpy as np import numpy as np
from .framework import Variable, default_main_program, default_startup_program, _in_imperative_mode, _current_expected_place from .framework import Variable, default_main_program, default_startup_program, _in_dygraph_mode, _current_expected_place
from . import unique_name from . import unique_name
from .param_attr import ParamAttr, WeightNormParamAttr from .param_attr import ParamAttr, WeightNormParamAttr
from . import core from . import core
...@@ -54,8 +54,8 @@ class LayerHelperBase(object): ...@@ -54,8 +54,8 @@ class LayerHelperBase(object):
Return Variable construct from value Return Variable construct from value
""" """
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
assert _in_imperative_mode( assert _in_dygraph_mode(
), "to_variable could only be called in imperative mode" ), "to_variable could only be called in dygraph mode"
if not block: if not block:
block = default_main_program().current_block() block = default_main_program().current_block()
...@@ -302,8 +302,8 @@ class LayerHelperBase(object): ...@@ -302,8 +302,8 @@ class LayerHelperBase(object):
param = self._create_weight_normalize(attr, shape, dtype) param = self._create_weight_normalize(attr, shape, dtype)
WeightNormParamAttr.params_with_weight_norm.append(param) WeightNormParamAttr.params_with_weight_norm.append(param)
return param return param
if _in_imperative_mode(): if _in_dygraph_mode():
# In imperative mode, we want the returned parameter to be # In dygraph mode, we want the returned parameter to be
# initialized so that it can be used imperatively. # initialized so that it can be used imperatively.
return self.main_program.global_block().create_parameter( return self.main_program.global_block().create_parameter(
dtype=dtype, dtype=dtype,
...@@ -370,7 +370,7 @@ class LayerHelperBase(object): ...@@ -370,7 +370,7 @@ class LayerHelperBase(object):
initializer: initializer to use initializer: initializer to use
""" """
assert isinstance(var, Variable) assert isinstance(var, Variable)
if _in_imperative_mode(): if _in_dygraph_mode():
initializer(var, var.block) initializer(var, var.block)
else: else:
self.startup_program.global_block().create_var( self.startup_program.global_block().create_var(
......
...@@ -23,8 +23,8 @@ import os ...@@ -23,8 +23,8 @@ import os
import inspect import inspect
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant, NumpyArrayInitializer from ..initializer import Normal, Constant, NumpyArrayInitializer
from ..framework import Variable, OpProtoHolder, _in_imperative_mode from ..framework import Variable, OpProtoHolder, _in_dygraph_mode
from ..imperative import base from ..dygraph import base
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
from .tensor import concat, assign from .tensor import concat, assign
...@@ -32,7 +32,7 @@ from . import utils ...@@ -32,7 +32,7 @@ from . import utils
from .. import unique_name from .. import unique_name
from functools import reduce from functools import reduce
from .. import core from .. import core
from ..imperative import layers from ..dygraph import layers
__all__ = [ __all__ = [
'fc', 'fc',
...@@ -296,7 +296,6 @@ def fc(input, ...@@ -296,7 +296,6 @@ def fc(input,
data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32") data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh") fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh")
""" """
helper = LayerHelper("fc", **locals()) helper = LayerHelper("fc", **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -3279,6 +3278,8 @@ def layer_norm(input, ...@@ -3279,6 +3278,8 @@ def layer_norm(input,
>>> dtype='float32') >>> dtype='float32')
>>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
""" """
assert _in_dygraph_mode(
) is not True, "please use FC instead of fc in dygraph mode!"
helper = LayerHelper('layer_norm', **locals()) helper = LayerHelper('layer_norm', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -5866,11 +5867,49 @@ def multiplex(inputs, index): ...@@ -5866,11 +5867,49 @@ def multiplex(inputs, index):
""" """
${comment} ${comment}
>>> import paddle.fluid as fluid For Example:
>>> x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
>>> x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') .. code-block:: text
>>> index = fluid.layers.data(name='index', shape=[1], dtype='int32')
>>> out = fluid.layers.multiplex(inputs=[x1, x2], index=index) case 1:
Given:
X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]],
[[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]],
[[2,0,3,4], [2,1,7,8], [2,2,4,2], [2,3,3,4]],
[[3,0,3,4], [3,1,7,8], [3,2,4,2], [3,3,3,4]]]
index = [3,0,1,2]
out:[[3 0 3 4] // X[3,0] (3 = index[i], 0 = i); i=0
[0 1 3 4] // X[0,1] (0 = index[i], 1 = i); i=1
[1 2 4 2] // X[1,2] (1 = index[i], 2 = i); i=2
[2 3 3 4]] // X[2,3] (2 = index[i], 3 = i); i=3
case 2:
Given:
X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]],
[[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]]]
index = [1,0]
out:[[1 0 3 4] // X[1,0] (1 = index[0], 0 = i); i=0
[0 1 3 4] // X[0,1] (0 = index[1], 1 = i); i=1
[0 2 4 4] // X[0,2] (0 = 0, 2 = i); i=2
[0 3 3 4]] // X[0,3] (0 = 0, 3 = i); i=3
Examples:
.. code-block:: python
import paddle.fluid as fluid
x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
index = fluid.layers.data(name='index', shape=[1], dtype='int32')
out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
Args: Args:
inputs (list): ${x_comment}. inputs (list): ${x_comment}.
...@@ -6405,8 +6444,8 @@ def squeeze(input, axes, name=None): ...@@ -6405,8 +6444,8 @@ def squeeze(input, axes, name=None):
x = layers.data(name='x', shape=[5, 1, 10]) x = layers.data(name='x', shape=[5, 1, 10])
y = layers.sequeeze(input=x, axes=[1]) y = layers.sequeeze(input=x, axes=[1])
""" """
assert not _in_imperative_mode(), ( assert not _in_dygraph_mode(), (
"squeeze layer is not supported in imperative mode yet.") "squeeze layer is not supported in dygraph mode yet.")
helper = LayerHelper("squeeze", **locals()) helper = LayerHelper("squeeze", **locals())
out = helper.create_variable_for_type_inference(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
...@@ -9144,7 +9183,7 @@ def _elementwise_op(helper): ...@@ -9144,7 +9183,7 @@ def _elementwise_op(helper):
op_type = helper.layer_type op_type = helper.layer_type
x = helper.kwargs.get('x', None) x = helper.kwargs.get('x', None)
y = helper.kwargs.get('y', None) y = helper.kwargs.get('y', None)
if _in_imperative_mode(): if _in_dygraph_mode():
x = base.to_variable(x) x = base.to_variable(x)
y = base.to_variable(y) y = base.to_variable(y)
......
...@@ -20,7 +20,6 @@ from ..framework import convert_np_dtype_to_dtype_ ...@@ -20,7 +20,6 @@ from ..framework import convert_np_dtype_to_dtype_
from ..framework import Variable from ..framework import Variable
from ..initializer import Constant, force_init_on_cpu from ..initializer import Constant, force_init_on_cpu
from ..core import VarDesc from ..core import VarDesc
from ..imperative import base as imperative_base
from .layer_function_generator import templatedoc from .layer_function_generator import templatedoc
import numpy import numpy
......
...@@ -30,7 +30,6 @@ from .initializer import Constant ...@@ -30,7 +30,6 @@ from .initializer import Constant
from .layer_helper import LayerHelper from .layer_helper import LayerHelper
from .layers import ops from .layers import ops
from .regularizer import append_regularization_ops from .regularizer import append_regularization_ops
from .imperative import base as imperative_base
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.layers import tensor from paddle.fluid.layers import tensor
from functools import reduce from functools import reduce
...@@ -169,7 +168,7 @@ class Optimizer(object): ...@@ -169,7 +168,7 @@ class Optimizer(object):
name = self._name + "_" + name name = self._name + "_" + name
if (name in self._accumulators and if (name in self._accumulators and
param.name in self._accumulators[name]): param.name in self._accumulators[name]):
if framework._in_imperative_mode(): if framework._in_dygraph_mode():
return self._accumulators[name][param.name] return self._accumulators[name][param.name]
raise Exception("Accumulator {} already exists for parameter {}". raise Exception("Accumulator {} already exists for parameter {}".
format(name, param.name)) format(name, param.name))
...@@ -396,11 +395,11 @@ class Optimizer(object): ...@@ -396,11 +395,11 @@ class Optimizer(object):
""" """
self._dtype = loss.dtype self._dtype = loss.dtype
optimize_ops = [] optimize_ops = []
if framework._in_imperative_mode(): if framework._in_dygraph_mode():
if parameter_list is not None: if parameter_list is not None:
parameters = parameter_list parameters = parameter_list
else: else:
parameters = framework._imperative_tracer().all_parameters() parameters = framework._dygraph_tracer().all_parameters()
params_grads = [] params_grads = []
for param in parameters: for param in parameters:
......
...@@ -262,14 +262,14 @@ class OpTest(unittest.TestCase): ...@@ -262,14 +262,14 @@ class OpTest(unittest.TestCase):
if isinstance(value, tuple): if isinstance(value, tuple):
data = value[0] data = value[0]
lod = value[1] lod = value[1]
v = fluid.imperative.base.to_variable(value=data) v = fluid.dygraph.base.to_variable(value=data)
v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod) v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod)
return v return v
else: else:
return fluid.imperative.base.to_variable(value) return fluid.dygraph.base.to_variable(value)
def _calc_imperative_output(self, place, parallel=False, no_check_set=None): def _calc_dygraph_output(self, place, parallel=False, no_check_set=None):
with fluid.imperative.base.guard(place=place): with fluid.dygraph.base.guard(place=place):
block = fluid.default_main_program().global_block() block = fluid.default_main_program().global_block()
# prepare input variable # prepare input variable
...@@ -316,7 +316,7 @@ class OpTest(unittest.TestCase): ...@@ -316,7 +316,7 @@ class OpTest(unittest.TestCase):
return outputs return outputs
def _calc_output(self, place, parallel=False, no_check_set=None): def _calc_output(self, place, parallel=False, no_check_set=None, loss=None):
program = Program() program = Program()
block = program.global_block() block = program.global_block()
self._append_ops(block) self._append_ops(block)
...@@ -329,8 +329,14 @@ class OpTest(unittest.TestCase): ...@@ -329,8 +329,14 @@ class OpTest(unittest.TestCase):
use_cuda = False use_cuda = False
if isinstance(place, fluid.CUDAPlace(0)): if isinstance(place, fluid.CUDAPlace(0)):
use_cuda = True use_cuda = True
executor = fluid.ParallelExecutor( if loss:
use_cuda=use_cuda, loss_name=loss.name, main_program=program) executor = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=loss.name,
main_program=program)
else:
executor = fluid.ParallelExecutor(
use_cuda=use_cuda, main_program=program)
else: else:
executor = Executor(place) executor = Executor(place)
...@@ -364,9 +370,9 @@ class OpTest(unittest.TestCase): ...@@ -364,9 +370,9 @@ class OpTest(unittest.TestCase):
atol, atol,
no_check_set=None, no_check_set=None,
equal_nan=False, equal_nan=False,
check_imperative=False): check_dygraph=False):
if check_imperative: if check_dygraph:
imperative_outs = self._calc_imperative_output( dygraph_outs = self._calc_dygraph_output(
place, no_check_set=no_check_set) place, no_check_set=no_check_set)
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
...@@ -393,8 +399,8 @@ class OpTest(unittest.TestCase): ...@@ -393,8 +399,8 @@ class OpTest(unittest.TestCase):
type(sub_out)) type(sub_out))
for item in sub_out: for item in sub_out:
sub_out_name, expect = item[0], item[1] sub_out_name, expect = item[0], item[1]
if check_imperative: if check_dygraph:
imperative_actual = imperative_outs[sub_out_name][0] imperative_actual = dygraph_outs[sub_out_name][0]
imperative_actual_t = np.array( imperative_actual_t = np.array(
imperative_actual._ivar.value().get_tensor()) imperative_actual._ivar.value().get_tensor())
idx = find_actual(sub_out_name, fetch_list) idx = find_actual(sub_out_name, fetch_list)
...@@ -407,7 +413,7 @@ class OpTest(unittest.TestCase): ...@@ -407,7 +413,7 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place)) str(place))
if check_imperative: if check_dygraph:
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
imperative_actual_t, imperative_actual_t,
...@@ -415,21 +421,21 @@ class OpTest(unittest.TestCase): ...@@ -415,21 +421,21 @@ class OpTest(unittest.TestCase):
atol=atol, atol=atol,
equal_nan=equal_nan), equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place) + " in imperative mode") str(place) + " in dygraph mode")
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual( self.assertListEqual(
actual.recursive_sequence_lengths(), expect[1], actual.recursive_sequence_lengths(), expect[1],
"Output (" + sub_out_name + "Output (" + sub_out_name +
") has different lod at " + str(place)) ") has different lod at " + str(place))
if check_imperative: if check_dygraph:
self.assertListEqual( self.assertListEqual(
imperative_actual._ivar.value().get_tensor() imperative_actual._ivar.value().get_tensor()
.recursive_sequence_lengths(), expect[1], .recursive_sequence_lengths(), expect[1],
"Output (" + out_name + ") has different lod at " + "Output (" + out_name + ") has different lod at " +
str(place) + " in imperative mode") str(place) + " in dygraph mode")
else: else:
if check_imperative: if check_dygraph:
imperative_actual = imperative_outs[out_name][0] imperative_actual = dygraph_outs[out_name][0]
imperative_actual_t = np.array( imperative_actual_t = np.array(
imperative_actual._ivar.value().get_tensor()) imperative_actual._ivar.value().get_tensor())
idx = find_actual(out_name, fetch_list) idx = find_actual(out_name, fetch_list)
...@@ -443,7 +449,7 @@ class OpTest(unittest.TestCase): ...@@ -443,7 +449,7 @@ class OpTest(unittest.TestCase):
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t) + " in class " + self.__class__.__name__) str(actual_t) + " in class " + self.__class__.__name__)
if check_imperative: if check_dygraph:
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
imperative_actual_t, imperative_actual_t,
...@@ -458,12 +464,12 @@ class OpTest(unittest.TestCase): ...@@ -458,12 +464,12 @@ class OpTest(unittest.TestCase):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +
") has different lod at " + str(place)) ") has different lod at " + str(place))
if check_imperative: if check_dygraph:
self.assertListEqual( self.assertListEqual(
imperative_actual._ivar.value().get_tensor() imperative_actual._ivar.value().get_tensor()
.recursive_sequence_lengths(), expect[1], .recursive_sequence_lengths(), expect[1],
"Output (" + out_name + ") has different lod at " + "Output (" + out_name + ") has different lod at " +
str(place) + " in imperative mode") str(place) + " in dygraph mode")
def _get_places(self): def _get_places(self):
if self.dtype == np.float16: if self.dtype == np.float16:
...@@ -490,11 +496,11 @@ class OpTest(unittest.TestCase): ...@@ -490,11 +496,11 @@ class OpTest(unittest.TestCase):
atol=1e-5, atol=1e-5,
no_check_set=None, no_check_set=None,
equal_nan=False, equal_nan=False,
check_imperative=False): check_dygraph=False):
places = self._get_places() places = self._get_places()
for place in places: for place in places:
self.check_output_with_place(place, atol, no_check_set, equal_nan, self.check_output_with_place(place, atol, no_check_set, equal_nan,
check_imperative) check_dygraph)
def check_output_customized(self, checker): def check_output_customized(self, checker):
places = self._get_places() places = self._get_places()
......
...@@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase):
use_ir_memory_optimize=True, use_ir_memory_optimize=True,
enable_inplace=True, enable_inplace=True,
fuse_elewise_add_act_ops=False, fuse_elewise_add_act_ops=False,
fuse_all_optimizer_ops=False,
fuse_all_reduce_ops=False, fuse_all_reduce_ops=False,
fuse_relu_depthwise_conv=False, fuse_relu_depthwise_conv=False,
optimizer=fluid.optimizer.Adam, optimizer=fluid.optimizer.Adam,
...@@ -81,6 +82,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -81,6 +82,7 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops
build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops
# python memory optimization is conflict with inplace pass. # python memory optimization is conflict with inplace pass.
# Use ir graph memory optimization after inplace pass is the correct way. # Use ir graph memory optimization after inplace pass is the correct way.
......
...@@ -16,8 +16,10 @@ from __future__ import print_function ...@@ -16,8 +16,10 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
from paddle.fluid import core
alignment = 256
class TestAllocContinuousSpace(OpTest): class TestAllocContinuousSpace(OpTest):
...@@ -29,11 +31,11 @@ class TestAllocContinuousSpace(OpTest): ...@@ -29,11 +31,11 @@ class TestAllocContinuousSpace(OpTest):
self.constant = attrs["constant"] self.constant = attrs["constant"]
self.set_constant = attrs["set_constant"] self.set_constant = attrs["set_constant"]
self.Inputs = self.init_input() self.Inputs = self.init_input()
self.FusedOutput = self.init_output(self.Inputs, self.set_constant, self.Outputs, self.FusedOutput = self.init_output(
self.constant) self.Inputs, self.set_constant, self.constant)
self.inputs = {'Input': self.Inputs} self.inputs = {'Input': self.Inputs}
self.attrs = attrs self.attrs = attrs
self.outputs = {'Output': self.Inputs, 'FusedOutput': self.FusedOutput} self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput}
def init_dtype(self): def init_dtype(self):
self.dtype = np.float32 self.dtype = np.float32
...@@ -52,14 +54,31 @@ class TestAllocContinuousSpace(OpTest): ...@@ -52,14 +54,31 @@ class TestAllocContinuousSpace(OpTest):
return {"copy_data": True, "set_constant": False, "constant": 0.0} return {"copy_data": True, "set_constant": False, "constant": 0.0}
def init_output(self, input_list, set_constant, constant): def init_output(self, input_list, set_constant, constant):
inputs = [input[1].flatten() for input in input_list] inputs = []
output = np.concatenate(inputs) outputs = input_list
for input in input_list:
length = len(input[1].flatten())
aligned_len = (length + alignment) / alignment * alignment
out = np.zeros(int(aligned_len))
out[0:length] = input[1].flatten()
inputs.append(out)
alloc_continuous_space_var = np.concatenate([input for input in inputs])
if set_constant: if set_constant:
output = np.ones((len(output))) * constant alloc_continuous_space_var = np.ones(
return output (len(alloc_continuous_space_var))) * constant
outputs = [(out[0],
np.ones(out[1].shape).astype(self.dtype) * constant)
for out in outputs]
return outputs, alloc_continuous_space_var
def test_check_output(self): def test_check_output(self):
self.check_output() if core.is_compiled_with_cuda():
self.check_output_with_place(
place=core.CUDAPlace(0),
no_check_set=["FusedOutput"],
atol=1e-5)
class TestAllocContinuousSpace2(TestAllocContinuousSpace): class TestAllocContinuousSpace2(TestAllocContinuousSpace):
...@@ -67,7 +86,11 @@ class TestAllocContinuousSpace2(TestAllocContinuousSpace): ...@@ -67,7 +86,11 @@ class TestAllocContinuousSpace2(TestAllocContinuousSpace):
return {"copy_data": False, "set_constant": True, "constant": 0.5} return {"copy_data": False, "set_constant": True, "constant": 0.5}
def test_check_output(self): def test_check_output(self):
self.check_output(no_check_set=["Output"]) if core.is_compiled_with_cuda():
self.check_output_with_place(
place=core.CUDAPlace(0),
no_check_set=["FusedOutput"],
atol=1e-5)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -18,7 +18,7 @@ import numpy as np ...@@ -18,7 +18,7 @@ import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
class L1(fluid.imperative.Layer): class L1(fluid.dygraph.Layer):
def __init__(self, prefix): def __init__(self, prefix):
super(L1, self).__init__(prefix) super(L1, self).__init__(prefix)
self._param_attr = fluid.ParamAttr( self._param_attr = fluid.ParamAttr(
...@@ -32,7 +32,7 @@ class L1(fluid.imperative.Layer): ...@@ -32,7 +32,7 @@ class L1(fluid.imperative.Layer):
return self.w1 + self.w2 return self.w1 + self.w2
class L2(fluid.imperative.Layer): class L2(fluid.dygraph.Layer):
def __init__(self, prefix): def __init__(self, prefix):
super(L2, self).__init__(prefix) super(L2, self).__init__(prefix)
self.layer1 = L1(self.full_name()) self.layer1 = L1(self.full_name())
...@@ -42,7 +42,7 @@ class L2(fluid.imperative.Layer): ...@@ -42,7 +42,7 @@ class L2(fluid.imperative.Layer):
return self.layer1() + self.layer2() return self.layer1() + self.layer2()
class L3(fluid.imperative.Layer): class L3(fluid.dygraph.Layer):
def __init__(self, prefix): def __init__(self, prefix):
super(L3, self).__init__(prefix) super(L3, self).__init__(prefix)
self.layer1 = L2(self.full_name()) self.layer1 = L2(self.full_name())
...@@ -54,7 +54,7 @@ class L3(fluid.imperative.Layer): ...@@ -54,7 +54,7 @@ class L3(fluid.imperative.Layer):
class TestBaseLayer(unittest.TestCase): class TestBaseLayer(unittest.TestCase):
def test_one_level(self): def test_one_level(self):
with fluid.imperative.guard(): with fluid.dygraph.guard():
l = L1('test_one_level') l = L1('test_one_level')
ret = l() ret = l()
self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0") self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0")
...@@ -62,7 +62,7 @@ class TestBaseLayer(unittest.TestCase): ...@@ -62,7 +62,7 @@ class TestBaseLayer(unittest.TestCase):
self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2]))) self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
def test_three_level(self): def test_three_level(self):
with fluid.imperative.guard(): with fluid.dygraph.guard():
l = L3('test_three_level') l = L3('test_three_level')
names = [p.name for p in l.parameters()] names = [p.name for p in l.parameters()]
ret = l() ret = l()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest
import os
def simple_fc_net(use_feed):
    """Build a small fully connected classifier and return its mean loss.

    The network declares an ``image`` input (shape ``[784]``, float32) and a
    ``label`` input (shape ``[1]``, int64), stacks four 200-unit ReLU ``fc``
    layers whose biases are initialised to the constant 1.0, and finishes
    with a 10-way softmax ``fc`` layer.

    Args:
        use_feed: Not read by this function.

    Returns:
        The mean of the cross-entropy between the prediction and ``label``.
    """
    image = fluid.layers.data(name='image', shape=[784], dtype='float32')
    target = fluid.layers.data(name='label', shape=[1], dtype='int64')

    features = image
    for _layer in range(4):
        # A fresh attribute object is created for every layer, as before.
        ones_bias = fluid.ParamAttr(
            initializer=fluid.initializer.Constant(value=1.0))
        features = fluid.layers.fc(
            features, size=200, act='relu', bias_attr=ones_bias)

    probabilities = fluid.layers.fc(features, size=10, act='softmax')
    per_sample_loss = fluid.layers.cross_entropy(
        input=probabilities, label=target)
    return fluid.layers.mean(per_sample_loss)
def fc_with_batchnorm(use_feed):
    """Build a fully connected classifier with batch norm; return mean loss.

    The network declares an ``image`` input (shape ``[784]``, float32) and a
    ``label`` input (shape ``[1]``, int64). It then applies, twice, a
    200-unit ReLU ``fc`` layer (bias initialised to the constant 1.0)
    followed by ``batch_norm``, and finishes with a 10-way softmax ``fc``
    layer.

    Args:
        use_feed: Not read by this function.

    Returns:
        The mean of the cross-entropy between the prediction and ``label``.
    """
    image = fluid.layers.data(name='image', shape=[784], dtype='float32')
    target = fluid.layers.data(name='label', shape=[1], dtype='int64')

    features = image
    for _layer in range(2):
        # A fresh attribute object is created for every layer, as before.
        ones_bias = fluid.ParamAttr(
            initializer=fluid.initializer.Constant(value=1.0))
        dense = fluid.layers.fc(
            features, size=200, act='relu', bias_attr=ones_bias)
        features = fluid.layers.batch_norm(input=dense)

    probabilities = fluid.layers.fc(features, size=10, act='softmax')
    per_sample_loss = fluid.layers.cross_entropy(
        input=probabilities, label=target)
    return fluid.layers.mean(per_sample_loss)
class TestFuseAdamOps(TestParallelExecutorBase):
    """Check that fusing optimizer ops does not change the training losses.

    Each test trains the same model twice through
    ``check_network_convergence`` -- once with ``fuse_all_optimizer_ops``
    disabled and once with it enabled -- and compares the first and last
    losses the two runs report. The optimizer defaults to Adam.
    """

    @classmethod
    def setUpClass(cls):
        # NOTE(review): presumably the number of CPU devices used by the
        # non-CUDA runs -- confirm against the executor's handling of CPU_NUM.
        os.environ['CPU_NUM'] = str(4)

    def _init_data(self, random=True):
        """Return an ``(img, label)`` batch of 32 samples.

        Args:
            random: When true, ``img`` is drawn from ``np.random.random``
                after seeding NumPy with 5; otherwise ``img`` is all ones.

        Returns:
            ``img`` as a float32 array of shape ``[32, 784]`` and ``label``
            as an int64 array of ones of shape ``[32, 1]``.
        """
        np.random.seed(5)
        if random:
            img = np.random.random(size=[32, 784]).astype(np.float32)
        else:
            img = np.ones(shape=[32, 784], dtype='float32')
        label = np.ones(shape=[32, 1], dtype='int64')
        return img, label

    def _compare_fused_optimizer_ops(self,
                                     model,
                                     use_cuda,
                                     random_data=True,
                                     optimizer=fluid.optimizer.Adam):
        """Train ``model`` with and without optimizer fusion; compare losses.

        Args:
            model: Network-building callable passed to
                ``check_network_convergence``.
            use_cuda: Whether to run on CUDA. The comparison is silently
                skipped when CUDA is requested but Paddle was built
                without it.
            random_data: Forwarded to ``_init_data`` as ``random``.
            optimizer: Optimizer factory passed to
                ``check_network_convergence``.

        Raises:
            AssertionError: If any pair of corresponding losses differs by
                more than 1e-6.
        """
        if use_cuda and not core.is_compiled_with_cuda():
            return

        img, label = self._init_data(random_data)

        def run(fuse_all_optimizer_ops):
            # A new feed dict is built for every run, as in the original.
            return self.check_network_convergence(
                model,
                feed_dict={"image": img,
                           "label": label},
                use_cuda=use_cuda,
                fuse_all_optimizer_ops=fuse_all_optimizer_ops,
                # Keep gradient names from being changed on the Python side.
                memory_opt=False,
                optimizer=optimizer)

        not_fuse_op_first_loss, not_fuse_op_last_loss = run(False)
        fuse_op_first_loss, fuse_op_last_loss = run(True)

        # assertAlmostEqual replaces the deprecated alias assertAlmostEquals,
        # which no longer exists in Python 3.12+.
        for expected, actual in zip(not_fuse_op_first_loss,
                                    fuse_op_first_loss):
            self.assertAlmostEqual(expected, actual, delta=1e-6)

        for expected, actual in zip(not_fuse_op_last_loss, fuse_op_last_loss):
            self.assertAlmostEqual(expected, actual, delta=1e-6)

    def test_simple_fc_with_fuse_op(self):
        self._compare_fused_optimizer_ops(simple_fc_net, True)
        self._compare_fused_optimizer_ops(simple_fc_net, False)

    def test_batchnorm_fc_with_fuse_op(self):
        self._compare_fused_optimizer_ops(fc_with_batchnorm, True)
        # NOTE(review): the CPU variant was left disabled by the author;
        # the reason is not recorded here.
        # self._compare_fused_optimizer_ops(fc_with_batchnorm, False)
class TestFuseSGDOps(TestFuseAdamOps):
    """Repeat the fused-optimizer comparisons using SGD instead of Adam."""

    def sgd_optimizer(self, learning_rate=1e-4):
        """Return an SGD optimizer configured with ``learning_rate``."""
        sgd = fluid.optimizer.SGD(learning_rate=learning_rate)
        return sgd

    def test_simple_fc_with_fuse_op(self):
        # CUDA first, then CPU -- the same order as the explicit calls.
        for on_cuda in (True, False):
            self._compare_fused_optimizer_ops(
                simple_fc_net, on_cuda, optimizer=self.sgd_optimizer)

    def test_batchnorm_fc_with_fuse_op(self):
        # CUDA first, then CPU -- the same order as the explicit calls.
        for on_cuda in (True, False):
            self._compare_fused_optimizer_ops(
                fc_with_batchnorm, on_cuda, optimizer=self.sgd_optimizer)
# Run the test cases when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -156,7 +156,7 @@ class TestGRUOp(OpTest): ...@@ -156,7 +156,7 @@ class TestGRUOp(OpTest):
} }
def test_check_output(self): def test_check_output(self):
self.check_output(atol=1e-8, check_imperative=True) self.check_output(atol=1e-8, check_dygraph=True)
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
......
...@@ -18,11 +18,11 @@ import numpy as np ...@@ -18,11 +18,11 @@ import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.imperative.nn import FC from paddle.fluid.dygraph.nn import FC
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
class MyLayer(fluid.imperative.Layer): class MyLayer(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(MyLayer, self).__init__(name_scope) super(MyLayer, self).__init__(name_scope)
...@@ -34,7 +34,7 @@ class MyLayer(fluid.imperative.Layer): ...@@ -34,7 +34,7 @@ class MyLayer(fluid.imperative.Layer):
return [x] return [x]
class MyPyLayer(fluid.imperative.PyLayer): class MyPyLayer(fluid.dygraph.PyLayer):
def __init__(self): def __init__(self):
super(MyPyLayer, self).__init__() super(MyPyLayer, self).__init__()
...@@ -48,7 +48,7 @@ class MyPyLayer(fluid.imperative.PyLayer): ...@@ -48,7 +48,7 @@ class MyPyLayer(fluid.imperative.PyLayer):
return np.array(dout) * (1 - np.square(np.array(out))) return np.array(dout) * (1 - np.square(np.array(out)))
class MLP(fluid.imperative.Layer): class MLP(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(MLP, self).__init__(name_scope) super(MLP, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), self._fc1 = FC(self.full_name(),
...@@ -71,7 +71,7 @@ class MLP(fluid.imperative.Layer): ...@@ -71,7 +71,7 @@ class MLP(fluid.imperative.Layer):
return x return x
class SimpleRNNCell(fluid.imperative.Layer): class SimpleRNNCell(fluid.dygraph.Layer):
def __init__(self, name_scope, step_input_size, hidden_size, output_size, def __init__(self, name_scope, step_input_size, hidden_size, output_size,
param_attr): param_attr):
super(SimpleRNNCell, self).__init__(name_scope) super(SimpleRNNCell, self).__init__(name_scope)
...@@ -159,7 +159,7 @@ class SimpleRNNCell(fluid.imperative.Layer): ...@@ -159,7 +159,7 @@ class SimpleRNNCell(fluid.imperative.Layer):
return reduce_out, hidden return reduce_out, hidden
class SimpleRNN(fluid.imperative.Layer): class SimpleRNN(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(SimpleRNN, self).__init__(name_scope) super(SimpleRNN, self).__init__(name_scope)
self.seq_len = 4 self.seq_len = 4
...@@ -194,10 +194,10 @@ class SimpleRNN(fluid.imperative.Layer): ...@@ -194,10 +194,10 @@ class SimpleRNN(fluid.imperative.Layer):
class TestImperative(unittest.TestCase): class TestImperative(unittest.TestCase):
def test_sum_op(self): def test_sum_op(self):
x = np.ones([2, 2], np.float32) x = np.ones([2, 2], np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
inputs = [] inputs = []
for _ in range(10): for _ in range(10):
inputs.append(fluid.imperative.base.to_variable(x)) inputs.append(fluid.dygraph.base.to_variable(x))
ret = fluid.layers.sums(inputs) ret = fluid.layers.sums(inputs)
loss = fluid.layers.reduce_sum(ret) loss = fluid.layers.reduce_sum(ret)
loss._backward() loss._backward()
...@@ -205,17 +205,17 @@ class TestImperative(unittest.TestCase): ...@@ -205,17 +205,17 @@ class TestImperative(unittest.TestCase):
self.assertTrue(np.allclose(inputs[0]._gradient(), x)) self.assertTrue(np.allclose(inputs[0]._gradient(), x))
def test_layer(self): def test_layer(self):
with fluid.imperative.guard(): with fluid.dygraph.guard():
cl = core.Layer() cl = core.Layer()
cl.forward([]) cl.forward([])
l = fluid.imperative.Layer("l") l = fluid.dygraph.Layer("l")
self.assertRaises(NotImplementedError, l.forward, []) self.assertRaises(NotImplementedError, l.forward, [])
def test_pylayer_func_id(self): def test_pylayer_func_id(self):
with fluid.imperative.guard(): with fluid.dygraph.guard():
class PyLayer1(fluid.imperative.PyLayer): class PyLayer1(fluid.dygraph.PyLayer):
def __init__(self): def __init__(self):
super(PyLayer1, self).__init__() super(PyLayer1, self).__init__()
...@@ -227,7 +227,7 @@ class TestImperative(unittest.TestCase): ...@@ -227,7 +227,7 @@ class TestImperative(unittest.TestCase):
def backward(input): def backward(input):
return input return input
class PyLayer2(fluid.imperative.PyLayer): class PyLayer2(fluid.dygraph.PyLayer):
def __init__(self): def __init__(self):
super(PyLayer2, self).__init__() super(PyLayer2, self).__init__()
...@@ -241,21 +241,21 @@ class TestImperative(unittest.TestCase): ...@@ -241,21 +241,21 @@ class TestImperative(unittest.TestCase):
py_layer_1 = PyLayer1() py_layer_1 = PyLayer1()
py_layer_2 = PyLayer2() py_layer_2 = PyLayer2()
py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2]))) py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2])))
py_layer_2(fluid.imperative.base.to_variable(np.ones([2, 2]))) py_layer_2(fluid.dygraph.base.to_variable(np.ones([2, 2])))
id = py_layer_1.forward_id id = py_layer_1.forward_id
self.assertGreater(id, 0) self.assertGreater(id, 0)
self.assertEqual(py_layer_1.backward_id, id + 1) self.assertEqual(py_layer_1.backward_id, id + 1)
self.assertEqual(py_layer_2.forward_id, id + 2) self.assertEqual(py_layer_2.forward_id, id + 2)
self.assertEqual(py_layer_2.backward_id, id + 3) self.assertEqual(py_layer_2.backward_id, id + 3)
py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2]))) py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2])))
self.assertEqual(py_layer_1.forward_id, id) self.assertEqual(py_layer_1.forward_id, id)
def test_pylayer(self): def test_pylayer(self):
np_inp = np.ones([2, 2], np.float32) np_inp = np.ones([2, 2], np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
my_py_layer = MyPyLayer() my_py_layer = MyPyLayer()
var_inp = fluid.imperative.base.to_variable(np_inp) var_inp = fluid.dygraph.base.to_variable(np_inp)
outs = my_py_layer(var_inp) outs = my_py_layer(var_inp)
dy_out = np.sum(outs[0]._numpy()) dy_out = np.sum(outs[0]._numpy())
outs[0]._backward() outs[0]._backward()
...@@ -282,8 +282,8 @@ class TestImperative(unittest.TestCase): ...@@ -282,8 +282,8 @@ class TestImperative(unittest.TestCase):
def test_layer_in_out(self): def test_layer_in_out(self):
np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
var_inp = fluid.imperative.base.to_variable(np_inp) var_inp = fluid.dygraph.base.to_variable(np_inp)
l = MyLayer("my_layer") l = MyLayer("my_layer")
x = l(var_inp)[0] x = l(var_inp)[0]
self.assertIsNotNone(x) self.assertIsNotNone(x)
...@@ -310,8 +310,8 @@ class TestImperative(unittest.TestCase): ...@@ -310,8 +310,8 @@ class TestImperative(unittest.TestCase):
def test_mlp(self): def test_mlp(self):
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
var_inp = fluid.imperative.base.to_variable(np_inp) var_inp = fluid.dygraph.base.to_variable(np_inp)
mlp = MLP("mlp") mlp = MLP("mlp")
out = mlp(var_inp) out = mlp(var_inp)
dy_out = out._numpy() dy_out = out._numpy()
...@@ -353,8 +353,8 @@ class TestImperative(unittest.TestCase): ...@@ -353,8 +353,8 @@ class TestImperative(unittest.TestCase):
[10.0, 11.0, 12.0]]) [10.0, 11.0, 12.0]])
np_inp = np_inp.reshape((1, 4, 3)) np_inp = np_inp.reshape((1, 4, 3))
np_inp = np_inp.astype(np.float32) np_inp = np_inp.astype(np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
var_inp = fluid.imperative.base.to_variable(np_inp) var_inp = fluid.dygraph.base.to_variable(np_inp)
var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
simple_rnn = SimpleRNN("simple_rnn") simple_rnn = SimpleRNN("simple_rnn")
outs, pre_hiddens = simple_rnn.forward(var_inp) outs, pre_hiddens = simple_rnn.forward(var_inp)
......
...@@ -18,11 +18,11 @@ import numpy as np ...@@ -18,11 +18,11 @@ import numpy as np
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
class SimpleImgConvPool(fluid.imperative.Layer): class SimpleImgConvPool(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
name_scope, name_scope,
num_channels, num_channels,
...@@ -71,7 +71,7 @@ class SimpleImgConvPool(fluid.imperative.Layer): ...@@ -71,7 +71,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):
return x return x
class MNIST(fluid.imperative.Layer): class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope) super(MNIST, self).__init__(name_scope)
...@@ -98,12 +98,12 @@ class MNIST(fluid.imperative.Layer): ...@@ -98,12 +98,12 @@ class MNIST(fluid.imperative.Layer):
return x return x
class TestImperativeCheckpoint(unittest.TestCase): class TestDygraphCheckpoint(unittest.TestCase):
def save_load_persistables(self): def save_load_persistables(self):
seed = 90 seed = 90
epoch_num = 1 epoch_num = 1
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
...@@ -135,14 +135,14 @@ class TestImperativeCheckpoint(unittest.TestCase): ...@@ -135,14 +135,14 @@ class TestImperativeCheckpoint(unittest.TestCase):
avg_loss._backward() avg_loss._backward()
sgd.minimize(avg_loss) sgd.minimize(avg_loss)
fluid.imperative.save_persistables(mnist, "save_dir") fluid.dygraph.save_persistables(mnist, "save_dir")
mnist.clear_gradients() mnist.clear_gradients()
for param in mnist.parameters(): for param in mnist.parameters():
dy_param_init_value[param.name] = param._numpy() dy_param_init_value[param.name] = param._numpy()
mnist.load_dict( mnist.load_dict(
fluid.imperative.load_persistables(mnist, "save_dir")) fluid.dygraph.load_persistables(mnist, "save_dir"))
restore = mnist.parameters() restore = mnist.parameters()
......
...@@ -22,7 +22,7 @@ import paddle ...@@ -22,7 +22,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
# Can use Amusic dataset as the DeepCF describes. # Can use Amusic dataset as the DeepCF describes.
DATA_PATH = os.environ.get('DATA_PATH', '') DATA_PATH = os.environ.get('DATA_PATH', '')
...@@ -32,11 +32,11 @@ NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5)) ...@@ -32,11 +32,11 @@ NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5))
NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1)) NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))
class DMF(fluid.imperative.Layer): class DMF(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(DMF, self).__init__(name_scope) super(DMF, self).__init__(name_scope)
self._user_latent = fluid.imperative.FC(self.full_name(), 256) self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
self._item_latent = fluid.imperative.FC(self.full_name(), 256) self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
self._user_layers = [] self._user_layers = []
self._item_layers = [] self._item_layers = []
...@@ -45,12 +45,12 @@ class DMF(fluid.imperative.Layer): ...@@ -45,12 +45,12 @@ class DMF(fluid.imperative.Layer):
self._user_layers.append( self._user_layers.append(
self.add_sublayer( self.add_sublayer(
'user_layer_%d' % i, 'user_layer_%d' % i,
fluid.imperative.FC( fluid.dygraph.FC(
self.full_name(), self._hid_sizes[i], act='relu'))) self.full_name(), self._hid_sizes[i], act='relu')))
self._item_layers.append( self._item_layers.append(
self.add_sublayer( self.add_sublayer(
'item_layer_%d' % i, 'item_layer_%d' % i,
fluid.imperative.FC( fluid.dygraph.FC(
self.full_name(), self._hid_sizes[i], act='relu'))) self.full_name(), self._hid_sizes[i], act='relu')))
def forward(self, users, items): def forward(self, users, items):
...@@ -63,18 +63,18 @@ class DMF(fluid.imperative.Layer): ...@@ -63,18 +63,18 @@ class DMF(fluid.imperative.Layer):
return fluid.layers.elementwise_mul(users, items) return fluid.layers.elementwise_mul(users, items)
class MLP(fluid.imperative.Layer): class MLP(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(MLP, self).__init__(name_scope) super(MLP, self).__init__(name_scope)
self._user_latent = fluid.imperative.FC(self.full_name(), 256) self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
self._item_latent = fluid.imperative.FC(self.full_name(), 256) self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
self._match_layers = [] self._match_layers = []
self._hid_sizes = [128, 64] self._hid_sizes = [128, 64]
for i in range(len(self._hid_sizes)): for i in range(len(self._hid_sizes)):
self._match_layers.append( self._match_layers.append(
self.add_sublayer( self.add_sublayer(
'match_layer_%d' % i, 'match_layer_%d' % i,
fluid.imperative.FC( fluid.dygraph.FC(
self.full_name(), self._hid_sizes[i], act='relu'))) self.full_name(), self._hid_sizes[i], act='relu')))
self._mat self._mat
...@@ -88,7 +88,7 @@ class MLP(fluid.imperative.Layer): ...@@ -88,7 +88,7 @@ class MLP(fluid.imperative.Layer):
return match_vec return match_vec
class DeepCF(fluid.imperative.Layer): class DeepCF(fluid.dygraph.Layer):
def __init__(self, name_scope, num_users, num_items, matrix): def __init__(self, name_scope, num_users, num_items, matrix):
super(DeepCF, self).__init__(name_scope) super(DeepCF, self).__init__(name_scope)
self._num_users = num_users self._num_users = num_users
...@@ -103,7 +103,7 @@ class DeepCF(fluid.imperative.Layer): ...@@ -103,7 +103,7 @@ class DeepCF(fluid.imperative.Layer):
self._mlp = MLP(self.full_name()) self._mlp = MLP(self.full_name())
self._dmf = DMF(self.full_name()) self._dmf = DMF(self.full_name())
self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid') self._match_fc = fluid.dygraph.FC(self.full_name(), 1, act='sigmoid')
def forward(self, users, items): def forward(self, users, items):
# users_emb = self._user_emb(users) # users_emb = self._user_emb(users)
...@@ -191,7 +191,7 @@ def load_data(DATA_PATH): ...@@ -191,7 +191,7 @@ def load_data(DATA_PATH):
np.expand_dims(labels_np, -1), num_users, num_items, matrix np.expand_dims(labels_np, -1), num_users, num_items, matrix
class TestImperativeDeepCF(unittest.TestCase): class TestDygraphDeepCF(unittest.TestCase):
def test_deefcf(self): def test_deefcf(self):
seed = 90 seed = 90
if DATA_PATH: if DATA_PATH:
...@@ -237,7 +237,7 @@ class TestImperativeDeepCF(unittest.TestCase): ...@@ -237,7 +237,7 @@ class TestImperativeDeepCF(unittest.TestCase):
fetch_list=[loss])[0] fetch_list=[loss])[0]
sys.stderr.write('static loss %s\n' % static_loss) sys.stderr.write('static loss %s\n' % static_loss)
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
......
...@@ -22,12 +22,12 @@ import paddle ...@@ -22,12 +22,12 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
class Discriminator(fluid.imperative.Layer): class Discriminator(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(Discriminator, self).__init__(name_scope) super(Discriminator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=32, act='elu') self._fc1 = FC(self.full_name(), size=32, act='elu')
...@@ -38,7 +38,7 @@ class Discriminator(fluid.imperative.Layer): ...@@ -38,7 +38,7 @@ class Discriminator(fluid.imperative.Layer):
return self._fc2(x) return self._fc2(x)
class Generator(fluid.imperative.Layer): class Generator(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(Generator, self).__init__(name_scope) super(Generator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=64, act='elu') self._fc1 = FC(self.full_name(), size=64, act='elu')
...@@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer): ...@@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer):
return self._fc3(x) return self._fc3(x)
class TestImperativeGAN(unittest.TestCase): class TestDygraphGAN(unittest.TestCase):
def test_gan_float32(self): def test_gan_float32(self):
seed = 90 seed = 90
...@@ -130,7 +130,7 @@ class TestImperativeGAN(unittest.TestCase): ...@@ -130,7 +130,7 @@ class TestImperativeGAN(unittest.TestCase):
scope.find_var(param.name).get_tensor()) scope.find_var(param.name).get_tensor())
dy_params = dict() dy_params = dict()
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
......
...@@ -22,16 +22,16 @@ import paddle ...@@ -22,16 +22,16 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
def gen_data(): def gen_data():
pass pass
class GraphConv(fluid.imperative.Layer): class GraphConv(fluid.dygraph.Layer):
def __init__(self, name_scope, in_features, out_features): def __init__(self, name_scope, in_features, out_features):
super(GraphConv, self).__init__(name_scope) super(GraphConv, self).__init__(name_scope)
...@@ -50,7 +50,7 @@ class GraphConv(fluid.imperative.Layer): ...@@ -50,7 +50,7 @@ class GraphConv(fluid.imperative.Layer):
return fluid.layers.matmul(adj, support) + self.bias return fluid.layers.matmul(adj, support) + self.bias
class GCN(fluid.imperative.Layer): class GCN(fluid.dygraph.Layer):
def __init__(self, name_scope, num_hidden): def __init__(self, name_scope, num_hidden):
super(GCN, self).__init__(name_scope) super(GCN, self).__init__(name_scope)
self.gc = GraphConv(self.full_name(), num_hidden, 32) self.gc = GraphConv(self.full_name(), num_hidden, 32)
...@@ -61,7 +61,7 @@ class GCN(fluid.imperative.Layer): ...@@ -61,7 +61,7 @@ class GCN(fluid.imperative.Layer):
return self.gc2(x, adj) return self.gc2(x, adj)
class TestImperativeGNN(unittest.TestCase): class TestDygraphGNN(unittest.TestCase):
def test_gnn_float32(self): def test_gnn_float32(self):
seed = 90 seed = 90
...@@ -115,7 +115,7 @@ class TestImperativeGNN(unittest.TestCase): ...@@ -115,7 +115,7 @@ class TestImperativeGNN(unittest.TestCase):
static_weight = np.array( static_weight = np.array(
scope.find_var(model.gc.weight.name).get_tensor()) scope.find_var(model.gc.weight.name).get_tensor())
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
......
...@@ -23,12 +23,12 @@ import paddle ...@@ -23,12 +23,12 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
class SimpleImgConvPool(fluid.imperative.Layer): class SimpleImgConvPool(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
name_scope, name_scope,
num_channels, num_channels,
...@@ -77,7 +77,7 @@ class SimpleImgConvPool(fluid.imperative.Layer): ...@@ -77,7 +77,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):
return x return x
class MNIST(fluid.imperative.Layer): class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope) super(MNIST, self).__init__(name_scope)
...@@ -104,11 +104,11 @@ class MNIST(fluid.imperative.Layer): ...@@ -104,11 +104,11 @@ class MNIST(fluid.imperative.Layer):
return x return x
class TestImperativeMnist(unittest.TestCase): class TestDygraphMnist(unittest.TestCase):
def test_mnist_float32(self): def test_mnist_float32(self):
seed = 90 seed = 90
epoch_num = 1 epoch_num = 1
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
......
...@@ -16,17 +16,17 @@ from __future__ import print_function ...@@ -16,17 +16,17 @@ from __future__ import print_function
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.imperative.nn import Embedding from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
import numpy as np import numpy as np
import six import six
from paddle.fluid.backward import append_backward from paddle.fluid.backward import append_backward
class SimpleLSTMRNN(fluid.imperative.Layer): class SimpleLSTMRNN(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
name_scope, name_scope,
hidden_size, hidden_size,
...@@ -131,7 +131,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer): ...@@ -131,7 +131,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
return real_res, last_hidden, last_cell return real_res, last_hidden, last_cell
class PtbModel(fluid.imperative.Layer): class PtbModel(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
name_scope, name_scope,
hidden_size, hidden_size,
...@@ -214,7 +214,7 @@ class PtbModel(fluid.imperative.Layer): ...@@ -214,7 +214,7 @@ class PtbModel(fluid.imperative.Layer):
return loss, last_hidden, last_cell return loss, last_hidden, last_cell
class TestImperativePtbRnn(unittest.TestCase): class TestDygraphPtbRnn(unittest.TestCase):
def test_ptb_rnn_cpu_float32(self): def test_ptb_rnn_cpu_float32(self):
seed = 90 seed = 90
hidden_size = 10 hidden_size = 10
...@@ -224,7 +224,7 @@ class TestImperativePtbRnn(unittest.TestCase): ...@@ -224,7 +224,7 @@ class TestImperativePtbRnn(unittest.TestCase):
init_scale = 0.1 init_scale = 0.1
batch_size = 4 batch_size = 4
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to # TODO: marsyang1993 Change seed to
......
...@@ -21,8 +21,8 @@ import paddle ...@@ -21,8 +21,8 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
batch_size = 8 batch_size = 8
...@@ -57,7 +57,7 @@ def optimizer_setting(params): ...@@ -57,7 +57,7 @@ def optimizer_setting(params):
lr = [] lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01) optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# TODO(minqiyang): Add learning rate scheduler support to imperative mode # TODO(minqiyang): Add learning rate scheduler support to dygraph mode
# optimizer = fluid.optimizer.Momentum( # optimizer = fluid.optimizer.Momentum(
# learning_rate=params["lr"], # learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay( # learning_rate=fluid.layers.piecewise_decay(
...@@ -68,7 +68,7 @@ def optimizer_setting(params): ...@@ -68,7 +68,7 @@ def optimizer_setting(params):
return optimizer return optimizer
class ConvBNLayer(fluid.imperative.Layer): class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
name_scope, name_scope,
num_channels, num_channels,
...@@ -99,7 +99,7 @@ class ConvBNLayer(fluid.imperative.Layer): ...@@ -99,7 +99,7 @@ class ConvBNLayer(fluid.imperative.Layer):
return y return y
class BottleneckBlock(fluid.imperative.Layer): class BottleneckBlock(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
name_scope, name_scope,
num_channels, num_channels,
...@@ -156,7 +156,7 @@ class BottleneckBlock(fluid.imperative.Layer): ...@@ -156,7 +156,7 @@ class BottleneckBlock(fluid.imperative.Layer):
return layer_helper.append_activation(y) return layer_helper.append_activation(y)
class ResNet(fluid.imperative.Layer): class ResNet(fluid.dygraph.Layer):
def __init__(self, name_scope, layers=50, class_dim=102): def __init__(self, name_scope, layers=50, class_dim=102):
super(ResNet, self).__init__(name_scope) super(ResNet, self).__init__(name_scope)
...@@ -226,13 +226,13 @@ class ResNet(fluid.imperative.Layer): ...@@ -226,13 +226,13 @@ class ResNet(fluid.imperative.Layer):
return y return y
class TestImperativeResnet(unittest.TestCase): class TestDygraphResnet(unittest.TestCase):
def test_resnet_float32(self): def test_resnet_float32(self):
seed = 90 seed = 90
batch_size = train_parameters["batch_size"] batch_size = train_parameters["batch_size"]
batch_num = 20 batch_num = 20
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.imperative import Embedding, LayerNorm, FC, to_variable, Layer, guard from paddle.fluid.dygraph import Embedding, LayerNorm, FC, to_variable, Layer, guard
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from paddle.fluid import core from paddle.fluid import core
import numpy as np import numpy as np
...@@ -623,7 +623,7 @@ class PrepareEncoderDecoderLayer(Layer): ...@@ -623,7 +623,7 @@ class PrepareEncoderDecoderLayer(Layer):
initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
trainable=False)) trainable=False))
# use in imperative_mode to fit different length batch # use in dygraph_mode to fit different length batch
# self._pos_emb._w = to_variable( # self._pos_emb._w = to_variable(
# position_encoding_init(self._src_max_len, self._src_emb_dim)) # position_encoding_init(self._src_max_len, self._src_emb_dim))
...@@ -946,7 +946,7 @@ class TransFormer(Layer): ...@@ -946,7 +946,7 @@ class TransFormer(Layer):
return sum_cost, avg_cost, predict, token_num return sum_cost, avg_cost, predict, token_num
class TestImperativeTransformer(unittest.TestCase): class TestDygraphTransformer(unittest.TestCase):
def test_transformer_float32(self): def test_transformer_float32(self):
seed = 90 seed = 90
with guard(): with guard():
......
...@@ -29,8 +29,8 @@ from paddle.fluid import core ...@@ -29,8 +29,8 @@ from paddle.fluid import core
from paddle.fluid.initializer import Constant from paddle.fluid.initializer import Constant
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from paddle.fluid.imperative import nn from paddle.fluid.dygraph import nn
from paddle.fluid.imperative import base from paddle.fluid.dygraph import base
class LayerTest(unittest.TestCase): class LayerTest(unittest.TestCase):
...@@ -68,7 +68,7 @@ class LayerTest(unittest.TestCase): ...@@ -68,7 +68,7 @@ class LayerTest(unittest.TestCase):
@contextlib.contextmanager @contextlib.contextmanager
def dynamic_graph(self, force_to_use_cpu=False): def dynamic_graph(self, force_to_use_cpu=False):
with fluid.imperative.guard( with fluid.dygraph.guard(
self._get_place(force_to_use_cpu=force_to_use_cpu)): self._get_place(force_to_use_cpu=force_to_use_cpu)):
fluid.default_startup_program().random_seed = self.seed fluid.default_startup_program().random_seed = self.seed
fluid.default_main_program().random_seed = self.seed fluid.default_main_program().random_seed = self.seed
......
...@@ -61,6 +61,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ...@@ -61,6 +61,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name=embedding_name, trainable=False)) for x in word_input name=embedding_name, trainable=False)) for x in word_input
] ]
# TODO(zcd): if the parameter is not trainable, the
# parameter's gradient should not be generated.
for emb_layer in emb_layers:
emb_layer.stop_gradient = True
emb_layers.append(predicate_embedding) emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding) emb_layers.append(mark_embedding)
...@@ -113,60 +118,62 @@ class TestCRFModel(unittest.TestCase): ...@@ -113,60 +118,62 @@ class TestCRFModel(unittest.TestCase):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): scope = fluid.Scope()
word = fluid.layers.data( with fluid.scope_guard(scope):
name='word_data', shape=[1], dtype='int64', lod_level=1) with fluid.program_guard(main, startup):
predicate = fluid.layers.data( word = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1) name='word_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data( predicate = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data( ctx_n2 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data( ctx_n1 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data( ctx_0 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data( ctx_p1 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data( ctx_p2 = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1) name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
feature_out = db_lstm(**locals()) name='mark_data', shape=[1], dtype='int64', lod_level=1)
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1) feature_out = db_lstm(**locals())
crf_cost = fluid.layers.linear_chain_crf( target = fluid.layers.data(
input=feature_out, name='target', shape=[1], dtype='int64', lod_level=1)
label=target, crf_cost = fluid.layers.linear_chain_crf(
param_attr=fluid.ParamAttr( input=feature_out,
name='crfw', learning_rate=1e-1)) label=target,
avg_cost = fluid.layers.mean(crf_cost) param_attr=fluid.ParamAttr(
name='crfw', learning_rate=1e-1))
sgd_optimizer = fluid.optimizer.SGD( avg_cost = fluid.layers.mean(crf_cost)
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.01, sgd_optimizer = fluid.optimizer.SGD(
decay_steps=100000, learning_rate=fluid.layers.exponential_decay(
decay_rate=0.5, learning_rate=0.01,
staircase=True)) decay_steps=100000,
sgd_optimizer.minimize(avg_cost) decay_rate=0.5,
staircase=True))
train_data = paddle.batch( sgd_optimizer.minimize(avg_cost)
paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192), train_data = paddle.batch(
batch_size=16) paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192),
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() batch_size=16)
exe = fluid.Executor(place)
exe.run(startup) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
train_cp = compiler.CompiledProgram(main).with_data_parallel( exe.run(startup)
loss_name=avg_cost.name, build_strategy=build_strategy)
train_cp = compiler.CompiledProgram(main).with_data_parallel(
feeder = fluid.DataFeeder( loss_name=avg_cost.name, build_strategy=build_strategy)
feed_list=[
word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, feeder = fluid.DataFeeder(
mark, target feed_list=[
], word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
place=fluid.CPUPlace()) mark, target
],
place=fluid.CPUPlace())
data = train_data() data = train_data()
for i in range(10): for i in range(10):
......
...@@ -41,14 +41,15 @@ class TestBase(unittest.TestCase): ...@@ -41,14 +41,15 @@ class TestBase(unittest.TestCase):
fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()) fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace())
exe.run(startup_prog) exe.run(startup_prog)
for _ in six.moves.xrange(iter): exe_strategy = fluid.ExecutionStrategy()
exe_strategy = fluid.ExecutionStrategy() exe_strategy._dry_run = True
exe_strategy._dry_run = True exe_strategy.use_experimental_executor = use_experimental_executor
exe_strategy.use_experimental_executor = use_experimental_executor train_cp = compiler.CompiledProgram(
train_cp = compiler.CompiledProgram(main_prog).with_data_parallel( main_prog).with_data_parallel(
loss_name=loss.name, exec_strategy=exe_strategy) loss_name=loss.name, exec_strategy=exe_strategy)
for _ in six.moves.xrange(iter_per_pe): for _ in six.moves.xrange(iter):
exe.run(train_cp) for _ in six.moves.xrange(iter_per_pe):
exe.run(train_cp)
class TestMNISTDryRun(TestBase): class TestMNISTDryRun(TestBase):
......
...@@ -19,7 +19,6 @@ from paddle.fluid.framework import default_main_program, Program, convert_np_dty ...@@ -19,7 +19,6 @@ from paddle.fluid.framework import default_main_program, Program, convert_np_dty
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy as np import numpy as np
from test_imperative_base import new_program_scope
class TestVariable(unittest.TestCase): class TestVariable(unittest.TestCase):
...@@ -153,7 +152,7 @@ class TestVariableImperative(unittest.TestCase): ...@@ -153,7 +152,7 @@ class TestVariableImperative(unittest.TestCase):
self.assertEqual([1, 1, 100], nw.shape) self.assertEqual([1, 1, 100], nw.shape)
def test_slice(self): def test_slice(self):
with fluid.imperative.guard(): with fluid.dygraph.guard():
self._test_slice() self._test_slice()
......
...@@ -102,7 +102,7 @@ packages=['paddle', ...@@ -102,7 +102,7 @@ packages=['paddle',
'paddle.reader', 'paddle.reader',
'paddle.distributed', 'paddle.distributed',
'paddle.fluid', 'paddle.fluid',
'paddle.fluid.imperative', 'paddle.fluid.dygraph',
'paddle.fluid.proto', 'paddle.fluid.proto',
'paddle.fluid.proto.profiler', 'paddle.fluid.proto.profiler',
'paddle.fluid.distributed', 'paddle.fluid.distributed',
......
...@@ -28,7 +28,7 @@ import hashlib ...@@ -28,7 +28,7 @@ import hashlib
member_dict = collections.OrderedDict() member_dict = collections.OrderedDict()
experimental_namespace = {"paddle.fluid.imperative"} experimental_namespace = {"paddle.fluid.dygraph"}
def md5(doc): def md5(doc):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册