Commit d231e550 authored by S sneaxiy

merge develop

test=develop
@@ -118,9 +118,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon',
 paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
 paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
 paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
-paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
+paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None))
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
-paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -178,6 +179,7 @@ paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], vara
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
+paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
@@ -200,6 +202,7 @@ paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'],
 paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None))
 paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
 paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,))
......
@@ -18,8 +18,8 @@ namespace framework {
 void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
                      Tensor *out) {
-  VLOG(3) << "DeviceTransform in, src_place " << in.place()
-          << " dst_place: " << dst_place;
+  VLOG(30) << "DeviceTransform in, src_place " << in.place()
+           << " dst_place: " << dst_place;
   PADDLE_ENFORCE_NE(
       in.place().which(), dst_place.which(),
......
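The pattern in this and every C++ hunk below is uniform: each `VLOG(n)` level is rescaled to `VLOG(10*n)`. Under glog-style verbose logging, which Paddle uses, a `VLOG(n)` message is emitted only when the runtime verbosity `FLAGS_v` (the `--v` flag) is at least `n`, so the rescaling spreads the levels out without reordering them. A minimal sketch of that gating, assuming standard glog; `demo()` is a hypothetical function for illustration:

```cpp
#include <glog/logging.h>

// Hypothetical illustration of glog's gating rule: VLOG(n) prints only
// when FLAGS_v >= n. After this commit, "--v=3" no longer enables the
// first message below; "--v=30" (or higher) is required instead.
void demo() {
  VLOG(30) << "emitted when FLAGS_v >= 30 (was VLOG(3) before rescaling)";
  VLOG(100) << "emitted when FLAGS_v >= 100 (was VLOG(10))";
}

int main(int argc, char* argv[]) {
  FLAGS_v = 30;  // equivalent to passing --v=30 on the command line
  google::InitGoogleLogging(argv[0]);
  demo();  // prints the first message, suppresses the second
  return 0;
}
```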
@@ -49,10 +49,10 @@ class TestOpWithKernel : public OperatorWithKernel {
   OpKernelType GetExpectedKernelType(
       const ExecutionContext& ctx) const override {
     if (Attr<bool>("use_gpu")) {
-      VLOG(3) << "force use gpu kernel";
+      VLOG(30) << "force use gpu kernel";
       return OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0));
     } else {
-      VLOG(3) << "use default kernel";
+      VLOG(30) << "use default kernel";
       return OpKernelType(proto::VarType::FP32,
                           ctx.Input<Tensor>("input")->place());
     }
@@ -148,7 +148,7 @@ TEST(Operator, CPUtoGPU) {
   // get output
   auto* output2 = scope.Var("OUT2");
   gpu_op->Run(scope, cuda_place);
-  VLOG(3) << "after gpu_op run";
+  VLOG(30) << "after gpu_op run";
   // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
   paddle::platform::DeviceContextPool& pool =
......
@@ -60,7 +60,7 @@ void BroadcastOpHandle::BroadcastOneVar(
   PADDLE_ENFORCE_NOT_NULL(in_var);
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
   if (UNLIKELY(!in_tensor.IsInitialized())) {
-    VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
+    VLOG(30) << "in var " << in_var_handle.name_ << "not inited, return!";
     return;
   }
......
@@ -45,8 +45,8 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
         IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view);
     compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free);
     if (is_lock_and_record_event_free) {
-      VLOG(10) << "Set is_lock_and_record_event_free be true in op "
-               << compute_op->DebugString();
+      VLOG(100) << "Set is_lock_and_record_event_free be true in op "
+                << compute_op->DebugString();
     }
   }
   return ir_graph;
......
@@ -399,7 +399,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
       for (size_t i = 0; i < backward_vars.size(); i += 2) {
         auto &p_name = backward_vars[i];
         auto &g_name = backward_vars[i + 1];
-        VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
+        VLOG(100) << "Bcast " << g_name << " for parameter " << p_name;
         switch (strategy_.reduce_) {
           case BuildStrategy::ReduceStrategy::kReduce:
@@ -809,8 +809,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
        node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
    PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
    op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
-    VLOG(10) << "send grad " << input_var_names[0] << " origin "
-             << send_param_grad[1] << " place: " << op_dev_id;
+    VLOG(100) << "send grad " << input_var_names[0] << " origin "
+              << send_param_grad[1] << " place: " << op_dev_id;
    for (auto &varname : input_var_names) {
      sharded_var_device->emplace(varname, op_dev_id);
    }
@@ -826,9 +826,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
    if (recv_param_grad.size() == 2U) {
      op_dev_id =
          GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device);
-      VLOG(10) << "recv param " << recv_param_grad[0]
-               << " get grad place: " << recv_param_grad[1]
-               << " place: " << op_dev_id;
+      VLOG(100) << "recv param " << recv_param_grad[0]
+                << " get grad place: " << recv_param_grad[1]
+                << " place: " << op_dev_id;
    } else {
      op_dev_id = GetAppropriateDeviceID(output_var_names);
    }
......
@@ -140,8 +140,8 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
       if (next_compute_op != nullptr) {
         if (compute_ref_cnt_map.count(next_compute_op)) {
           compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
-          VLOG(5) << "Add reference count of " << var_name << " to Operator "
-                  << next_compute_op->Name();
+          VLOG(50) << "Add reference count of " << var_name << " to Operator "
+                   << next_compute_op->Name();
         } else {
           // Create new reference_count_op_handle
           ir::Node *ref_cnt_node = graph->CreateEmptyNode(
......
@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
                       ->stream();
     memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                  platform::CPUPlace(), &coeff_, sizeof(float), stream);
-    VLOG(10) << place_ << "RUN Scale loss grad op";
+    VLOG(100) << place_ << "RUN Scale loss grad op";
   });
 #endif
 }
......
@@ -94,8 +94,8 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
     op_node_list[i - 1]->outputs.push_back(dep_var);
     dep_var->outputs.push_back(op_node_list[i]);
     dep_var->inputs.push_back(op_node_list[i - 1]);
-    VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
-             << " and " << op_node_list[i]->Name();
+    VLOG(100) << "Add dependencies between " << op_node_list[i - 1]->Name()
+              << " and " << op_node_list[i]->Name();
   }
   return graph;
 }
......
@@ -210,16 +210,16 @@ void ThreadedSSAGraphExecutor::RunOp(
     details::OpHandleBase *op) {
   auto op_run = [ready_var_q, op, this] {
     try {
-      if (VLOG_IS_ON(10)) {
-        VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      if (VLOG_IS_ON(100)) {
+        VLOG(100) << op << " " << op->Name() << " : " << op->DebugString();
       }
       if (LIKELY(!strategy_.dry_run_)) {
         op->Run(strategy_.use_cuda_);
       }
-      VLOG(10) << op << " " << op->Name() << " Done ";
+      VLOG(100) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
-      VLOG(10) << op << " " << op->Name() << "Signal posted";
+      VLOG(100) << op << " " << op->Name() << "Signal posted";
     } catch (...) {
       exception_holder_.Catch(std::current_exception());
     }
......
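Several hunks, like the one above, also rescale the `VLOG_IS_ON(10)` guard to `VLOG_IS_ON(100)`. The macro exposes the same verbosity check that `VLOG` performs internally; it earns its keep when the message takes several statements to assemble (as in `OpDesc::InferShape` further down, where an `std::ostringstream` is filled before logging), since the whole block is skipped at lower verbosity. A minimal sketch of the idiom, assuming glog; the function and variable names here are illustrative, not Paddle's:

```cpp
#include <glog/logging.h>

#include <sstream>
#include <vector>

// Build a costly, multi-statement log message only when verbose logging
// is actually enabled; otherwise the string assembly never runs.
void LogReadyOps(const std::vector<int>& ready_op_ids) {
  if (VLOG_IS_ON(100)) {  // true only when FLAGS_v >= 100
    std::ostringstream sout;
    for (int id : ready_op_ids) sout << id << ", ";
    VLOG(100) << "ready ops: [" << sout.str() << "]";
  }
}
```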
@@ -43,7 +43,7 @@ ExecutorPrepareContext::ExecutorPrepareContext(
 }
 ExecutorPrepareContext::~ExecutorPrepareContext() {
-  VLOG(5) << "destroy ExecutorPrepareContext";
+  VLOG(50) << "destroy ExecutorPrepareContext";
 }
 template <typename RefCntMap>
@@ -60,7 +60,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
       if ((it->second)-- == 1) {
         auto* var = scope.FindVar(name);
         if (var != nullptr) {
-          VLOG(10) << "Erase tensor \'" << name << "\'";
+          VLOG(100) << "Erase tensor \'" << name << "\'";
           if (var->IsType<LoDTensor>()) {
             erase_tensors.insert(var->GetMutable<LoDTensor>());
           } else if (var->IsType<SelectedRows>()) {
@@ -141,21 +141,21 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
       if (var->Persistable()) {
         auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
         InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " global, which pointer is " << ptr;
       } else {
         auto* ptr = scope->Var(var->Name());
         InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " locally, which pointer is " << ptr;
       }
     }
   } else {
     for (auto& var : global_block.AllVars()) {
       auto* ptr = scope->Var(var->Name());
       InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
+      VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
+               << ptr;
     }
   }
 }
@@ -286,7 +286,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     int i = 0;
     for (auto& feed_target : (*feed_targets)) {
       std::string var_name = feed_target.first;
-      VLOG(3) << "feed target's name: " << var_name;
+      VLOG(30) << "feed target's name: " << var_name;
       // prepend feed op
       auto* op = global_block->PrependOp();
@@ -309,7 +309,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     int i = 0;
     for (auto& fetch_target : (*fetch_targets)) {
       std::string var_name = fetch_target.first;
-      VLOG(3) << "fetch target's name: " << var_name;
+      VLOG(30) << "fetch target's name: " << var_name;
       // append fetch op
       auto* op = global_block->AppendOp();
@@ -459,7 +459,7 @@ void Executor::RunPreparedContext(
 void Executor::EnableMKLDNN(const ProgramDesc& program) {
 #ifdef PADDLE_WITH_MKLDNN
-  VLOG(3) << "use_mkldnn=True";
+  VLOG(30) << "use_mkldnn=True";
   for (size_t bid = 0; bid < program.Size(); ++bid) {
     auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
     for (auto* op : block->AllOps()) {
......
@@ -25,7 +25,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
                      const std::string& var_name, size_t index) {
   // If var_name Variable is not found in GlobalScope, a new variable will
   // be created.
-  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+  VLOG(30) << "SetFeedVariable name=" << var_name << " index=" << index;
   Variable* g_feed_value = scope->Var(var_name);
   auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
   if (index >= feed_inputs.size()) {
@@ -47,8 +47,8 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                  typeid(FeedFetchList).name());
   auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
   auto& tensor = fetch_outputs[index];
-  VLOG(3) << "Fetch " << var_name << " with index " << index
-          << " shape= " << tensor.dims();
+  VLOG(30) << "Fetch " << var_name << " with index " << index
+           << " shape= " << tensor.dims();
   PADDLE_ENFORCE_LT(index, fetch_outputs.size());
   return tensor;
 }
......
@@ -147,19 +147,19 @@ void PrepareParameters(Graph* graph, const Param& param) {
   scope->Var(param.LSTMX)->GetMutable<LoDTensor>();
   scope->Var(param.LSTMOUT)->GetMutable<LoDTensor>();
 #define GATE_W(name__)                                                \
   auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0");             \
   auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1");             \
   auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0");             \
   CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);        \
-  VLOG(4) << #name__ "_w0"                                            \
-          << " shape: " << W_##name__##_w0->Get<LoDTensor>().dims();  \
-  VLOG(4) << #name__ "_w1"                                            \
-          << " shape: " << W_##name__##_w1->Get<LoDTensor>().dims();  \
-  VLOG(4) << #name__ "_b0"                                            \
-          << " shape: " << W_##name__##_b0->Get<LoDTensor>().dims();  \
+  VLOG(40) << #name__ "_w0"                                           \
+           << " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
+  VLOG(40) << #name__ "_w1"                                           \
+           << " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
+  VLOG(40) << #name__ "_b0"                                           \
+           << " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
   auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>();        \
   auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>();        \
   auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();
   GATE_W(forget);
@@ -208,7 +208,7 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
   int D = W_forget_w0.dims()[0];
   int M = W_forget_w1.dims()[0];
   out->Resize(make_ddim({D + M, 4 * D}));
-  VLOG(3) << "LSTMWeight resized to " << out->dims();
+  VLOG(30) << "LSTMWeight resized to " << out->dims();
   float* out_data = out->mutable_data<float>(platform::CPUPlace());
   std::array<const float*, 4> tensors(
......
@@ -57,7 +57,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
   int found_conv_bias_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle ConvBias fuse";
+    VLOG(40) << "handle ConvBias fuse";
     GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
                               conv_bias_pattern);  // Filter
     GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern);  // tmp
@@ -74,7 +74,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
     // check if fuse can be done and if MKL-DNN should be used
     FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
     if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
-      VLOG(3) << "do not perform conv+bias fuse";
+      VLOG(30) << "do not perform conv+bias fuse";
       return;
     }
......
@@ -121,7 +121,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
   int found_conv_bn_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle ConvBN fuse";
+    VLOG(40) << "handle ConvBN fuse";
     // conv, batch_norm,
     // conv_weight, conv_out,
@@ -133,7 +133,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
     // check if fuse can be done and if MKL-DNN should be used
     FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
     if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+bn fuse";
+      VLOG(30) << "do not perform conv+bn fuse";
       return;
     }
@@ -241,7 +241,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
   int found_conv_bn_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle ConvBN fuse";
+    VLOG(40) << "handle ConvBN fuse";
     // conv, batch_norm,
     // conv_weight, conv_out,
......
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
   int found_conv_relu_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle ConvReLU fuse";
+    VLOG(40) << "handle ConvReLU fuse";
     GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
                               conv_relu_pattern);  // Filter
     GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);  // tmp
@@ -48,7 +48,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
     FuseOptions fuse_option = FindFuseOption(*conv, *relu);
     if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+relu fuse";
+      VLOG(30) << "do not perform conv+relu fuse";
       return;
     }
......
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
   int found_depthwise_conv_mkldnn_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
+    VLOG(30) << "handle DepthwiseConvMKLDNN fuse";
     GET_NODE(depthwise_conv, (*pattern));
     depthwise_conv->Op()->SetType("conv2d");
     found_depthwise_conv_mkldnn_count++;
......
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
   int found_fc_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle FC fuse";
+    VLOG(40) << "handle FC fuse";
     GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
......
@@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
   auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                      Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddAct fuse";
+    VLOG(40) << "handle FuseElewiseAddAct fuse";
     GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
                               elewise_add_act_pattern);
@@ -77,10 +77,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
     Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
         g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n);
-    VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
-            << ele_add->Name() << " -> " << ele_out_n << "\n"
-            << "\t " << ele_out_n << " -> " << act->Name() << " -> "
-            << act_out_n;
+    VLOG(40) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
+             << ele_add->Name() << " -> " << ele_out_n << "\n"
+             << "\t " << ele_out_n << " -> " << act->Name() << " -> "
+             << act_out_n;
     ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node);
     found_elewise_add_act_count++;
@@ -113,7 +113,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
   auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                      Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddAct fuse";
+    VLOG(40) << "handle FuseElewiseAddAct fuse";
     GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
@@ -129,9 +129,9 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
     Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
         g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n);
-    VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
-            << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
-            << ele_add->Name() << " -> " << elewise_add_out_n;
+    VLOG(40) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
+             << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
+             << ele_add->Name() << " -> " << elewise_add_out_n;
     ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node);
     found_elewise_add_act_count++;
@@ -165,7 +165,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
   auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                      Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddActGrad1 fuse";
+    VLOG(40) << "handle FuseElewiseAddActGrad1 fuse";
     GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out,
@@ -208,10 +208,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
     auto fused_node = g->CreateOpNode(&desc);
-    VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
-            << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
-            << d_itermediate_out_n << " and " << act_out_n << " -> "
-            << ele_add_grad->Name() << " -> " << d_itermediate_out_n;
+    VLOG(40) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
+             << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
+             << d_itermediate_out_n << " and " << act_out_n << " -> "
+             << ele_add_grad->Name() << " -> " << d_itermediate_out_n;
     ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node);
     found_elewise_add_act_count++;
......
@@ -92,7 +92,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
 std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
     const ProgramDesc &program) {
-  VLOG(3) << "block in program:" << program_.Size();
+  VLOG(30) << "block in program:" << program_.Size();
   std::unordered_map<std::string, VarDesc *> all_vars;
   // var nodes for each var name, will have multiple versions in SSA
   std::map<std::string, std::vector<ir::Node *>> var_nodes;
@@ -160,7 +160,7 @@ void Graph::ResolveHazard(
       auto it_old = versions.rbegin();
       ++it_old;
       for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
-        VLOG(3) << "deal with var: " << (*it_new)->Name();
+        VLOG(30) << "deal with var: " << (*it_new)->Name();
         ir::Node *write_op =
             (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
         const auto &read_ops = (*it_old)->outputs;
......
@@ -89,7 +89,7 @@ class Graph {
                    attr_name);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {
-      VLOG(3) << "deleting " << attr_name;
+      VLOG(30) << "deleting " << attr_name;
       delete attr;
     };
   }
......
@@ -33,8 +33,9 @@ void SortHelper(
     }
   }
-  VLOG(3) << "topology sort insert: " << node->Name()
-          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
+  VLOG(30) << "topology sort insert: " << node->Name()
+           << reinterpret_cast<void *>(node) << " input "
+           << node->inputs.size();
   ret->push_back(node);
 }
@@ -103,9 +104,9 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     for (auto &var : n->inputs) {
       for (auto &adj_n : var->inputs) {
         PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
-                << " -> " << n->Name() << reinterpret_cast<void *>(n)
-                << " via " << var->Name() << reinterpret_cast<void *>(var);
+        VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+                 << " -> " << n->Name() << reinterpret_cast<void *>(n)
+                 << " via " << var->Name() << reinterpret_cast<void *>(var);
         adj_list[n].insert(adj_n);
       }
     }
@@ -163,10 +164,10 @@ size_t GraphNum(const Graph &graph) {
       graph_nodes.emplace_back(g_nodes);
     }
   }
-  if (VLOG_IS_ON(10)) {
-    VLOG(10) << "graph_num: " << graph_nodes.size();
+  if (VLOG_IS_ON(100)) {
+    VLOG(100) << "graph_num: " << graph_nodes.size();
     for (auto &g_n : graph_nodes) {
-      VLOG(10) << "graph_nodes: " << g_n.size();
+      VLOG(100) << "graph_nodes: " << g_n.size();
       if (g_n.size() < 10) {
         std::stringstream out;
         for (auto &node : g_n) {
@@ -180,7 +181,7 @@ size_t GraphNum(const Graph &graph) {
           }
           out << "]";
         }
-        VLOG(10) << out.str();
+        VLOG(100) << out.str();
       }
     }
   }
......
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <algorithm>
 #include <array>
 #include <string>
 #include <vector>
@@ -91,19 +92,19 @@ void GraphPatternDetector::operator()(Graph *graph,
   PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size());
   int id = 0;
   for (auto &g : subgraphs) {
-    VLOG(3) << "optimizing #" << id++ << " subgraph";
+    VLOG(30) << "optimizing #" << id++ << " subgraph";
     handler(g, graph);
   }
 }
 bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
-  VLOG(3) << "mark pdnodes in graph";
+  VLOG(30) << "mark pdnodes in graph";
   if (graph.Nodes().empty()) return false;
   for (auto &node : GraphTraits::DFS(graph)) {
     for (const auto &pdnode : pattern_.nodes()) {
       if (pdnode->Tell(&node)) {
-        VLOG(4) << "pdnode " << pdnode->name() << " marked";
+        VLOG(40) << "pdnode " << pdnode->name() << " marked";
         pdnodes2nodes_[pdnode.get()].insert(&node);
       }
     }
@@ -111,7 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
   // Check to early stop if some PDNode can't find matched Node.
   for (auto &pdnode : pattern_.nodes()) {
     if (!pdnodes2nodes_.count(pdnode.get())) {
-      VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
+      VLOG(40) << pdnode->name() << " can't find matched Node, early stop";
       // return false;
     }
   }
@@ -120,7 +121,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
       GetMarkedNodes(const_cast<Graph *>(&graph)).insert(n);
     }
   }
-  VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
+  VLOG(30) << pdnodes2nodes_.size() << " nodes marked";
   return !pdnodes2nodes_.empty();
 }
@@ -213,7 +214,7 @@ GraphPatternDetector::DetectPatterns() {
   // Extend a PDNode to subgraphs by deducing the connection relations defined
   // in edges of PDNodes.
   for (const auto &edge : pattern_.edges()) {
-    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
+    VLOG(40) << "check " << edge.first->name() << " -> " << edge.second->name();
     // TODO(Superjomn) Fix bug here, the groups might be duplicate here.
     // Each role has two PDNodes, which indicates two roles.
     // Detect two Nodes that can match these two roles and they are connected.
@@ -224,7 +225,7 @@ GraphPatternDetector::DetectPatterns() {
     // source -> target
     for (Node *source : pdnodes2nodes_[edge.first]) {
       for (Node *target : pdnodes2nodes_[edge.second]) {
-        VLOG(8) << "check " << source->id() << " -- " << target->id();
+        VLOG(80) << "check " << source->id() << " -- " << target->id();
         // TODO(Superjomn) add some prune strategies.
         for (const auto &group : pre_groups) {
           HitGroup new_group = group;
@@ -240,12 +241,13 @@ GraphPatternDetector::DetectPatterns() {
         }
       }
     }
-    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
+    VLOG(30) << "step " << step << " get records: " << cur_groups.size();
     for (auto &group : cur_groups) {
       for (auto &item : group.roles) {
-        VLOG(4) << "node " << item.second->id() << " as " << item.first->name();
+        VLOG(40) << "node " << item.second->id() << " as "
+                 << item.first->name();
       }
-      VLOG(4) << "=========================================================";
+      VLOG(40) << "=========================================================";
     }
   }
......
@@ -41,7 +41,7 @@ std::string FormatName(const Node* node) {
 std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
-  VLOG(3) << "draw IR graph viz to " << graph_viz_path;
+  VLOG(30) << "draw IR graph viz to " << graph_viz_path;
   std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
   PADDLE_ENFORCE(fout->good());
   std::ostream& sout = *fout;
......
@@ -20,7 +20,7 @@ namespace ir {
 std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
-  VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  VLOG(30) << "Aplies MKL-DNN placement strategy.";
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
       n->Op()->SetAttr("use_mkldnn", true);
......
@@ -62,7 +62,7 @@ VarDesc UpdateGradVarDesc(
         string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat);
     VarDesc repeated_var = CopyVarDesc(var_desc);
     repeated_var.SetName(new_gname);
-    VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat;
+    VLOG(30) << "update " << var_desc->Name() << " to repeat " << repeat;
     return repeated_var;
   }
   return *var_desc;
@@ -78,7 +78,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
   std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
   auto origin_nodes = graph->ReleaseNodes();
-  VLOG(3) << "origin nodes count: " << origin_nodes.size();
+  VLOG(30) << "origin nodes count: " << origin_nodes.size();
   ir::Graph& result = *graph;
   // 1. record op nodes of different roles
@@ -137,8 +137,8 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
             "%s.repeat.%d", repeated_op.Input("Variance")[0], i);
         bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]);
         bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]);
-        VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to "
-                << new_mean_name;
+        VLOG(30) << "renaming " << repeated_op.Input("Mean")[0] << " to "
+                 << new_mean_name;
         repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name);
         repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name);
         repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0],
......
@@ -76,7 +76,7 @@ class Pass {
                    attr_name);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {
-      VLOG(3) << "deleting " << attr_name;
+      VLOG(30) << "deleting " << attr_name;
       delete attr;
     };
   }
......
@@ -12,10 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
+#include <set>
+#include <string>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 namespace paddle {
@@ -159,10 +162,7 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) {
   std::set<std::string> acts({"sigmoid", "tanh", "relu", "identity"});
   PDNode* act = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsOp() && acts.count(x->Op()->Type());
-      },
+      [=](Node* x) { return x && x->IsOp() && acts.count(x->Op()->Type()); },
       "act");
   PDNode* fc_out = pattern->NewNode(
@@ -196,7 +196,7 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
   detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
                             Graph* graph) {
-    VLOG(4) << "get one concat pattern";
+    VLOG(40) << "get one concat pattern";
     // fc
     GET_NODE(fc_w, detector.pattern());
     GET_NODE(fc_bias, detector.pattern());
......
@@ -60,7 +60,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "handle SeqConv EltAdd Relu fuse";
+    VLOG(40) << "handle SeqConv EltAdd Relu fuse";
     GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
......
@@ -31,7 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
     TableItem item;
     item.index = i;
    item.length = vec[i + 1] - vec[i];
-    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
+    VLOG(100) << "Add item to rank table " << item.index << " " << item.length;
    items_.emplace_back(item);
  }
  // NOTE(yuyang18):
......
@@ -51,7 +51,7 @@ TEST(mixed_vector, InitWithCount) {
 TEST(mixed_vector, ForEach) {
   vec<int> tmp;
   for (auto& v : tmp) {
-    VLOG(3) << v;
+    VLOG(30) << v;
   }
 }
......
@@ -71,7 +71,7 @@ void NaiveExecutor::Prepare(Scope *parent_scope,
 void NaiveExecutor::Run() {
   for (auto &op : ops_) {
-    VLOG(4) << "run " << op->Type();
+    VLOG(40) << "run " << op->Type();
     op->Run(*scope_, place_);
   }
 }
@@ -95,21 +95,21 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
       if (var->Persistable()) {
         auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
         InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " global, which pointer is " << ptr;
       } else {  // Create temporary variables in local scope.
         auto *ptr = scope->Var(var->Name());
         InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " locally, which pointer is " << ptr;
       }
     }
   } else {
     for (auto &var : global_block.AllVars()) {
       auto *ptr = scope->Var(var->Name());
       InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
+      VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
+               << ptr;
     }
   }
 }
......
@@ -82,7 +82,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != proto::VarType::LOD_TENSOR) {
-      VLOG(3) << "input " << in << " is not LodTensor";
+      VLOG(30) << "input " << in << " is not LodTensor";
       return;
     }
     out_var->SetLoDLevel(in_var->GetLoDLevel());
@@ -241,32 +241,32 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
   const proto::OpProto::Attr &attr = GetProtoAttr(name);
   switch (attr.type()) {
     case proto::AttrType::BOOLEANS: {
-      VLOG(11) << "SetAttr: " << Type() << ", " << name
-               << " from INTS to BOOLEANS";
+      VLOG(110) << "SetAttr: " << Type() << ", " << name
+                << " from INTS to BOOLEANS";
       this->attrs_[name] = std::vector<bool>();
       break;
     }
     case proto::AttrType::INTS: {
-      VLOG(11) << "SetAttr: " << Type() << ", " << name
-               << " from INTS to INTS";
+      VLOG(110) << "SetAttr: " << Type() << ", " << name
+                << " from INTS to INTS";
       this->attrs_[name] = std::vector<int>();
       break;
     }
     case proto::AttrType::FLOATS: {
-      VLOG(11) << "SetAttr: " << Type() << ", " << name
-               << " from INTS to FLOATS";
+      VLOG(110) << "SetAttr: " << Type() << ", " << name
+                << " from INTS to FLOATS";
       this->attrs_[name] = std::vector<float>();
       break;
     }
     case proto::AttrType::STRINGS: {
-      VLOG(11) << "SetAttr: " << Type() << ", " << name
-               << " from INTS to STRINGS";
+      VLOG(110) << "SetAttr: " << Type() << ", " << name
+                << " from INTS to STRINGS";
       this->attrs_[name] = std::vector<std::string>();
       break;
     }
     case proto::AttrType::BLOCKS: {
-      VLOG(11) << "SetAttr: " << Type() << ", " << name
-               << " from INTS to BLOCKS";
+      VLOG(110) << "SetAttr: " << Type() << ", " << name
+                << " from INTS to BLOCKS";
       this->SetBlocksAttr(name, std::vector<BlockDesc *>());
       return;
     }
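Judging by the "from INTS to X" messages, this switch re-types an empty attribute that arrived as INTS (an empty list carries no element type) into the vector type the op actually declares. A hypothetical, Paddle-free analogue of that promotion (the variant and names below are made up for illustration):

    #include <string>
    #include <variant>
    #include <vector>

    using Attr = std::variant<std::vector<int>, std::vector<bool>,
                              std::vector<float>, std::vector<std::string>>;

    // Re-seed an empty INTS attribute with the declared element type.
    Attr PromoteEmptyInts(const std::string& declared) {
      if (declared == "BOOLEANS") return std::vector<bool>();
      if (declared == "FLOATS") return std::vector<float>();
      if (declared == "STRINGS") return std::vector<std::string>();
      return std::vector<int>();  // INTS -> INTS: nothing to change
    }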
@@ -499,13 +499,13 @@ void OpDesc::CheckAttrs() {
 }

 void OpDesc::InferShape(const BlockDesc &block) const {
-  VLOG(3) << "CompileTime infer shape on " << Type();
+  VLOG(30) << "CompileTime infer shape on " << Type();
   InitInferShapeFuncs();
   auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
   PADDLE_ENFORCE(static_cast<bool>(infer_shape),
                  "%s's infer_shape has not been registered", this->Type());
   CompileTimeInferShapeContext ctx(*this, block);
-  if (VLOG_IS_ON(10)) {
+  if (VLOG_IS_ON(100)) {
     std::ostringstream sout;
     auto inames = this->InputArgumentNames();
     sout << " From [";
@@ -516,7 +516,7 @@ void OpDesc::InferShape(const BlockDesc &block) const {
     std::copy(onames.begin(), onames.end(),
               std::ostream_iterator<std::string>(sout, ", "));
     sout << "]";
-    VLOG(10) << sout.str();
+    VLOG(100) << sout.str();
   }
   infer_shape(&ctx);
 }
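Note the `VLOG_IS_ON` guard being renumbered in lockstep with the `VLOG` call it protects: the argument listing is only assembled when tier 100 is active, so the `ostringstream` work is skipped entirely otherwise. The same pattern in isolation (a sketch, assuming stock glog):

    #include <glog/logging.h>
    #include <sstream>
    #include <string>
    #include <vector>

    void LogNamesIfVerbose(const std::vector<std::string>& names) {
      if (VLOG_IS_ON(100)) {  // string building happens only when enabled
        std::ostringstream sout;
        for (const auto& n : names) sout << n << ", ";
        VLOG(100) << "args: [" << sout.str() << "]";
      }
    }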
@@ -607,7 +607,7 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
     auto shape = var->GetShape();
     res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
   } catch (...) {
-    VLOG(5) << "GetDim of variable " << name << " error";
+    VLOG(50) << "GetDim of variable " << name << " error";
     std::rethrow_exception(std::current_exception());
   }
   return res;
@@ -624,7 +624,7 @@ std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
       res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
     }
   } catch (...) {
-    VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
+    VLOG(50) << "GetRepeatedDim of variable " << name << " error.";
     std::rethrow_exception(std::current_exception());
   }
   return res;
...
@@ -46,9 +46,9 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
 std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
     const proto::OpDesc& op_desc) {
-  VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
-             "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
-             "instead.";
+  VLOG(10) << "CreateOp directly from OpDesc is deprecated. It should only be"
+              "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
+              "instead.";
   VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
   VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
   AttributeMap attrs;
...
@@ -140,7 +140,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }

 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  VLOG(4) << place << " " << DebugStringEx(&scope);
+  VLOG(40) << place << " " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
     PADDLE_THROW("Cannot run operator on place %s", place);
@@ -160,7 +160,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   } else {
     RunImpl(scope, place);
   }
-  VLOG(3) << place << " " << DebugStringEx(&scope);
+  VLOG(30) << place << " " << DebugStringEx(&scope);
 }

 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -259,6 +259,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
       if (row_size >= 0) {
         ss << "[row_size=" << row_size << "]";
       }
+      std::string dtype = GetDtype(*scope, output.second[i]);
+      ss << ":" << dtype;
       ss << "[" << GetDims(*scope, var_name, true) << "]";
       ss << "(" << GetLoD(*scope, var_name) << ")";
     }
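The last hunk is the only non-logging change in this file: `DebugStringEx` now prints each output's dtype (via the file's `GetDtype` helper) between the row-size tag and the dims. Roughly, assuming a float output with shape 2x3 (the exact dtype spelling depends on `GetDtype`):

    // before:  out[2, 3]({})
    // after:   out:float[2, 3]({})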
@@ -715,14 +717,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   auto expected_kernel_key =
       this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+  VLOG(30) << "expected_kernel_key:" << expected_kernel_key;

   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
   // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
   if (kernel_iter == kernels.end() &&
       expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+    VLOG(30) << "missing MKLDNN kernel: fallbacking to PLAIN one";
     expected_kernel_key.library_type_ = LibraryType::kPlain;
     expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
     kernel_iter = kernels.find(expected_kernel_key);
@@ -774,7 +776,8 @@ void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& scope, const std::vector<std::string>& inplace_vars,
     const Scope& transfer_scope) const {
   for (auto& var_name : inplace_vars) {
-    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
+    VLOG(30) << "share inplace var " + var_name +
+                    " back to it's original scope";
     auto* original_tensor =
         GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name));
     auto* var = transfer_scope.FindVar(var_name);
@@ -815,8 +818,8 @@ Scope* OperatorWithKernel::TryTransferData(
       transfered_inplace_vars->emplace_back(var_name);
     }

-    VLOG(3) << "Transform Variable " << var_name << " from "
-            << kernel_type_for_var << " to " << expected_kernel_key;
+    VLOG(30) << "Transform Variable " << var_name << " from "
+             << kernel_type_for_var << " to " << expected_kernel_key;

     if (new_scope == nullptr) {
       new_scope = &scope.NewScope();
...
@@ -199,7 +199,7 @@ void ParallelExecutor::BCastParamsToDevices(
     auto &main_tensor = main_var->Get<LoDTensor>();
     if (!main_tensor.IsInitialized()) {
-      VLOG(3) << "one in var not inited, return!";
+      VLOG(30) << "one in var not inited, return!";
       continue;
     }
     auto &dims = main_tensor.dims();
...
@@ -149,7 +149,7 @@ Variable* Scope::VarInternal(const std::string& name) {
   v = new Variable();
   vars_[name].reset(v);
-  VLOG(3) << "Create variable " << name;
+  VLOG(30) << "Create variable " << name;
   v->name_ = &(vars_.find(name)->first);
   return v;
 }
...
@@ -176,7 +176,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
   PADDLE_ENFORCE(value->IsInitialized(),
                  "The value tensor should be initialized.");
   if (ids.numel() == 0) {
-    VLOG(3) << "keys is empty, please check data!";
+    VLOG(30) << "keys is empty, please check data!";
   } else {
     int64_t value_width = value_->numel() / value_->dims()[0];
     PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
...
@@ -23,8 +23,8 @@ namespace framework {

 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst) {
-  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
-          << dst_place;
+  VLOG(30) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+           << dst_place;
   src.check_memory_size();
   dst->Resize(src.dims());
@@ -38,8 +38,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-              << dst_place;
+      VLOG(30) << "Skip copy the same data async from " << src_place << " to "
+               << dst_place;
       return;
     }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@@ -78,8 +78,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     if (platform::is_same_place(src_place, dst_place)) {
       if (src_ptr == dst_ptr) {
-        VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-                << dst_place;
+        VLOG(30) << "Skip copy the same data async from " << src_place
+                 << " to " << dst_place;
         return;
       }
       memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@@ -115,8 +115,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                     Tensor* dst) {
-  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
-          << " to " << dst_place;
+  VLOG(30) << "TensorCopySync " << src.dims() << " from " << src.place()
+           << " to " << dst_place;
   src.check_memory_size();
   dst->Resize(src.dims());
   dst->set_layout(src.layout());
@@ -126,8 +126,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
+      VLOG(30) << "Skip copy the same data from " << src_place << " to "
+               << dst_place;
       return;
     }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@@ -147,8 +147,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
     if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
+      VLOG(30) << "Skip copy the same data from " << src_place << " to "
+               << dst_place;
       return;
     }
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
...
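All four "Skip copy the same data" sites guard the same degenerate case: source and destination resolve to the identical buffer, so the copy is skipped instead of issued. Stripped to its core (a sketch, not Paddle code):

    #include <cstddef>
    #include <cstring>

    void CopyBytes(void* dst, const void* src, size_t n) {
      if (dst == src) return;  // same storage: nothing to do
      std::memcpy(dst, src, n);
    }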
@@ -39,7 +39,7 @@ void ThreadPool::Init() {
   int num_threads = std::thread::hardware_concurrency();
   if (FLAGS_dist_threadpool_size > 0) {
     num_threads = FLAGS_dist_threadpool_size;
-    VLOG(1) << "set dist_threadpool_size to " << num_threads;
+    VLOG(10) << "set dist_threadpool_size to " << num_threads;
   }
   PADDLE_ENFORCE_GT(num_threads, 0);
   threadpool_.reset(new ThreadPool(num_threads));
...
@@ -61,10 +61,10 @@ size_t VarDesc::GetTensorDescNum() const {
 void VarDesc::SetShapes(
     const std::vector<std::vector<int64_t>> &multiple_dims) {
   if (multiple_dims.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
+    VLOG(30) << "WARNING: The number of given shapes(" << multiple_dims.size()
+             << ") doesn't match the existing tensor number("
+             << GetTensorDescNum()
+             << "). The Reader is going to be reinitialized.";
     SetTensorDescNum(multiple_dims.size());
   }
   std::vector<proto::VarType::TensorDesc *> tensors = mutable_tensor_descs();
@@ -94,11 +94,11 @@ void VarDesc::SetDataType(proto::VarType::Type data_type) {
 void VarDesc::SetDataTypes(
     const std::vector<proto::VarType::Type> &multiple_data_type) {
   if (multiple_data_type.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given data types("
-            << multiple_data_type.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
+    VLOG(30) << "WARNING: The number of given data types("
+             << multiple_data_type.size()
+             << ") doesn't match the existing tensor number("
+             << GetTensorDescNum()
+             << "). The Reader is going to be reinitialized.";
     SetTensorDescNum(multiple_data_type.size());
   }
   std::vector<proto::VarType::TensorDesc *> tensor_descs =
@@ -139,11 +139,11 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
 void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
   if (multiple_lod_level.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given lod_levels("
-            << multiple_lod_level.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
+    VLOG(30) << "WARNING: The number of given lod_levels("
+             << multiple_lod_level.size()
+             << ") doesn't match the existing tensor number("
+             << GetTensorDescNum()
+             << "). The Reader is going to be reinitialized.";
     SetTensorDescNum(multiple_lod_level.size());
   }
   switch (desc_.type().type()) {
...
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <string>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"

 namespace paddle {
@@ -24,5 +27,27 @@ class VarTypeInference {
   virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
 };

+class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const final {
+    auto in_out_var_names = this->GetInputOutputWithSameType();
+
+    for (auto& i_o_n : in_out_var_names) {
+      auto& x_name = op_desc.Input(i_o_n.first).at(0);
+      auto& out_name = op_desc.Output(i_o_n.second).at(0);
+
+      auto& x = block->FindRecursiveOrCreateVar(x_name);
+      auto& out = block->FindRecursiveOrCreateVar(out_name);
+      out.SetType(x.GetType());
+      out.SetDataType(x.GetDataType());
+    }
+  }
+
+ protected:
+  virtual std::unordered_map<std::string, std::string>
+  GetInputOutputWithSameType() const = 0;
+};
+
 }  // namespace framework
 }  // namespace paddle
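This header gains the one genuinely new abstraction in the section: a reusable `VarTypeInference` that forwards the first input variable's type and dtype to the paired output. An operator opts in by deriving and declaring the slot mapping; a hypothetical subclass (op and slot names invented for illustration):

    #include <string>
    #include <unordered_map>
    #include "paddle/fluid/framework/var_type_inference.h"

    // "Out" inherits var type and dtype from "X" at compile (desc) time.
    class MyActivationVarTypeInference
        : public paddle::framework::PassInDtypeAndVarTypeToOutput {
     protected:
      std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
          const override {
        return {{"X", "Out"}};  // input slot -> output slot
      }
    };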
@@ -60,7 +60,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
  private:
   void AddPass(const std::string& name, AnalysisPass* pass) {
-    VLOG(3) << "Adding pass " << name;
+    VLOG(30) << "Adding pass " << name;
     Register(name, pass);
     AddGraphvizDebugerPass(pass);
   }
@@ -104,7 +104,7 @@ void Analyzer::Run(Argument* argument) {
       passes.push_back("graph_viz_pass");  // add graphviz for debug.
 #ifdef PADDLE_WITH_MKLDNN
   if (use_mkldnn_) {
-    VLOG(3) << "Adding MKL-DNN placement pass";
+    VLOG(30) << "Adding MKL-DNN placement pass";
     passes.push_back("mkldnn_placement_pass");
   }
 #endif
@@ -113,7 +113,9 @@ void Analyzer::Run(Argument* argument) {
     passes.push_back("infer_clean_graph_pass");
     passes.push_back("graph_viz_pass");  // add graphviz for debug.
     for (auto& pass : ir_passes_) {
-      if (!disabled_ir_passes_.count(pass)) {
+      // skip mkldnn pass when use_mkldnn_ = false;
+      bool skip_pass = (!use_mkldnn_) && pass.find("mkldnn") != std::string::npos;
+      if (!disabled_ir_passes_.count(pass) && !skip_pass) {
         passes.push_back(pass);
         passes.push_back("graph_viz_pass");  // add graphviz for debug.
       }
...
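The second real behavior change: when `use_mkldnn_` is false, any IR pass whose name contains "mkldnn" is now filtered out up front rather than relying solely on the disabled-pass set. The same substring filter, standalone (a sketch with invented names):

    #include <string>
    #include <vector>

    std::vector<std::string> KeepPasses(const std::vector<std::string>& all,
                                        bool use_mkldnn) {
      std::vector<std::string> kept;
      for (const auto& pass : all) {
        if (!use_mkldnn && pass.find("mkldnn") != std::string::npos) continue;
        kept.push_back(pass);
      }
      return kept;
    }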
@@ -68,8 +68,8 @@ struct Argument {
                       key);
     attrs_[key] = data;
     attr_deleters_[key] = [data, key]() {
-      VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
-      VLOG(3) << "argument delete attr: " << key;
+      VLOG(30) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
+      VLOG(30) << "argument delete attr: " << key;
       delete data;
     };
   }
...
@@ -132,7 +132,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
     Node *x{nullptr};
     if (ir_node->IsOp()) {
       PADDLE_ENFORCE(ir_node->Op());
-      VLOG(4) << "get op " << ir_node << " " << ir_node->Name();
+      VLOG(40) << "get op " << ir_node << " " << ir_node->Name();
       x = nodes.Create(Node::Type::kFunction);
       x->attr("ir_node").Pointer() = ir_node;
       PADDLE_ENFORCE(ir_node->Op()->Proto());
@@ -141,7 +141,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
     } else if (ir_node->IsVar()) {
       // Not create a Node for IR ControlDepVar, considering Inference currently
       // just used in single thread scenerio.
-      VLOG(4) << "get var " << ir_node->Name();
+      VLOG(40) << "get var " << ir_node->Name();
       x = nodes.Create(Node::Type::kValue);
       x->attr("ir_node").Pointer() = ir_node;
       x->SetName(ir_node->Name());
@@ -151,9 +151,9 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
     }
     ir_node_map.emplace(ir_node, x);
   }
-  VLOG(4) << "finish creating Nodes";
+  VLOG(40) << "finish creating Nodes";

-  VLOG(4) << "to create edge";
+  VLOG(40) << "to create edge";
   // Create links
   for (auto *ir_node : graph.Nodes()) {
     auto it = ir_node_map.find(ir_node);
@@ -175,7 +175,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
       "Can't deduce any inputs from the graph, Is the graph empty?");

   ir_graph = &graph;
-  VLOG(3) << "finished build from IR";
+  VLOG(30) << "finished build from IR";
 }

 void DataFlowGraph::Clean() {
...
@@ -239,9 +239,10 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
   framework::BlockDesc block_desc(nullptr, &proto);
   block_desc.Proto()->set_parent_idx(-1);
   block_desc.Proto()->set_idx(0);
-  VLOG(4) << "origin variable size: "
-          << argument_->origin_program_desc->blocks(0).vars().size();
-  VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size();
+  VLOG(40) << "origin variable size: "
+           << argument_->origin_program_desc->blocks(0).vars().size();
+  VLOG(40) << "transformed variable size: "
+           << block_desc.Proto()->vars().size();
   // copy ops.
   for (auto *node : block_node->subgraph) {
...
@@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
   auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
   std::string message;
-  VLOG(3) << "draw to " << png_path;
+  VLOG(30) << "draw to " << png_path;
   ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
 }
...
@@ -29,7 +29,7 @@ void FluidToIrPass::EnableParamModify(const std::string &model_dir,
   PADDLE_ENFORCE(argument_);
   argument_->Set(framework::ir::kParamScopeAttr, new framework::Scope);
   // Load parameters.
-  VLOG(3) << "Loading parameters from " << model_dir;
+  VLOG(30) << "Loading parameters from " << model_dir;
   LoadParams(&argument_->Get<framework::Scope>(framework::ir::kParamScopeAttr),
              model_dir, prog_file, param_file);
 }
...
@@ -35,21 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) {
   std::stringstream ss;
   // NOTE these commands only works on linux.
   ss << "mkdir -p " << *argument_->model_output_store_path;
-  VLOG(3) << "run command: " << ss.str();
+  VLOG(30) << "run command: " << ss.str();
   PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);

   ss.str("");
   ss << "cp " << *argument_->fluid_model_dir << "/*"
      << " " << *argument_->model_output_store_path;
-  VLOG(3) << "run command: " << ss.str();
+  VLOG(30) << "run command: " << ss.str();
   PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);

   // Store program
   PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
                           "program desc is not transformed, should call "
                           "DataFlowGraphToFluidPass first.");
-  VLOG(3) << "store analyzed program to "
-          << *argument_->model_output_store_path;
+  VLOG(30) << "store analyzed program to "
+           << *argument_->model_output_store_path;
   const std::string program_output_path =
       *argument_->model_output_store_path + "/__model__";
   std::ofstream file(program_output_path, std::ios::binary);
...
@@ -23,7 +23,7 @@ namespace analysis {
 bool PassManager::Initialize(Argument* argument) {
   argument_ = argument;
   for (auto& pass : data_) {
-    VLOG(3) << "Initializing pass [" << pass->repr() << "]";
+    VLOG(30) << "Initializing pass [" << pass->repr() << "]";
     if (!pass->Initialize(argument)) {
       LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
       return false;
@@ -34,7 +34,7 @@ bool PassManager::Initialize(Argument* argument) {
 void DfgPassManager::RunAll() {
   PADDLE_ENFORCE(argument_);
-  VLOG(3) << "Total " << data_.size() << " Analysys passes";
+  VLOG(30) << "Total " << data_.size() << " Analysys passes";
   for (auto& pass : data_) {
     string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]",
                           pass->repr());
...
@@ -232,7 +232,7 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
     BriefNode *brief_node = itr.second;

     if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
+      VLOG(40) << brief_node->node->id() << " node not a trt candicate.";
       continue;
     }
...
@@ -25,9 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
   SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
-  VLOG(4) << "debug info "
-          << graph->HumanReadableInfo(false /*show_values*/,
-                                      true /*show_functions*/);
+  VLOG(40) << "debug info "
+           << graph->HumanReadableInfo(false /*show_values*/,
+                                       true /*show_functions*/);
 }

 }  // namespace analysis
...
@@ -38,7 +38,7 @@ using contrib::AnalysisConfig;
 bool AnalysisPredictor::Init(
     const std::shared_ptr<framework::Scope> &parent_scope,
     const std::shared_ptr<framework::ProgramDesc> &program) {
-  VLOG(3) << "Predictor::init()";
+  VLOG(30) << "Predictor::init()";
 #if !defined(_WIN32)
   if (FLAGS_profile) {
     LOG(WARNING) << "Profiler is actived, might affect the performance";
@@ -89,7 +89,7 @@ bool AnalysisPredictor::Init(
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
-  VLOG(3) << "Predictor::predict";
+  VLOG(30) << "Predictor::predict";
   inference::Timer timer;
   timer.tic();
   // set feed variable
@@ -109,7 +109,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     LOG(ERROR) << "fail to get fetches";
     return false;
   }
-  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  VLOG(30) << "predict cost: " << timer.toc() << "ms";

   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
@@ -119,7 +119,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
 bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                 framework::Scope *scope) {
-  VLOG(3) << "Predictor::set_feed";
+  VLOG(30) << "Predictor::set_feed";
   if (inputs.size() != feeds_.size()) {
     LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
                << inputs.size();
@@ -184,7 +184,7 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
 bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                  framework::Scope *scope) {
-  VLOG(3) << "Predictor::get_fetch";
+  VLOG(30) << "Predictor::get_fetch";
   outputs->resize(fetchs_.size());
   for (size_t i = 0; i < fetchs_.size(); ++i) {
     int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
@@ -246,7 +246,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }
   CHECK(argument_.transformed_program_desc);
-  VLOG(5) << "to prepare executor";
+  VLOG(50) << "to prepare executor";
   inference_program_.reset(
       new framework::ProgramDesc(*argument_.transformed_program_desc));
   if (argument_.Has(framework::ir::kParamScopeAttr)) {
@@ -260,7 +260,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
-  VLOG(3) << "create AnalysisConfig";
+  VLOG(30) << "create AnalysisConfig";
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
@@ -274,7 +274,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     std::string flag = "--fraction_of_gpu_memory_to_use=" +
                        std::to_string(config.fraction_of_gpu_memory);
     flags.push_back(flag);
-    VLOG(3) << "set flag: " << flag;
+    VLOG(30) << "set flag: " << flag;
     framework::InitGflags(flags);
   }
 }
...
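Worth pausing on the factory at the end of that file: the GPU memory budget travels from the user config into the allocator as a synthesized gflags string handed to `framework::InitGflags`. The flag construction on its own (a sketch; only `InitGflags` and the flag name come from the code above, the helper is invented):

    #include <string>
    #include <vector>

    std::vector<std::string> BuildGpuFlags(float fraction_of_gpu_memory) {
      std::vector<std::string> flags;
      flags.push_back("--fraction_of_gpu_memory_to_use=" +
                      std::to_string(fraction_of_gpu_memory));
      return flags;  // then: paddle::framework::InitGflags(flags);
    }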
@@ -13,6 +13,8 @@
 // limitations under the License.

 #pragma once
+#include <algorithm>
+#include <map>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/naive_executor.h"
...
@@ -16,7 +16,6 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle_inference_api.h"

 namespace paddle {
...
@@ -157,7 +157,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
     LOG(ERROR) << "fail to get fetches";
     return false;
   }
-  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  VLOG(30) << "predict cost: " << timer.toc() << "ms";

   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
...
@@ -34,7 +34,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
   bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
     FLAGS_IA_enable_tensorrt_subgraph_engine = true;
-    VLOG(3) << "Predictor::init()";
+    VLOG(30) << "Predictor::init()";
     if (config_.use_gpu) {
       place_ = paddle::platform::CUDAPlace(config_.device);
     } else {
@@ -70,7 +70,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
     OptimizeInferenceProgram();
     ctx_ = executor_->Prepare(*inference_program_, 0);

-    VLOG(5) << "to create variables";
+    VLOG(50) << "to create variables";
     executor_->CreateVariables(*inference_program_,
                                sub_scope_ ? sub_scope_ : scope_.get(), 0);
     // Get the feed_target_names and fetch_target_names
@@ -114,9 +114,9 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
         new ProgramDesc(*inference_program_->Proto()));
     Singleton<Analyzer>::Global().Run(&argument);
     CHECK(argument.transformed_program_desc);
-    VLOG(5) << "transformed program:\n"
-            << argument.transformed_program_desc->SerializeAsString();
-    VLOG(5) << "to prepare executor";
+    VLOG(50) << "transformed program:\n"
+             << argument.transformed_program_desc->SerializeAsString();
+    VLOG(50) << "to prepare executor";
     inference_program_.reset(
         new framework::ProgramDesc(*argument.transformed_program_desc));
   }
@@ -129,7 +129,7 @@ template <>
 std::unique_ptr<PaddlePredictor>
 CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
     const MixedRTConfig& config) {
-  VLOG(3) << "create TensorRTSubgraphPredictor";
+  VLOG(30) << "create TensorRTSubgraphPredictor";
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
@@ -143,7 +143,7 @@ CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
     std::string flag = "--fraction_of_gpu_memory_to_use=" +
                        std::to_string(config.fraction_of_gpu_memory);
     flags.push_back(flag);
-    VLOG(3) << "set flag: " << flag;
+    VLOG(30) << "set flag: " << flag;
     framework::InitGflags(flags);
   }
 }
...
@@ -45,7 +45,7 @@ void Main() {
   config.fraction_of_gpu_memory = 0.1;  // set by yourself
   predictor = CreatePaddlePredictor<paddle::contrib::MixedRTConfig>(config);

-  VLOG(3) << "begin to process data";
+  VLOG(30) << "begin to process data";
   // Just a single batch of data.
   std::string line;
   std::ifstream file(FLAGS_data);
@@ -60,13 +60,13 @@ void Main() {
       PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
   input.dtype = PaddleDType::FLOAT32;

-  VLOG(3) << "run executor";
+  VLOG(30) << "run executor";
   std::vector<PaddleTensor> output;
   predictor->Run({input}, &output, 1);

-  VLOG(3) << "output.size " << output.size();
+  VLOG(30) << "output.size " << output.size();
   auto& tensor = output.front();
-  VLOG(3) << "output: " << SummaryTensor(tensor);
+  VLOG(30) << "output: " << SummaryTensor(tensor);

   // compare with reference result
   CheckOutput(FLAGS_refer, tensor);
...
@@ -47,7 +47,7 @@ static void split(const std::string& str, char sep,
 }

 Record ProcessALine(const std::string& line) {
-  VLOG(3) << "process a line";
+  VLOG(30) << "process a line";
   std::vector<std::string> columns;
   split(line, '\t', &columns);
   CHECK_EQ(columns.size(), 2UL)
@@ -65,8 +65,8 @@ Record ProcessALine(const std::string& line) {
   for (auto& s : shape_strs) {
     record.shape.push_back(std::stoi(s));
   }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
+  VLOG(30) << "data size " << record.data.size();
+  VLOG(30) << "data shape size " << record.shape.size();
   return record;
 }
@@ -78,8 +78,8 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
   file.close();

   size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-  VLOG(3) << "predictor output numel " << numel;
-  VLOG(3) << "reference output numel " << refer.data.size();
+  VLOG(30) << "predictor output numel " << numel;
+  VLOG(30) << "reference output numel " << refer.data.size();
   CHECK_EQ(numel, refer.data.size());
   switch (output.dtype) {
     case PaddleDType::INT64: {
...
@@ -49,11 +49,11 @@ void Main(bool use_gpu) {
     config.fraction_of_gpu_memory = 0.1;  // set by yourself
   }

-  VLOG(3) << "init predictor";
+  VLOG(30) << "init predictor";
   predictor = CreatePaddlePredictor<NativeConfig>(config);
   analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);

-  VLOG(3) << "begin to process data";
+  VLOG(30) << "begin to process data";
   // Just a single batch of data.
   std::string line;
   std::ifstream file(FLAGS_data);
@@ -68,13 +68,13 @@ void Main(bool use_gpu) {
       PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
   input.dtype = PaddleDType::FLOAT32;

-  VLOG(3) << "run executor";
+  VLOG(30) << "run executor";
   std::vector<PaddleTensor> output, analysis_output;
   predictor->Run({input}, &output, 1);

-  VLOG(3) << "output.size " << output.size();
+  VLOG(30) << "output.size " << output.size();
   auto& tensor = output.front();
-  VLOG(3) << "output: " << SummaryTensor(tensor);
+  VLOG(30) << "output: " << SummaryTensor(tensor);

   // compare with reference result
   CheckOutput(FLAGS_refer, tensor);
...
@@ -26,7 +26,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
       // parameter.
       if (var_name == "feed" || var_name == "fetch") continue;
       if (var->Type() == typeid(framework::LoDTensorArray)) {
-        VLOG(4) << "collect " << var_name;
+        VLOG(40) << "collect " << var_name;
         arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
       }
     }
@@ -34,7 +34,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
       CollectTensorArrays(kid);
     }

-    VLOG(3) << "Collect " << arrays_.size() << " arrays";
+    VLOG(30) << "Collect " << arrays_.size() << " arrays";
     flag_ = false;
   }
 }
...
@@ -16,13 +16,14 @@
 #include <glog/logging.h>
 #include <sys/time.h>
+#include <algorithm>
 #include <chrono>  // NOLINT
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>

+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
-#include "paddle_inference_api.h"

 namespace paddle {
 namespace inference {
...
@@ -59,7 +59,8 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
 bool IsPersistable(const framework::VarDesc* var) {
   if (var->Persistable() &&
       var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST) {
+      var->GetType() != framework::proto::VarType::FETCH_LIST &&
+      var->GetType() != framework::proto::VarType::RAW) {
     return true;
   }
   return false;
@@ -77,7 +78,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
   for (auto* var : global_block.AllVars()) {
     if (IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
+      VLOG(30) << "persistable variable's name: " << var->Name();
       framework::VarDesc* new_var = load_block->Var(var->Name());
       new_var->SetShape(var->GetShape());
@@ -120,7 +121,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                              const std::string& dirname) {
   std::string model_filename = dirname + "/__model__";
   std::string program_desc_str;
-  VLOG(3) << "loading model from " << model_filename;
+  VLOG(30) << "loading model from " << model_filename;
   ReadBinaryFile(model_filename, &program_desc_str);

   std::unique_ptr<framework::ProgramDesc> main_program(
...
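The `IsPersistable` change is behavioral, not cosmetic: `RAW` variables are now excluded from the set of persistables that `LoadPersistables` tries to read from disk, alongside the feed/fetch bookkeeping vars. The resulting predicate, restated on its own (a sketch mirroring the patched logic; the helper name is invented):

    #include "paddle/fluid/framework/var_desc.h"

    // Mirrors IsPersistable after this patch: only real parameters qualify.
    bool IsLoadableParam(const paddle::framework::VarDesc* var) {
      using paddle::framework::proto::VarType;
      return var->Persistable() &&
             var->GetType() != VarType::FEED_MINIBATCH &&
             var->GetType() != VarType::FETCH_LIST &&
             var->GetType() != VarType::RAW;
    }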
@@ -25,7 +25,7 @@ class ConcatOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
+    VLOG(40) << "convert a fluid mul op to tensorrt mul layer without bias";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
...
@@ -25,7 +25,7 @@ class DropoutOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid dropout op to tensorrt dropout layer";
+    VLOG(40) << "convert a fluid dropout op to tensorrt dropout layer";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
...
@@ -52,7 +52,7 @@ class FcOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+    VLOG(40) << "convert a fluid fc op to tensorrt fc layer without bias";
     framework::OpDesc op_desc(op, nullptr);
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
...
@@ -25,7 +25,7 @@ class MulOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
+    VLOG(40) << "convert a fluid mul op to tensorrt mul layer without bias";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
...
@@ -25,7 +25,7 @@ class PadOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid transpose op to tensorrt tranpose layer";
+    VLOG(40) << "convert a fluid transpose op to tensorrt tranpose layer";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
...
@@ -25,7 +25,7 @@ class Pool2dOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4)
-        << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
+    VLOG(40)
+        << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
...
@@ -25,7 +25,7 @@ class SoftMaxOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-    VLOG(4)
-        << "convert a fluid softmax op to tensorrt softmax layer without bias";
+    VLOG(40)
+        << "convert a fluid softmax op to tensorrt softmax layer without bias";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
...
@@ -134,7 +134,7 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
       weight_map;

-  // TODO: (NHZLX)
+  // TODO(NHZLX)
   // In the normal case, the paddle-trt exists bug when runing the googlenet.
   // When there are more than two convolutions of 1 * 1 with the same input, the
   // paddle-tensorrt will do the merging optimization, which fuse those conv
...
@@ -27,7 +27,7 @@ struct Record {
 };

 Record ProcessALine(const std::string &line) {
-  VLOG(3) << "process a line";
+  VLOG(30) << "process a line";
   std::vector<std::string> columns;
   split(line, '\t', &columns);
   CHECK_EQ(columns.size(), 2UL)
@@ -45,8 +45,8 @@ Record ProcessALine(const std::string &line) {
   for (auto &s : shape_strs) {
     record.shape.push_back(std::stoi(s));
   }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
+  VLOG(30) << "data size " << record.data.size();
+  VLOG(30) << "data shape size " << record.shape.size();
   return record;
 }
...
@@ -32,11 +32,11 @@ BuddyAllocator::BuddyAllocator(
       system_allocator_(std::move(system_allocator)) {}

 BuddyAllocator::~BuddyAllocator() {
-  VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these "
-              "have actually been freed";
+  VLOG(100) << "BuddyAllocator Disconstructor makes sure that all of these "
+               "have actually been freed";
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+    VLOG(100) << "Free from block (" << block << ", " << max_chunk_size_ << ")";

     system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
@@ -57,12 +57,12 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // acquire the allocator lock
   std::lock_guard<std::mutex> lock(mutex_);

-  VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size "
-           << size;
+  VLOG(100) << "Allocate " << unaligned_size << " bytes from chunk size "
+            << size;

   // if the allocation is huge, send directly to the system allocator
   if (size > max_chunk_size_) {
-    VLOG(10) << "Allocate from system allocator.";
+    VLOG(100) << "Allocate from system allocator.";
     return SystemAlloc(size);
   }
@@ -77,9 +77,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
       return nullptr;
     }
   } else {
-    VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
-             << " at address "
-             << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
+    VLOG(100) << "Allocation from existing memory block " << std::get<2>(*it)
+              << " at address "
+              << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
   }

   total_used_ += size;
@@ -96,10 +96,10 @@ void BuddyAllocator::Free(void* p) {
   // Acquire the allocator lock
   std::lock_guard<std::mutex> lock(mutex_);

-  VLOG(10) << "Free from address " << block;
+  VLOG(100) << "Free from address " << block;

   if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
-    VLOG(10) << "Free directly from system allocator";
+    VLOG(100) << "Free directly from system allocator";
     system_allocator_->Free(block, block->total_size(cache_),
                             block->index(cache_));
@@ -116,8 +116,8 @@ void BuddyAllocator::Free(void* p) {
   // Trying to merge the right buddy
   if (block->has_right_buddy(cache_)) {
-    VLOG(10) << "Merging this block " << block << " with its right buddy "
-             << block->right_buddy(cache_);
+    VLOG(100) << "Merging this block " << block << " with its right buddy "
+              << block->right_buddy(cache_);

     auto right_buddy = block->right_buddy(cache_);
@@ -134,8 +134,8 @@ void BuddyAllocator::Free(void* p) {
   // Trying to merge the left buddy
   if (block->has_left_buddy(cache_)) {
-    VLOG(10) << "Merging this block " << block << " with its left buddy "
-             << block->left_buddy(cache_);
+    VLOG(100) << "Merging this block " << block << " with its left buddy "
+              << block->left_buddy(cache_);

     auto left_buddy = block->left_buddy(cache_);
@@ -151,8 +151,8 @@ void BuddyAllocator::Free(void* p) {
   }

   // Dumping this block into pool
-  VLOG(10) << "Inserting free block (" << block << ", "
-           << block->total_size(cache_) << ")";
+  VLOG(100) << "Inserting free block (" << block << ", "
+            << block->total_size(cache_) << ")";
   pool_.insert(
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
...@@ -174,7 +174,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { ...@@ -174,7 +174,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0; size_t index = 0;
void* p = system_allocator_->Alloc(&index, size); void* p = system_allocator_->Alloc(&index, size);
VLOG(10) << "Allocated " << p << " from system allocator."; VLOG(100) << "Allocated " << p << " from system allocator.";
if (p == nullptr) return nullptr; if (p == nullptr) return nullptr;
...@@ -200,8 +200,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { ...@@ -200,8 +200,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
if (p == nullptr) return pool_.end(); if (p == nullptr) return pool_.end();
VLOG(10) << "Creating and inserting new block " << p VLOG(100) << "Creating and inserting new block " << p
<< " from system allocator"; << " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr); max_chunk_size_, nullptr, nullptr);
...@@ -245,19 +245,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, ...@@ -245,19 +245,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
auto block = static_cast<MemoryBlock*>(std::get<2>(*it)); auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
pool_.erase(it); pool_.erase(it);
VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) VLOG(100) << "Split block (" << block << ", " << block->total_size(cache_)
<< ") into"; << ") into";
block->split(&cache_, size); block->split(&cache_, size);
VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) VLOG(100) << "Left block (" << block << ", " << block->total_size(cache_)
<< ")"; << ")";
block->set_type(&cache_, MemoryBlock::ARENA_CHUNK); block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);
// the rest of the memory, if it exists // the rest of the memory, if it exists
if (block->has_right_buddy(cache_)) { if (block->has_right_buddy(cache_)) {
if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", " VLOG(100) << "Insert right block (" << block->right_buddy(cache_) << ", "
<< block->right_buddy(cache_)->total_size(cache_) << ")"; << block->right_buddy(cache_)->total_size(cache_) << ")";
pool_.insert( pool_.insert(
IndexSizeAddress(block->right_buddy(cache_)->index(cache_), IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
...@@ -284,7 +284,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() { ...@@ -284,7 +284,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
return; return;
} }
VLOG(10) << "Return block " << block << " to fallback allocator."; VLOG(100) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block); cache_.invalidate(block);
...@@ -320,7 +320,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() { ...@@ -320,7 +320,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool)); MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
VLOG(10) << "Return block " << block << " to base allocator."; VLOG(100) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block); cache_.invalidate(block);
......
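The log messages above narrate the classic buddy-allocator life cycle: split a block to satisfy a small allocation, then merge freed blocks with their left/right buddies. A conceptual sketch of the textbook power-of-two arithmetic; Paddle's BuddyAllocator tracks buddies through MemoryBlock metadata rather than this XOR trick, so treat it purely as an illustration:

#include <cstddef>
#include <cstdint>

// For power-of-two block sizes, a block's buddy lives at the offset with the
// size bit flipped: the result is the right buddy if the bit was 0, the left
// buddy if it was 1. Merging is legal only when both buddies are free.
inline std::uintptr_t BuddyOf(std::uintptr_t offset, std::size_t block_size) {
  return offset ^ block_size;
}

// Splitting block [offset, offset + size) yields two buddies of size / 2,
// matching the "Split block ... into" / "Insert right block" logs above.
struct SplitHalves {
  std::uintptr_t left;
  std::uintptr_t right;
};

inline SplitHalves Split(std::uintptr_t offset, std::size_t size) {
  return {offset, offset + size / 2};
}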
...@@ -29,7 +29,7 @@ MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const { ...@@ -29,7 +29,7 @@ MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
return existing_desc->second; return existing_desc->second;
} else { } else {
auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block); auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type; VLOG(100) << "Load MemoryBlock::Desc type=" << desc->type;
PADDLE_ASSERT(desc->check_guards()); PADDLE_ASSERT(desc->check_guards());
return *reinterpret_cast<const MemoryBlock::Desc*>(block); return *reinterpret_cast<const MemoryBlock::Desc*>(block);
} }
......
...@@ -78,7 +78,7 @@ void* Alloc<platform::CPUPlace>(const platform::CPUPlace& place, size_t size) { ...@@ -78,7 +78,7 @@ void* Alloc<platform::CPUPlace>(const platform::CPUPlace& place, size_t size) {
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size); memset(p, 0xEF, size);
} }
VLOG(10) << " pointer=" << p; VLOG(100) << " pointer=" << p;
return p; return p;
} }
...@@ -111,12 +111,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { ...@@ -111,12 +111,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)), std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use " VLOG(100) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100 << FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n" << "% of GPU memory.\n"
<< "You can set GFlags environment variable '" << "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use" << "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n"; << "' to change the fraction of GPU usage.\n\n";
} }
}); });
......
...@@ -317,6 +317,7 @@ op_library(save_op DEPS lod_tensor) ...@@ -317,6 +317,7 @@ op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor)
op_library(tensor_array_to_tensor_op DEPS concat_op)
op_library(concat_op DEPS concat_and_split) op_library(concat_op DEPS concat_and_split)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
......
...@@ -91,16 +91,12 @@ class ActivationOp : public framework::OperatorWithKernel { ...@@ -91,16 +91,12 @@ class ActivationOp : public framework::OperatorWithKernel {
} }
}; };
class ActivationOpInferVarType : public framework::VarTypeInference { class ActivationOpInferVarType
public: : public framework::PassInDtypeAndVarTypeToOutput {
void operator()(const framework::OpDesc& op_desc, protected:
framework::BlockDesc* block) const override { std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
auto x_name = op_desc.Input("X")[0]; const override {
auto out_name = op_desc.Output("Out")[0]; return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
auto& x = block->FindRecursiveOrCreateVar(x_name);
auto& out = block->FindRecursiveOrCreateVar(out_name);
out.SetType(x.GetType());
out.SetDataType(x.GetDataType());
} }
}; };
......
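The replacement above swaps a hand-written operator() that copied the input variable's type and dtype onto the output for a shared base class; subclasses now only declare which input/output pairs share a type. A sketch of what that base class plausibly does, using stand-in Var and map types rather than the real framework::VarTypeInference API:

#include <string>
#include <unordered_map>

struct VarSketch {
  int type;
  int dtype;
};

class PassThroughVarTypeSketch {
 public:
  virtual ~PassThroughVarTypeSketch() = default;

  // For each {input, output} pair declared by the subclass, forward the
  // input's type and data type: exactly what the removed code did by hand
  // via SetType(GetType()) and SetDataType(GetDataType()).
  void Apply(std::unordered_map<std::string, VarSketch>* vars) const {
    for (const auto& kv : GetInputOutputWithSameType()) {
      const VarSketch& in = vars->at(kv.first);
      VarSketch& out = (*vars)[kv.second];
      out.type = in.type;
      out.dtype = in.dtype;
    }
  }

 protected:
  virtual std::unordered_map<std::string, std::string>
  GetInputOutputWithSameType() const = 0;
};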
...@@ -95,7 +95,7 @@ class ActivationGradKernel ...@@ -95,7 +95,7 @@ class ActivationGradKernel
auto x = framework::EigenVector<T>::Flatten(*X); auto x = framework::EigenVector<T>::Flatten(*X);
functor(*place, x, out, dout, dx); functor(*place, x, out, dout, dx);
} else { } else {
VLOG(10) << " Inplace activation "; VLOG(100) << " Inplace activation ";
auto x = framework::EigenVector<T>::Flatten(*dX); auto x = framework::EigenVector<T>::Flatten(*dX);
functor(*place, x, out, dout, dx); functor(*place, x, out, dout, dx);
} }
......
...@@ -297,7 +297,7 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -297,7 +297,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto& grad = auto& grad =
Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad"); Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
if (grad.rows().size() == 0) { if (grad.rows().size() == 0) {
VLOG(3) << "grad row size is 0!!"; VLOG(30) << "grad row size is 0!!";
return; return;
} }
......
...@@ -66,9 +66,10 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> { ...@@ -66,9 +66,10 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i]; x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
for (int j = 0; j < max_length; ++j) { for (int j = 0; j < max_length; ++j) {
for (int k = 0; k < half_size; ++k) { for (int k = 0; k < half_size; ++k) {
const double val = (half_size > 1) const double val =
? j / pow(10000.0, double(k) / (half_size - 1)) (half_size > 1)
: j / 10000.0; ? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
: j / 10000.0;
dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta; dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
dst_ptr[half_size + k] = dst_ptr[half_size + k] =
src_ptr[half_size + k] * alpha + cos(val) * beta; src_ptr[half_size + k] * alpha + cos(val) * beta;
......
...@@ -49,7 +49,7 @@ class ArrayOp : public framework::OperatorBase { ...@@ -49,7 +49,7 @@ class ArrayOp : public framework::OperatorBase {
} else { } else {
offset = static_cast<size_t>(*i_tensor.data<int64_t>()); offset = static_cast<size_t>(*i_tensor.data<int64_t>());
} }
VLOG(10) << " Offset = " << offset; VLOG(100) << " Offset = " << offset;
return offset; return offset;
} }
}; };
......
...@@ -148,8 +148,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { ...@@ -148,8 +148,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
size_t start_offset = lod_and_offset.second.first; size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second; size_t end_offset = lod_and_offset.second.second;
VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " [" VLOG(100) << "idx=" << idx << " x_idx=" << x_idx << " ["
<< ", " << end_offset << "]"; << ", " << end_offset << "]";
// Copy data // Copy data
PADDLE_ENFORCE_GE(end_offset, start_offset); PADDLE_ENFORCE_GE(end_offset, start_offset);
size_t len = end_offset - start_offset; size_t len = end_offset - start_offset;
......
...@@ -170,6 +170,15 @@ The required data format for this layer is one of the following: ...@@ -170,6 +170,15 @@ The required data format for this layer is one of the following:
} }
}; };
class BatchNormOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
}
};
template <typename T> template <typename T>
class BatchNormKernel<platform::CPUDeviceContext, T> class BatchNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> { : public framework::OpKernel<T> {
...@@ -525,7 +534,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { ...@@ -525,7 +534,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
ops::BatchNormGradMaker); ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
......
...@@ -96,7 +96,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -96,7 +96,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
mode_ = CUDNN_BATCHNORM_SPATIAL; mode_ = CUDNN_BATCHNORM_SPATIAL;
#endif #endif
VLOG(3) << "Setting descriptors."; VLOG(30) << "Setting descriptors.";
std::vector<int> dims; std::vector<int> dims;
std::vector<int> strides; std::vector<int> strides;
if (data_layout == DataLayout::kNCHW) { if (data_layout == DataLayout::kNCHW) {
......
...@@ -33,11 +33,11 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, ...@@ -33,11 +33,11 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
auto selected_items = ToMap(items, high_level.back()); auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:"; VLOG(30) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) { for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset:" << i; VLOG(30) << "offset:" << i;
for (auto &item : selected_items[i]) { for (auto &item : selected_items[i]) {
VLOG(3) << ItemToString(item); VLOG(30) << ItemToString(item);
} }
} }
...@@ -138,11 +138,11 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems( ...@@ -138,11 +138,11 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
} }
result.emplace_back(items); result.emplace_back(items);
} }
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); VLOG(30) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) { for (auto &items : result) {
VLOG(3) << "item set:"; VLOG(30) << "item set:";
for (auto &item : items) { for (auto &item : items) {
VLOG(3) << ItemToString(item); VLOG(30) << ItemToString(item);
} }
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class BilinearInterpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input_t = ctx.Input<Tensor>("X"); // float tensor
auto* output_t = ctx.Output<Tensor>("Out"); // float tensor
auto out_dims = output_t->dims();
auto* input = input_t->data<T>();
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_size_t = ctx.Input<Tensor>("OutSize");
if (out_size_t != nullptr) {
auto out_size_data = out_size_t->data<int>();
out_h = out_size_data[0];
out_w = out_size_data[1];
}
auto* output = output_t->mutable_data<T>(
{out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
int batch_size = input_t->dims()[0];
int channels = input_t->dims()[1];
int in_h = input_t->dims()[2];
int in_w = input_t->dims()[3];
int in_hw = in_h * in_w;
int out_hw = out_h * out_w;
int in_chw = channels * in_hw;
int out_chw = channels * out_hw;
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) {
memcpy(output, input, input_t->numel() * sizeof(T));
} else {
for (int k = 0; k < batch_size; ++k) { // loop for batches
for (int i = 0; i < out_h; ++i) { // loop over output rows
int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0;
float h1lambda = ratio_h * i - h;
float h2lambda = 1.f - h1lambda;
for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0;
float w1lambda = ratio_w * j - w;
float w2lambda = 1.f - w1lambda;
// calculate four position for bilinear interpolation
const T* in_pos = &input[k * in_chw + h * in_w + w];
T* out_pos = &output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels
// bilinear interpolation
out_pos[0] = static_cast<T>(
h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
h1lambda * (w2lambda * in_pos[hid * in_w] +
w1lambda * in_pos[hid * in_w + wid]));
in_pos += in_hw;
out_pos += out_hw;
}
}
}
}
}
}
};
template <typename T>
class BilinearInterpGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* d_output = d_output_t->data<T>();
auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
auto& device_ctx =
ctx.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> zero;
zero(device_ctx, d_input_t, static_cast<T>(0.0));
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_size_t = ctx.Input<Tensor>("OutSize");
if (out_size_t != nullptr) {
auto out_size_data = out_size_t->data<int>();
out_h = out_size_data[0];
out_w = out_size_data[1];
}
int batch_size = d_input_t->dims()[0];
int channels = d_input_t->dims()[1];
int in_h = d_input_t->dims()[2];
int in_w = d_input_t->dims()[3];
int in_hw = in_h * in_w;
int out_hw = out_h * out_w;
int in_chw = channels * in_hw;
int out_chw = channels * out_hw;
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) {
memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
} else {
for (int k = 0; k < batch_size; ++k) { // loop for batches
for (int i = 0; i < out_h; ++i) { // loop over output rows
int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0;
float h1lambda = ratio_h * i - h;
float h2lambda = 1 - h1lambda;
for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0;
float w1lambda = ratio_w * j - w;
float w2lambda = 1 - w1lambda;
T* in_pos = &d_input[k * in_chw + h * in_w + w];
const T* out_pos = &d_output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels
in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
in_pos[hid * in_w] +=
static_cast<T>(h1lambda * w2lambda * out_pos[0]);
in_pos[hid * in_w + wid] +=
static_cast<T>(h1lambda * w1lambda * out_pos[0]);
in_pos += in_hw;
out_pos += out_hw;
}
}
}
}
}
}
};
} // namespace operators
} // namespace paddle
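The kernels above fuse the interpolation over batches and channels; factored down to one output pixel of a single-channel image, the computation is the usual four-neighbour blend with the same h/w lambda weights. A standalone sketch (a hypothetical helper, not part of the file above):

#include <vector>

// out(i, j) = h2l * (w2l * p(h, w)     + w1l * p(h, w+wid))
//           + h1l * (w2l * p(h+hid, w) + w1l * p(h+hid, w+wid))
// where hid/wid collapse to 0 on the bottom/right edges, as in the kernel.
float BilinearAt(const std::vector<float>& img, int in_h, int in_w,
                 float ratio_h, float ratio_w, int i, int j) {
  const int h = static_cast<int>(ratio_h * i);
  const int w = static_cast<int>(ratio_w * j);
  const int hid = (h < in_h - 1) ? 1 : 0;  // clamp at the bottom edge
  const int wid = (w < in_w - 1) ? 1 : 0;  // clamp at the right edge
  const float h1l = ratio_h * i - h, h2l = 1.f - h1l;
  const float w1l = ratio_w * j - w, w2l = 1.f - w1l;
  const float* p = &img[h * in_w + w];
  return h2l * (w2l * p[0] + w1l * p[wid]) +
         h1l * (w2l * p[hid * in_w] + w1l * p[hid * in_w + wid]);
}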
...@@ -46,8 +46,8 @@ class CheckpointNotifyOp : public framework::OperatorBase { ...@@ -46,8 +46,8 @@ class CheckpointNotifyOp : public framework::OperatorBase {
auto lookup_table_save_dir = auto lookup_table_save_dir =
string::Sprintf("%s/%s_%d", dir, lookup_table_name, i); string::Sprintf("%s/%s_%d", dir, lookup_table_name, i);
rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir); rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir);
VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name VLOG(30) << "checkpoint notify sending lookup table: "
<< " and dir:" << dir << " to " << epmap[i]; << lookup_table_name << " and dir:" << dir << " to " << epmap[i];
} }
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
} }
......
...@@ -37,7 +37,7 @@ class ConcatOp : public framework::OperatorWithKernel { ...@@ -37,7 +37,7 @@ class ConcatOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_GT(n, 0, "Input tensor count should be > 0."); PADDLE_ENFORCE_GT(n, 0, "Input tensor count should be > 0.");
if (n == 1) { if (n == 1) {
VLOG(3) << "Warning: concat op have only one input, may waste memory"; VLOG(30) << "Warning: concat op have only one input, may waste memory";
} }
auto out_dims = ins[0]; auto out_dims = ins[0];
......
...@@ -15,15 +15,22 @@ limitations under the License. */ ...@@ -15,15 +15,22 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(cudnn_deterministic, false, DEFINE_bool(cudnn_deterministic, false,
"Whether allow using an autotuning algorithm for convolution " "Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If " "operator. The autotuning algorithm may be non-deterministic. If "
"true, the algorithm is deterministic."); "true, the algorithm is deterministic.");
DEFINE_uint64(conv_workspace_size_limit, 4096,
"cuDNN convolution workspace limit in MB unit.");
DEFINE_bool(cudnn_exhaustive_search, false,
"Whether enable exhaustive search for cuDNN convolution or "
"not, defalut is False.");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -36,13 +43,25 @@ using DataLayout = platform::DataLayout; ...@@ -36,13 +43,25 @@ using DataLayout = platform::DataLayout;
template <typename T> template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType; using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024; static_cast<size_t>(1024) * 1024 * 1024;
static constexpr size_t kNUM_CUDNN_FWD_ALGS =
CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
template <typename T> template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> { class CUDNNConvOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace."); "It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("Input"); auto* input = ctx.Input<Tensor>("Input");
...@@ -55,6 +74,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -55,6 +74,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
int64_t user_workspace_size = int64_t user_workspace_size =
static_cast<size_t>(ctx.Attr<int>("workspace_size_MB")); static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>(); const T* filter_data = filter->data<T>();
...@@ -120,19 +141,19 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -120,19 +141,19 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv workspace --------------------- // ------------------- cudnn conv workspace ---------------------
size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (user_workspace_size > 0) { if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024; int64_t max_user_size =
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
user_workspace_size);
workspace_size_limit = max_user_size * 1024 * 1024;
} }
// ------------------- cudnn conv algorithm --------------------- // ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t algo; cudnnConvolutionFwdAlgo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( bool half_float = false;
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and // Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16 // is only enabled when input and filter data are float16
...@@ -143,14 +164,66 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -143,14 +164,66 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo // Currently tensor core is only enabled using this algo
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
VLOG(5) << "use cudnn_tensor_op_math"; half_float = true;
VLOG(50) << "use cudnn_tensor_op_math";
} else { } else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH)); cudnn_conv_desc, CUDNN_DEFAULT_MATH));
VLOG(5) << "NOT use cudnn_tensor_op_math"; VLOG(50) << "NOT use cudnn_tensor_op_math";
} }
#endif #endif
auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
if ((!exhaustive_search) && (!half_float)) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo;
} else if (exhaustive_search && (!half_float)) {
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
algo = algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc,
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace,
workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return fwd_perf_stat[0].algo;
});
VLOG(3) << "choose algo " << algo;
} else {
PADDLE_ENFORCE(half_float,
"cuDNN exhaustive search doesn't support half float.");
}
// get workspace size able to allocate // get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
...@@ -162,7 +235,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -162,7 +235,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv forward --------------------- // ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
auto cudnn_func = [&](void* cudnn_workspace) { auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
...@@ -180,6 +252,7 @@ template <typename T> ...@@ -180,6 +252,7 @@ template <typename T>
class CUDNNConvGradOpKernel : public framework::OpKernel<T> { class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace."); "It must use CUDAPlace.");
auto input = ctx.Input<Tensor>("Input"); auto input = ctx.Input<Tensor>("Input");
...@@ -198,6 +271,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -198,6 +271,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
int64_t user_workspace_size = int64_t user_workspace_size =
static_cast<size_t>(ctx.Attr<int>("workspace_size_MB")); static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
if (exhaustive_search && FLAGS_cudnn_deterministic) {
PADDLE_THROW(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time.");
}
// ------------------- cudnn descriptors --------------------- // ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc; ScopedTensorDescriptor input_desc;
...@@ -265,14 +345,66 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -265,14 +345,66 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnnConvolutionBwdFilterAlgo_t filter_algo; cudnnConvolutionBwdFilterAlgo_t filter_algo;
size_t workspace_size_in_bytes = 0, tmp_size = 0; size_t workspace_size_in_bytes = 0, tmp_size = 0;
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (user_workspace_size > 0) { if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024; int64_t max_user_size =
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
user_workspace_size);
workspace_size_limit = max_user_size * 1024 * 1024;
} }
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
if (input_grad) { if (input_grad) {
if (!FLAGS_cudnn_deterministic) { T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) {
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
data_algo_cache =
ctx.scope()
.FindVar(kCUDNNBwdDataAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
} else {
data_algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNBwdDataAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
}
data_algo = data_algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionBwdDataAlgoPerf_t,
kNUM_CUDNN_BWD_DATA_ALGS>
data_perf_stat;
auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::
cudnnFindConvolutionBackwardDataAlgorithmEx(
handle, cudnn_filter_desc, filter_data,
cudnn_output_grad_desc, output_grad_data,
cudnn_conv_desc, cudnn_input_desc, input_grad_data,
kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
data_perf_stat.data(), cudnn_workspace,
workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_bd_data_func,
workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = data_perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return data_perf_stat[0].algo;
});
VLOG(3) << "cuDNN backward data algo " << data_algo;
} else if (FLAGS_cudnn_deterministic) {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else {
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc, handle, cudnn_filter_desc,
...@@ -285,10 +417,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -285,10 +417,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_input_desc, cudnn_input_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo)); workspace_size_limit, &data_algo));
} else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} }
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc, handle, cudnn_filter_desc, cudnn_output_grad_desc,
...@@ -297,17 +426,54 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -297,17 +426,54 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} }
if (filter_grad) { if (filter_grad) {
if (!FLAGS_cudnn_deterministic) { T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) {
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
f_algo_cache =
ctx.scope()
.FindVar(kCUDNNBwdFilterAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
} else {
f_algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNBwdFilterAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
}
filter_algo = f_algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
kNUM_CUDNN_BWD_FILTER_ALGS>
filter_perf_stat;
auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithmEx(
handle, cudnn_input_desc, input_data,
cudnn_output_grad_desc, output_grad_data,
cudnn_conv_desc, cudnn_filter_desc,
filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
&returned_algo_count, filter_perf_stat.data(),
cudnn_workspace, workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_bd_f_func,
workspace_size_limit);
return filter_perf_stat[0].algo;
});
VLOG(3) << "cuDNN backward filter algo " << filter_algo;
} else if (FLAGS_cudnn_deterministic) {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else {
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_input_desc, cudnn_output_grad_desc, handle, cudnn_input_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo)); workspace_size_limit, &filter_algo));
} else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} }
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
...@@ -317,7 +483,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -317,7 +483,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
if (input_grad) { if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad. // Because beta is zero, it is unnecessary to reset input_grad.
......
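Both kernels above now choose the cuDNN algorithm in one of three ways: an exhaustive benchmark memoized in the scope, a fixed deterministic algorithm, or cuDNN's workspace-bounded heuristic. They also throw when exhaustive search and determinism are requested together. A condensed sketch of just that decision (names hypothetical; the real code additionally special-cases float16 tensor-op math):

#include <stdexcept>

enum class AlgoStrategy { kExhaustive, kDeterministic, kHeuristic };

inline AlgoStrategy ChooseStrategy(bool exhaustive_search, bool deterministic) {
  // Mirrors the PADDLE_THROW guard in CUDNNConvGradOpKernel above.
  if (exhaustive_search && deterministic)
    throw std::invalid_argument(
        "exhaustive_search and cudnn_deterministic are mutually exclusive");
  if (exhaustive_search) return AlgoStrategy::kExhaustive;  // Find*AlgorithmEx, then cache
  if (deterministic) return AlgoStrategy::kDeterministic;   // fixed *_ALGO_1
  return AlgoStrategy::kHeuristic;  // cudnnGet*Algorithm within the workspace limit
}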
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <mutex>
#include <unordered_map>
#include <vector>
namespace paddle {
namespace operators {
template <typename TAlgorithm>
class AlgorithmsCache {
public:
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations,
int algorithmFlags, // can be set per data type
std::function<TAlgorithm()> gen_func);
private:
std::unordered_map<int64_t, TAlgorithm> hash_;
std::mutex mutex_;
};
template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
std::lock_guard<std::mutex> lock(mutex_);
int64_t seed = 0;
// Hash all of the inputs and use the result to look up a previously
// discovered algorithm, or fall back to generating a new one.
std::hash<int64_t> hashFn;
// combine hash values boost-style; see
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for (const auto num : dims1) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
for (const auto num : dims2) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
}
for (const auto num : strides) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 2;
}
for (const auto num : paddings) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 3;
}
for (const auto num : dilations) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 4;
}
seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
(seed << 6) + (seed >> 2) + 5;
if (seed == 0) return gen_func();
if (hash_.find(seed) == hash_.end()) {
TAlgorithm value = gen_func();
hash_[seed] = value;
}
return hash_[seed];
}
} // namespace operators
} // namespace paddle
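A usage sketch of the cache defined above, mirroring the GetAlgorithm calls in the conv kernels: the generator lambda runs only on the first lookup for a given dims/strides/paddings/dilations key, and later lookups are served from the hash map. Here int stands in for the cudnn algo enums, and the header above is assumed to be included:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  paddle::operators::AlgorithmsCache<int> cache;
  std::vector<int64_t> x_dims{1, 3, 224, 224}, f_dims{64, 3, 7, 7};
  std::vector<int> strides{2, 2}, paddings{3, 3}, dilations{1, 1};

  int searches = 0;
  auto expensive_search = [&]() {
    ++searches;  // imagine cudnnFindConvolutionForwardAlgorithmEx here
    return 42;
  };

  int a = cache.GetAlgorithm(x_dims, f_dims, strides, paddings, dilations,
                             /*algorithmFlags=*/0, expensive_search);
  int b = cache.GetAlgorithm(x_dims, f_dims, strides, paddings, dilations,
                             /*algorithmFlags=*/0, expensive_search);
  std::cout << a << " " << b << " searches=" << searches << "\n";  // 42 42 1
  return 0;
}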
...@@ -189,6 +189,11 @@ void Conv2DOpMaker::Make() { ...@@ -189,6 +189,11 @@ void Conv2DOpMaker::Make() {
"workspace size can increase performance but also requires " "workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully.") "better hardware. This size should be chosen carefully.")
.SetDefault(4096); .SetDefault(4096);
AddAttr<bool>("exhaustive_search",
"(bool, default false) cuDNN has many algorithm to calculation "
"convolution, whether enable exhaustive search ",
"for cuDNN convolution or not, defalut is False.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Convolution Operator. Convolution Operator.
...@@ -219,6 +224,15 @@ $$ ...@@ -219,6 +224,15 @@ $$
)DOC"); )DOC");
} }
class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{
{"Input", /*->*/ "Output"}};
}
};
void Conv3DOpMaker::Make() { void Conv3DOpMaker::Make() {
AddInput( AddInput(
"Input", "Input",
...@@ -283,7 +297,11 @@ void Conv3DOpMaker::Make() { ...@@ -283,7 +297,11 @@ void Conv3DOpMaker::Make() {
"workspace size can increase performance but also requires " "workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully.") "better hardware. This size should be chosen carefully.")
.SetDefault(4096); .SetDefault(4096);
AddAttr<bool>("exhaustive_search",
"(bool, default false) cuDNN has many algorithm to calculation "
"convolution, whether enable exhaustive search ",
"for cuDNN convolution or not, defalut is False.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Convolution3D Operator. Convolution3D Operator.
...@@ -356,6 +374,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( ...@@ -356,6 +374,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
ops::ConvOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
...@@ -363,7 +382,9 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); ...@@ -363,7 +382,9 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
ops::ConvOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h" #include "paddle/fluid/operators/cross_entropy_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -179,6 +180,15 @@ or not. But the output only shares the LoD information with input X. ...@@ -179,6 +180,15 @@ or not. But the output only shares the LoD information with input X.
)DOC"); )DOC");
} }
}; };
class CrossEntropyOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -186,6 +196,7 @@ namespace ops = paddle::operators; ...@@ -186,6 +196,7 @@ namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext; using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
ops::CrossEntropyOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>, REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
......
...@@ -133,10 +133,10 @@ void AsyncBRPCServer::StartServer() { ...@@ -133,10 +133,10 @@ void AsyncBRPCServer::StartServer() {
void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); }
void AsyncBRPCServer::WaitServerReady() { void AsyncBRPCServer::WaitServerReady() {
VLOG(3) << "AsyncGRPCServer is wait server ready"; VLOG(30) << "AsyncGRPCServer is wait server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_); std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
VLOG(3) << "AsyncGRPCServer WaitSeverReady"; VLOG(30) << "AsyncGRPCServer WaitSeverReady";
} }
}; // namespace distributed }; // namespace distributed
......
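WaitServerReady above blocks on a condition variable until ready_ becomes 1. The signaling side is not shown in this hunk; a minimal sketch of the conventional handshake (set the flag under the mutex, then notify) that the server start-up path presumably performs:

#include <condition_variable>
#include <mutex>

class ReadinessSketch {
 public:
  // Consumer side: the same shape as the wait() call above.
  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return ready_ == 1; });
  }

  // Producer side: flip the flag under the lock so the predicate re-check
  // in Wait() cannot miss it, then wake all waiters.
  void Signal() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      ready_ = 1;
    }
    cv_.notify_all();
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  int ready_ = 0;
};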
...@@ -38,7 +38,7 @@ void GRPCClient::SendComplete() { ...@@ -38,7 +38,7 @@ void GRPCClient::SendComplete() {
std::unique_lock<std::mutex> lk(completed_mutex_); std::unique_lock<std::mutex> lk(completed_mutex_);
if (!completed_) { if (!completed_) {
for (auto& it : channels_) { for (auto& it : channels_) {
VLOG(3) << "send complete message to " << it.first; VLOG(30) << "send complete message to " << it.first;
this->AsyncSendComplete(it.first); this->AsyncSendComplete(it.first);
} }
PADDLE_ENFORCE(this->Wait(), "internal grpc error"); PADDLE_ENFORCE(this->Wait(), "internal grpc error");
...@@ -81,7 +81,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, ...@@ -81,7 +81,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
// stub context // stub context
s->response_call_back_ = nullptr; s->response_call_back_ = nullptr;
...@@ -142,7 +142,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, ...@@ -142,7 +142,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
::grpc::ByteBuffer buf; ::grpc::ByteBuffer buf;
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf); RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
...@@ -190,7 +190,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -190,7 +190,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
...@@ -328,14 +328,14 @@ void GRPCClient::Proceed() { ...@@ -328,14 +328,14 @@ void GRPCClient::Proceed() {
void* tag = nullptr; void* tag = nullptr;
bool ok = false; bool ok = false;
VLOG(3) << "GRPCClient Proceed begin"; VLOG(30) << "GRPCClient Proceed begin";
while (!stopped_ && cq_.Next(&tag, &ok)) { while (!stopped_ && cq_.Next(&tag, &ok)) {
BaseProcessor* c = static_cast<BaseProcessor*>(tag); BaseProcessor* c = static_cast<BaseProcessor*>(tag);
GPR_ASSERT(ok); GPR_ASSERT(ok);
PADDLE_ENFORCE(c); PADDLE_ENFORCE(c);
if (c->status_.ok()) { if (c->status_.ok()) {
VLOG(3) << c->GetVarHandlePtr()->String() << " process"; VLOG(30) << c->GetVarHandlePtr()->String() << " process";
c->Process(); c->Process();
} else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
// FIXME(gongwb): parse error_details? // FIXME(gongwb): parse error_details?
...@@ -370,7 +370,7 @@ void GRPCClient::Proceed() { ...@@ -370,7 +370,7 @@ void GRPCClient::Proceed() {
sync_cond_.notify_all(); sync_cond_.notify_all();
} }
} }
VLOG(3) << "GRPCClient Proceed end"; VLOG(30) << "GRPCClient Proceed end";
} }
std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) { std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
......
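Proceed() above is the standard gRPC completion-queue drain: block in Next(), cast the opaque tag back to its processor object, and dispatch on status until shutdown. A self-contained skeleton of that pattern, with a templated stand-in for grpc::CompletionQueue so the sketch compiles without gRPC:

struct ProcessorSketch {
  bool status_ok = true;
  void Process() {}  // success path, as in the "... process" log above
  void Finish() {}   // error / deadline-exceeded path
};

// Queue only needs bool Next(void** tag, bool* ok), the same shape as
// grpc::CompletionQueue::Next; ok == false means the event failed or the
// queue is draining after Shutdown().
template <typename Queue>
void ProceedSketch(Queue* cq, const bool* stopped) {
  void* tag = nullptr;
  bool ok = false;
  while (!*stopped && cq->Next(&tag, &ok)) {
    auto* c = static_cast<ProcessorSketch*>(tag);
    if (ok && c->status_ok) {
      c->Process();
    } else {
      c->Finish();
    }
  }
}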
@@ -98,7 +98,7 @@ class RequestSend final : public RequestBase {
  void Process() override {
    std::string varname = GetReqName();
-    VLOG(4) << "RequestSend var_name:" << varname;
+    VLOG(40) << "RequestSend var_name:" << varname;
    auto scope = request_->GetMutableLocalScope();
    auto invar = request_->GetVar();
@@ -135,7 +135,7 @@ class RequestGet final : public RequestBase {
    // proc request.
    std::string varname = request_.varname();
    int trainer_id = request_.trainer_id();
-    VLOG(4) << "RequestGet " << varname;
+    VLOG(40) << "RequestGet " << varname;
    auto scope = request_handler_->scope();
    auto invar = scope->FindVar(varname);
@@ -182,8 +182,8 @@ class RequestPrefetch final : public RequestBase {
    std::string in_var_name = request_->Varname();
    std::string out_var_name = request_->OutVarname();
    int trainer_id = request_->GetTrainerId();
-    VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
-            << " out_var_name: " << out_var_name;
+    VLOG(40) << "RequestPrefetch, in_var_name: " << in_var_name
+             << " out_var_name: " << out_var_name;
    auto scope = request_->GetMutableLocalScope();
    auto invar = scope->FindVar(in_var_name);
@@ -231,8 +231,8 @@ class RequestCheckpointNotify final : public RequestBase {
    std::string checkpoint_dir = request_->OutVarname();
    int trainer_id = request_->GetTrainerId();
-    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
-            << ", dir: " << checkpoint_dir;
+    VLOG(40) << "RequestCheckpointNotify notify: " << checkpoint_notify
+             << ", dir: " << checkpoint_dir;
    request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr,
                             trainer_id, checkpoint_dir);
@@ -246,10 +246,10 @@ class RequestCheckpointNotify final : public RequestBase {
};

void AsyncGRPCServer::WaitServerReady() {
-  VLOG(4) << "AsyncGRPCServer is wait server ready";
+  VLOG(40) << "AsyncGRPCServer is wait server ready";
  std::unique_lock<std::mutex> lock(this->mutex_ready_);
  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(4) << "AsyncGRPCServer WaitSeverReady";
+  VLOG(40) << "AsyncGRPCServer WaitSeverReady";
}

void AsyncGRPCServer::StartServer() {
@@ -282,14 +282,15 @@ void AsyncGRPCServer::StartServer() {
    reqs.reserve(kRequestBufSize);

    for (int i = 0; i < kRequestBufSize; i++) {
-      VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i;
+      VLOG(60) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
+               << " I: " << i;
      TryToRegisterNewOne(rpc_name, i);
    }

    for (int i = 0; i < threadnum; i++) {
      rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
          &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
-      VLOG(4) << t.first << " creates threads!";
+      VLOG(40) << t.first << " creates threads!";
    }
  }
@@ -306,7 +307,7 @@ void AsyncGRPCServer::StartServer() {
    auto& threads = t.second;
    for (size_t i = 0; i < threads.size(); ++i) {
      threads[i]->join();
-      VLOG(4) << t.first << " threads ends!";
+      VLOG(40) << t.first << " threads ends!";
    }
  }
}
@@ -314,7 +315,7 @@ void AsyncGRPCServer::StartServer() {
void AsyncGRPCServer::ShutdownQueue() {
  for (auto& t : rpc_cq_) {
    t.second->Shutdown();
-    VLOG(4) << t.first << " queue shutdown!";
+    VLOG(40) << t.first << " queue shutdown!";
  }
}
@@ -323,7 +324,7 @@ void AsyncGRPCServer::ShutDownImpl() {
  is_shut_down_ = true;
  ShutdownQueue();
-  VLOG(4) << "server_ shutdown!";
+  VLOG(40) << "server_ shutdown!";
  server_->Shutdown();
}
@@ -331,12 +332,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
                                          int req_id) {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
-    VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
+    VLOG(40) << "shutdown, do not TryToRegisterNewSendOne";
    return;
  }

-  VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
-          << " REQ ID: " << req_id;
+  VLOG(40) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
+           << " REQ ID: " << req_id;

  auto& reqs = rpc_reqs_[rpc_name];
  auto& handler = rpc_call_map_[rpc_name];
@@ -357,7 +358,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,

  reqs[req_id] = b;

-  VLOG(4) << "Create RequestSend status:" << b->Status();
+  VLOG(40) << "Create RequestSend status:" << b->Status();
}

void AsyncGRPCServer::HandleRequest(
@@ -367,15 +368,15 @@ void AsyncGRPCServer::HandleRequest(
  bool ok = false;

  while (true) {
-    VLOG(4) << "HandleRequest " << rpc_name << " wait next";
+    VLOG(40) << "HandleRequest " << rpc_name << " wait next";
    if (!cq->Next(&tag, &ok)) {
-      VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!";
+      VLOG(30) << "CompletionQueue " << rpc_name << " shutdown!";
      break;
    }

    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
-    VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
-            << " get next";
+    VLOG(40) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+             << " get next";
    auto& reqs = rpc_reqs_[rpc_name];
    RequestBase* base = nullptr;
@@ -385,7 +386,7 @@ void AsyncGRPCServer::HandleRequest(
      base = reqs[req_id];
    }

-    VLOG(3) << base->Status2String(rpc_name);
+    VLOG(30) << base->Status2String(rpc_name);

    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
...
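The HandleRequest loop above relies on gRPC's completion-queue contract: whatever void* tag was registered with a request comes back verbatim from cq->Next(), so the server encodes the request-slot index directly in the pointer value. The cast round-trip in isolation (a self-contained sketch, no gRPC needed):

#include <cassert>
#include <cstdint>

int main() {
  int req_id = 7;
  // registration side: slot index -> opaque tag
  void* tag = reinterpret_cast<void*>(static_cast<intptr_t>(req_id));
  // completion side: opaque tag -> slot index, as HandleRequest does
  int recovered = static_cast<int>(reinterpret_cast<intptr_t>(tag));
  assert(recovered == req_id);
  return 0;
}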
@@ -75,7 +75,7 @@ class VarHandle {
      wait_cond_.wait(lk, [this] { return status_ != kDefaultState; });
      ret = status_;
    }
-    VLOG(7) << "VarHandle wait:" << ret;
+    VLOG(70) << "VarHandle wait:" << ret;
    return ret != kErrorState;
  }
@@ -84,7 +84,7 @@ class VarHandle {
      std::unique_lock<std::mutex> lk(sync_mutex_);
      status_ = ok ? kFinishState : kErrorState;
    }
-    VLOG(7) << "VarHandle finish:" << ok;
+    VLOG(70) << "VarHandle finish:" << ok;
    wait_cond_.notify_all();
  }
...
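VarHandle's Wait/Finish pair is a small state machine: status_ starts at a default state, Wait() blocks on a condition variable until Finish() moves it to a finish or error state, and the notify happens after the lock is released. A minimal standalone sketch of the same pattern (names simplified, not the Paddle class):

#include <condition_variable>
#include <mutex>

class WaitableHandle {
 public:
  enum State { kDefault, kFinish, kError };

  // Blocks until Finish() is called; returns false on error.
  bool Wait() {
    State ret;
    {
      std::unique_lock<std::mutex> lk(mu_);
      cv_.wait(lk, [this] { return status_ != kDefault; });
      ret = status_;
    }
    return ret != kError;
  }

  void Finish(bool ok) {
    {
      std::unique_lock<std::mutex> lk(mu_);
      status_ = ok ? kFinish : kError;
    }
    cv_.notify_all();  // wake all waiters outside the lock, as above
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  State status_ = kDefault;
};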
@@ -38,19 +38,19 @@ bool RequestSendHandler::Handle(const std::string& varname,
                                framework::Variable** outvar,
                                const int trainer_id,
                                const std::string& out_var_name) {
-  VLOG(4) << "RequestSendHandler:" << varname;
+  VLOG(40) << "RequestSendHandler:" << varname;

  // Sync
  if (varname == BATCH_BARRIER_MESSAGE) {
-    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
+    VLOG(30) << "sync: recv BATCH_BARRIER_MESSAGE";
    rpc_server_->IncreaseBatchBarrier(kRequestSend);
  } else if (varname == COMPLETE_MESSAGE) {
-    VLOG(3) << "sync: recv complete message";
+    VLOG(30) << "sync: recv complete message";
    rpc_server_->Complete();
  } else {
    // Async
    if (!sync_mode_) {
-      VLOG(3) << "async process var: " << varname;
+      VLOG(30) << "async process var: " << varname;
      try {
        executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
                                      scope);
@@ -61,7 +61,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
      return true;
    } else {  // sync
      rpc_server_->WaitCond(kRequestSend);
-      VLOG(3) << "sync: processing received var: " << varname;
+      VLOG(30) << "sync: processing received var: " << varname;

      if (invar == nullptr) {
        LOG(FATAL) << "sync: Can not find server side var: " << varname;
@@ -78,10 +78,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
                               framework::Variable** outvar,
                               const int trainer_id,
                               const std::string& out_var_name) {
-  VLOG(4) << "RequestGetHandler:" << varname;
+  VLOG(40) << "RequestGetHandler:" << varname;
  if (sync_mode_) {
    if (varname == FETCH_BARRIER_MESSAGE) {
-      VLOG(3) << "sync: recv fetch barrier message";
+      VLOG(30) << "sync: recv fetch barrier message";
      rpc_server_->IncreaseBatchBarrier(kRequestGet);
    } else {
      rpc_server_->WaitCond(kRequestGet);
@@ -93,13 +93,14 @@ bool RequestGetHandler::Handle(const std::string& varname,
        // NOTE: the format is determined by distributed_transpiler.py
        std::string param_bak_name =
            string::Sprintf("%s.trainer_%d_bak", varname, trainer_id);
-        VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id;
+        VLOG(30) << "getting " << param_bak_name << " trainer_id "
+                 << trainer_id;
        auto var = scope_->FindVar(varname);
        auto t_orig = var->Get<framework::LoDTensor>();
        auto param_bak = scope_->Var(param_bak_name);
        auto t = param_bak->GetMutable<framework::LoDTensor>();
        t->mutable_data(dev_ctx_->GetPlace(), t_orig.type());
-        VLOG(3) << "copying " << varname << " to " << param_bak_name;
+        VLOG(30) << "copying " << varname << " to " << param_bak_name;
        framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
      }
      *outvar = scope_->FindVar(varname);
@@ -114,7 +115,7 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
                                    framework::Variable** outvar,
                                    const int trainer_id,
                                    const std::string& out_var_name) {
-  VLOG(4) << "RequestPrefetchHandler " << varname;
+  VLOG(40) << "RequestPrefetchHandler " << varname;

  auto var_desc = program_->Block(0).FindVar(out_var_name);
  InitializeVariable(*outvar, var_desc->GetType());
@@ -138,8 +139,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
  auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
  lt_var->clear();
  lt_var->append(out_var_name);
-  VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
-          << out_var_name;
+  VLOG(40) << "RequestCheckpointHandler update var kLookupTablePath to: "
+           << out_var_name;
  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_);
  return true;
}
...
@@ -39,7 +39,7 @@ void RPCServer::SavePort() const {
  port_file.open(file_path);
  port_file << selected_port_;
  port_file.close();
-  VLOG(4) << "selected port written to " << file_path;
+  VLOG(40) << "selected port written to " << file_path;
}

void RPCServer::WaitBarrier(const std::string& rpc_name) {
@@ -49,12 +49,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
            exit_flag_.load());
  });

-  VLOG(3) << "batch_barrier_: " << rpc_name << " "
-          << barrier_counter_[rpc_name];
+  VLOG(30) << "batch_barrier_: " << rpc_name << " "
+           << barrier_counter_[rpc_name];
}

void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
-  VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  VLOG(40) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
  int b = 0;
  std::unique_lock<std::mutex> lock(mutex_);
  b = ++barrier_counter_[rpc_name];
@@ -71,7 +71,7 @@ void RPCServer::Complete() {
    client_num_--;
    need_reset_all_vars_ = true;

-    VLOG(4) << "decrease client_num to: " << client_num_;
+    VLOG(40) << "decrease client_num to: " << client_num_;
    if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
      barrier_counter_[kRequestGet]--;
    }
@@ -90,7 +90,7 @@ int RPCServer::GetClientNum() {
}

void RPCServer::ResetBarrierCounter() {
-  VLOG(3) << "RPCServer ResetBarrierCounter ";
+  VLOG(30) << "RPCServer ResetBarrierCounter ";
  std::unique_lock<std::mutex> lock(mutex_);
  for (auto& t : barrier_counter_) {
    t.second = 0;
@@ -105,12 +105,12 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
  static int cond = -1;
  rpc_cond_map_[rpc_name] = ++cond;
-  VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
-          << ", cond:" << rpc_cond_map_[rpc_name];
+  VLOG(40) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
+           << ", cond:" << rpc_cond_map_[rpc_name];
}

void RPCServer::SetCond(const std::string& rpc_name) {
-  VLOG(3) << "RPCServer SetCond " << rpc_name;
+  VLOG(30) << "RPCServer SetCond " << rpc_name;
  {
    std::unique_lock<std::mutex> lock(mutex_);
    cur_cond_ = rpc_cond_map_[rpc_name];
@@ -120,7 +120,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
}

void RPCServer::WaitCond(const std::string& rpc_name) {
-  VLOG(4) << "RPCServer WaitCond " << rpc_name;
+  VLOG(40) << "RPCServer WaitCond " << rpc_name;
  int cond = 0;
  {
    std::unique_lock<std::mutex> lock(mutex_);
...
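The barrier bookkeeping above (WaitBarrier / IncreaseBatchBarrier / ResetBarrierCounter) amounts to a per-RPC counting barrier: each trainer bumps a counter, and the server-side waiter is released once the count reaches the number of registered clients. A condensed sketch of that idea, with hypothetical names and without the exit-flag handling of the real class:

#include <condition_variable>
#include <mutex>

class BatchBarrier {
 public:
  explicit BatchBarrier(int client_num) : client_num_(client_num) {}

  void Increase() {  // called once per trainer per batch
    std::unique_lock<std::mutex> lock(mu_);
    if (++counter_ >= client_num_) cv_.notify_all();
  }

  void Wait() {  // server blocks here until all trainers checked in
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return counter_ >= client_num_; });
  }

  void Reset() {  // what ResetBarrierCounter does per rpc_name
    std::unique_lock<std::mutex> lock(mu_);
    counter_ = 0;
  }

 private:
  const int client_num_;
  int counter_ = 0;
  std::mutex mu_;
  std::condition_variable cv_;
};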
@@ -50,7 +50,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
        size_to_write = length - total_written;
      }
      // This log is useful to see how long a internal block size is of rpc.
-      VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
+      VLOG(70) << "copy " << size_to_write << " data to CUDAPlace";
      memory::Copy(boost::get<platform::CUDAPlace>(place),
                   reinterpret_cast<void*>(p), cpu, data, size_to_write,
                   gpu_dev_ctx.stream());
@@ -79,7 +79,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
      // TODO(gongwb): can we avoid copy?
      platform::CPUPlace cpu;
      // This log is useful to see how long a internal block size is of rpc.
-      VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
+      VLOG(70) << "copy " << size_to_write << " data to CPUPlace";
      memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
      p += size_to_write;
@@ -198,8 +198,8 @@ bool VariableResponse::ProcSerializedField(
#endif
  }

-  VLOG(7) << "ProcSerializedField:" << meta_.varname()
-          << ", type:" << meta_.type() << std::endl;
+  VLOG(70) << "ProcSerializedField:" << meta_.varname()
+           << ", type:" << meta_.type() << std::endl;
  framework::DDim dims = GetDims(meta_.dims());
  if (meta_.type() == sendrecv::LOD_TENSOR) {
    PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
...
@@ -75,16 +75,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
  }
};

-class ElementwiseOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto x_name = op_desc.Input("X")[0];
-    auto out_name = op_desc.Output("Out")[0];
-    auto &x = block->FindRecursiveOrCreateVar(x_name);
-    auto &out = block->FindRecursiveOrCreateVar(out_name);
-    out.SetType(x.GetType());
-    out.SetDataType(x.GetDataType());
+class ElementwiseOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
  }
};
...
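The rewrite replaces a hand-rolled operator() with a base class that only asks for an input-to-output name mapping; the SetType/SetDataType plumbing moves into the framework. A guess at the shape of that base-class contract (the real PassInDtypeAndVarTypeToOutput lives in paddle/fluid/framework; the types here are simplified stand-ins):

#include <string>
#include <unordered_map>

struct VarDescLike { int type = 0; int dtype = 0; };  // stand-in for VarDesc

class PassThroughVarTypeInference {
 public:
  virtual ~PassThroughVarTypeInference() = default;

  // Copies type and dtype from each mapped input to its output once,
  // which is what every derived elementwise op previously did by hand.
  void Apply(std::unordered_map<std::string, VarDescLike>* vars) const {
    for (const auto& kv : GetInputOutputWithSameType()) {
      const VarDescLike& in = vars->at(kv.first);
      VarDescLike& out = (*vars)[kv.second];
      out.type = in.type;
      out.dtype = in.dtype;
    }
  }

 protected:
  virtual std::unordered_map<std::string, std::string>
  GetInputOutputWithSameType() const = 0;
};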
@@ -47,8 +47,8 @@ class FeedOp : public framework::OperatorBase {
    auto col = Attr<int>("col");

-    VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var "
-            << out_name;
+    VLOG(30) << "Feed Var " << feed_var_name << "'s " << col
+             << " column to var " << out_name;

    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
    auto &feed_item = feed_list.at(static_cast<size_t>(col));
...
@@ -43,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase {
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");

    for (auto& ep : eps) {
-      VLOG(3) << "fetch barrier, ep: " << ep;
+      VLOG(30) << "fetch barrier, ep: " << ep;
      rpc_client->AsyncSendFetchBarrier(ep);
    }
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
...
@@ -57,7 +57,7 @@ class FetchOp : public framework::OperatorBase {
    TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
    dst_item.set_lod(src_item.lod());

-    VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
+    VLOG(30) << "Fetch variable " << fetch_var_name << " to " << out_name;
  }
};
...
@@ -64,7 +64,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
        distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);

    for (auto& ep : endpoint_list) {
-      VLOG(3) << "sending nccl id to " << ep;
+      VLOG(30) << "sending nccl id to " << ep;
      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
    }
    client->Wait();
@@ -72,7 +72,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
      client->AsyncSendBatchBarrier(ep);
    }
    client->Wait();
-    VLOG(3) << "sending completed...";
+    VLOG(30) << "sending completed...";
  }

  void GetIdByServer(framework::Scope* scope,
@@ -99,11 +99,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
        std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));

    rpc_service->SetCond(distributed::kRequestSend);
-    VLOG(3) << "start getting nccl id from trainer 0...";
+    VLOG(30) << "start getting nccl id from trainer 0...";
    rpc_service->WaitBarrier(distributed::kRequestSend);
-    VLOG(3) << "got nccl id and stop server...";
+    VLOG(30) << "got nccl id and stop server...";
    rpc_service->ShutDown();
-    VLOG(3) << "rpc server stopped";
+    VLOG(30) << "rpc server stopped";
    server_thread.join();
  }
};
...
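The flow in GenNCCLIdOp is a one-shot broadcast: trainer 0 creates the NCCL unique id and pushes it to every endpoint, while each other trainer spins up a short-lived RPC server, waits for the send barrier, and shuts down. Outlined below with hypothetical RpcClient/RpcServer helpers; only ncclGetUniqueId is a real NCCL call:

#include <string>
#include <vector>
#include <nccl.h>

// Hypothetical transport; stands in for distributed::RPCClient / RPCServer.
struct RpcClient {
  void Send(const std::string&, const void*, size_t) {}
  void Wait() {}
};
struct RpcServer {
  void WaitForOneSend(void*, size_t) {}
  void ShutDown() {}
};

void BroadcastNcclId(int rank, RpcClient* client, RpcServer* server,
                     const std::vector<std::string>& endpoints) {
  ncclUniqueId id;
  if (rank == 0) {
    ncclGetUniqueId(&id);  // real NCCL call: fills the opaque unique id
    for (const auto& ep : endpoints) client->Send(ep, &id, sizeof(id));
    client->Wait();  // mirrors the AsyncSendVar + Wait pair above
  } else {
    server->WaitForOneSend(&id, sizeof(id));  // mirrors WaitBarrier(kRequestSend)
    server->ShutDown();  // the server lives only for this handshake
  }
}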
@@ -63,12 +63,19 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
  Tensor ones;
  ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
  auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
+  Tensor half_xmax, half_ymax;
+  half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace());
+  auto half_xmax_t =
+      EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max);
+  half_ymax.mutable_data<T>({n, h, w}, ctx.GetPlace());
+  auto half_ymax_t =
+      EigenTensor<T, 3>::From(half_ymax).setConstant(0.5 * y_max);

  // scale grid to [0, h-1/w-1]
  auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
  auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
-  grid_x_t.device(place) = 0.5 * ((grid_x_t + ones_t) * x_max);
-  grid_y_t.device(place) = 0.5 * ((grid_y_t + ones_t) * y_max);
+  grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t;
+  grid_y_t.device(place) = (grid_y_t + ones_t) * half_ymax_t;

  // calculate coords of 4 corner points
  x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
...
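Both forms of the assignment compute the same affine map from the sampler's normalized coordinates in [-1, 1] to pixel coordinates; assuming x_max = W - 1 and y_max = H - 1 as in the surrounding grid-sampler code, it reads

\[ x_{\mathrm{pix}} = \tfrac{1}{2}\,(x_{\mathrm{norm}} + 1)\,x_{\max}, \qquad y_{\mathrm{pix}} = \tfrac{1}{2}\,(y_{\mathrm{norm}} + 1)\,y_{\max}, \]

only now the constant factor 0.5 * x_max is materialized as a tensor of the same shape, so the whole right-hand side stays a uniform Eigen tensor expression.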
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
@@ -9,7 +9,8 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/fluid/operators/bilinear_interp_op.h"
+#include "paddle/fluid/operators/interpolate_op.h"
+#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
@@ -18,27 +19,34 @@ namespace operators {

using framework::Tensor;

-class BilinearInterpOp : public framework::OperatorWithKernel {
+class InterpolateOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of BilinearInterOp should not be null.");
+                   "Input(X) of InterpolateOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of BilinearInterOp should not be null.");
+                   "Output(Out) of InterpolationOp should not be null.");

+    auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
+    PADDLE_ENFORCE(
+        "bilinear" == interp_method || "nearest" == interp_method,
+        "Interpolation method can only be \"bilinear\" or \"nearest\".");
+
    auto dim_x = ctx->GetInputDim("X");  // NCHW format
    int out_h = ctx->Attrs().Get<int>("out_h");
    int out_w = ctx->Attrs().Get<int>("out_w");
    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");

-    if (ctx->HasInput("OutSize")) {
+    if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
      auto out_size_dim = ctx->GetInputDim("OutSize");
      PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
                        "OutSize's dimension size must be 1");
      PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
+      ctx->ShareLoD("X", "Out");
+      return;
    }

    std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
    ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
@@ -52,35 +60,53 @@ class BilinearInterpOp : public framework::OperatorWithKernel {
  }
};

-class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
+class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
-             "The input tensor of bilinear interpolation, "
-             "This is a 4-D tensor with shape of (N x C x h x w)");
+             "The input tensor of interpolate operator, "
+             "This is a 4-D tensor with shape of [N, C, H, w].");
    AddInput("OutSize",
-             "This is a 1-D tensor with two number. "
+             "This is a 1-D tensor with two numbers to specify output size. "
             "The first number is height and the second number is width.")
        .AsDispensable();
-    AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)");
+    AddOutput("Out",
+              "The output tensor of interpolate operator, "
+              "This is a 4-D tensor with shape of [N, C, H, W].");

-    AddAttr<int>("out_h", "output height of bilinear interpolation op.");
-    AddAttr<int>("out_w", "output width of bilinear interpolation op.");
+    AddAttr<int>("out_h", "output height of interpolate op.");
+    AddAttr<int>("out_w", "output width of interpolate op.");
+    AddAttr<std::string>(
+        "interp_method",
+        "(string), interpolation method, can be \"bilinear\" for "
+        "bilinear interpolation and \"nearest\" for nearest "
+        "neighbor interpolation.");
    AddComment(R"DOC(
+This operator samples input X to given output shape by using specified
+interpolation method, the interpolation methods can be \"nearest\"
+for nearest neighbor interpolation and \"bilinear\" for bilinear
+interpolation.
+
+Nearest neighbor interpolation is to perform nearest neighbor interpolation
+in both the 3rd dimention(in height direction) and the 4th dimention(in width
+direction) on input tensor.
+
 Bilinear interpolation is an extension of linear interpolation for
 interpolating functions of two variables (e.g. H-direction and
-W-direction in this op) on a rectilinear 2D grid.
-
-The key idea is to perform linear interpolation first in one
-direction, and then again in the other direction.
-
-For details, please refer to Wikipedia:
+W-direction in this op) on a rectilinear 2D grid. The key idea is
+to perform linear interpolation first in one direction, and then
+again in the other direction.
+
+For details of nearest neighbor interpolation, please refer to Wikipedia:
+https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
+
+For details of bilinear interpolation, please refer to Wikipedia:
 https://en.wikipedia.org/wiki/Bilinear_interpolation
 )DOC");
  }
};

-class BilinearInterpOpGrad : public framework::OperatorWithKernel {
+class InterpolateOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -106,11 +132,11 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel {
}  // namespace paddle

namespace ops = paddle::operators;
-REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
-                  ops::BilinearInterpOpMaker,
+REGISTER_OPERATOR(interpolate, ops::InterpolateOp, ops::InterpolateOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
-REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
-                       ops::BilinearInterpKernel<uint8_t>);
-REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
-                       ops::BilinearInterpGradKernel<float>);
+REGISTER_OPERATOR(interpolate_grad, ops::InterpolateOpGrad);
+REGISTER_OP_CPU_KERNEL(interpolate, ops::InterpolateKernel<float>,
+                       ops::InterpolateKernel<double>,
+                       ops::InterpolateKernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(interpolate_grad, ops::InterpolateGradKernel<float>,
+                       ops::InterpolateGradKernel<double>);
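For reference, the two methods the new op dispatches on, written with the ratios the kernels use, r_h = (H_in - 1)/(H_out - 1) and r_w = (W_in - 1)/(W_out - 1) (taken as 0 when the output side is 1):

\[ \text{nearest:}\quad Out(n, c, k, l) = X\!\left(n, c, \lfloor r_h\,k + 0.5 \rfloor, \lfloor r_w\,l + 0.5 \rfloor\right). \]

Bilinear interpolation blends the four neighboring input pixels instead of picking one; the exact weights appear with the CPU implementation further below.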
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
@@ -9,7 +9,8 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/fluid/operators/bilinear_interp_op.h"
+#include <string>
+#include "paddle/fluid/operators/interpolate_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"

namespace paddle {
@@ -17,15 +18,72 @@ namespace operators {

using framework::Tensor;

+template <typename T>
+__global__ void KeNearestNeighborInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+    int channel_id = out_id_w / out_img_size;
+
+    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
+    int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
+
+    int out_img_idx = tid % out_img_w;
+    int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
+
+    out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
+                  in_img_idy * in_img_w + in_img_idx];
+  }
+}
+
+template <typename T>
+__global__ void KeNearestNeighborInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const float ratio_h, const float ratio_w) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+    int channel_id = out_id_w / out_img_size;
+
+    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
+    int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
+
+    int out_img_idx = tid % out_img_w;
+    int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
+
+    T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                    in_img_idy * in_img_w + in_img_idx];
+    const T out_pos = out[out_id_h * output_w + out_id_w];
+    platform::CudaAtomicAdd(in_pos, out_pos);
+  }
+}
+
template <typename T>
__global__ void KeBilinearInterpFw(
    const T* in, const size_t in_img_h, const size_t in_img_w,
    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
    const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const T ratio_h, const T ratioW) {
+    const size_t num_channels, const float ratio_h, const float ratio_w) {
  int nthreads = output_h * output_w;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {
    int out_id_h = tid / output_w;
    int out_id_w = tid % output_w;
    int in_img_size = input_w / num_channels;
@@ -39,9 +97,9 @@ __global__ void KeBilinearInterpFw(
    T h2lambda = 1.f - h1lambda;

    int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratioW * out_img_idx;
+    int in_img_idx = ratio_w * out_img_idx;
    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratioW * out_img_idx - in_img_idx;
+    T w1lambda = ratio_w * out_img_idx - in_img_idx;
    T w2lambda = 1.f - w1lambda;

    const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
@@ -60,10 +118,11 @@ __global__ void KeBilinearInterpBw(
    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
    const size_t input_w, const T* out, const size_t out_img_h,
    const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const T ratio_h, const T ratioW) {
+    const size_t num_channels, const T ratio_h, const T ratio_w) {
  int nthreads = output_h * output_w;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < nthreads; tid += stride) {
    int out_id_h = tid / output_w;
    int out_id_w = tid % output_w;
    int in_img_size = input_w / num_channels;
@@ -77,122 +136,146 @@ __global__ void KeBilinearInterpBw(
    T h2lambda = 1.f - h1lambda;

    int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratioW * out_img_idx;
+    int in_img_idx = ratio_w * out_img_idx;
    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratioW * out_img_idx - in_img_idx;
+    T w1lambda = ratio_w * out_img_idx - in_img_idx;
    T w2lambda = 1.f - w1lambda;

    T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
                    in_img_idy * in_img_w + in_img_idx];
    const T* out_pos = &out[out_id_h * output_w + out_id_w];
-    atomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-    atomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
-    atomicAdd(&in_pos[h_id * in_img_w], h1lambda * w2lambda * out_pos[0]);
-    atomicAdd(&in_pos[h_id * in_img_w + w_id],
-              h1lambda * w1lambda * out_pos[0]);
+    platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+    platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
+    platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
+                            h1lambda * w2lambda * out_pos[0]);
+    platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
+                            h1lambda * w1lambda * out_pos[0]);
  }
}

template <typename T>
-class BilinearInterpOpCUDAKernel : public framework::OpKernel<T> {
+class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");
-    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
-    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
-    auto* input = input_t->data<T>();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* input_data = input->data<T>();

+    auto interp_method = ctx.Attr<std::string>("interp_method");
    int out_h = ctx.Attr<int>("out_h");
    int out_w = ctx.Attr<int>("out_w");
-    auto out_dims = output_t->dims();
-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
+
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
      Tensor sizes;
-      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
      auto size_data = sizes.data<int>();
      out_h = size_data[0];
      out_w = size_data[1];
    }
-    auto* output = output_t->mutable_data<T>(
-        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());

-    int batch_size = input_t->dims()[0];
-    int channels = input_t->dims()[1];
-    int in_h = input_t->dims()[2];
-    int in_w = input_t->dims()[3];
+    int n = input->dims()[0];
+    int c = input->dims()[1];
+    int in_h = input->dims()[2];
+    int in_w = input->dims()[3];
+
+    auto* output_data =
+        output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());

    int in_hw = in_h * in_w;
    int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
+    int in_chw = c * in_hw;
+    int out_chw = c * out_hw;

-    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
-    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;

    if (in_h == out_h && in_w == out_w) {
-      memcpy(output, input, input_t->numel() * sizeof(T));
-    } else {
-      int threadNum = batch_size * out_chw;
-      int blocks = (threadNum + 1024 - 1) / 1024;
+      framework::TensorCopy(*input, ctx.GetPlace(), output);
+      return;
+    }

+    int pixelNum = n * out_chw;
+    int grid_dim = (pixelNum + 512 - 1) / 512;
+    grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+    if ("nearest" == interp_method) {
+      KeNearestNeighborInterpFw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+          out_chw, c, ratio_h, ratio_w);
+    } else if ("bilinear" == interp_method) {
      KeBilinearInterpFw<
-          T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
-          input, in_h, in_w, batch_size, in_chw, output, out_h, out_w,
-          batch_size, out_chw, channels, ratio_h, ratio_w);
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+          out_chw, c, ratio_h, ratio_w);
    }
  }
};
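The kernels now use a grid-stride loop instead of a one-thread-per-element bounds check, which is what lets the launch configuration clamp grid_dim to at most 8 blocks of 512 threads regardless of pixelNum: the 4096 launched threads sweep the output in ceil(pixelNum / 4096) passes. The pattern in isolation (a generic sketch, not Paddle's kernel):

template <typename T>
__global__ void ScaleKernel(int n, T alpha, const T* x, T* y) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
  int stride = blockDim.x * gridDim.x;              // total threads launched
  for (; tid < n; tid += stride) {                  // each thread hops by stride
    y[tid] = alpha * x[tid];
  }
}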
template <typename T>
-class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
+class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_output = d_output_t->data<T>();
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output_grad_data = output_grad->data<T>();
+    auto* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());

    auto& device_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();
    math::SetConstant<platform::CUDADeviceContext, T> zero;
-    zero(device_ctx, d_input_t, static_cast<T>(0.0));
+    zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+    auto interp_method = ctx.Attr<std::string>("interp_method");
    int out_h = ctx.Attr<int>("out_h");
    int out_w = ctx.Attr<int>("out_w");

-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
      Tensor sizes;
-      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
      auto size_data = sizes.data<int>();
      out_h = size_data[0];
      out_w = size_data[1];
    }

-    int batch_size = d_input_t->dims()[0];
-    int channels = d_input_t->dims()[1];
-    int in_h = d_input_t->dims()[2];
-    int in_w = d_input_t->dims()[3];
+    int n = input_grad->dims()[0];
+    int c = input_grad->dims()[1];
+    int in_h = input_grad->dims()[2];
+    int in_w = input_grad->dims()[3];

    int in_hw = in_h * in_w;
    int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
+    int in_chw = c * in_hw;
+    int out_chw = c * out_hw;

-    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
-    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;

    if (in_h == out_h && in_w == out_w) {
-      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
-    } else {
-      int threadNum = batch_size * out_chw;
-      int blocks = (threadNum + 1024 - 1) / 1024;
+      framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
+      return;
+    }

+    int pixelNum = n * out_chw;
+    int grid_dim = (pixelNum + 512 - 1) / 512;
+    grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+    if ("nearest" == interp_method) {
+      KeNearestNeighborInterpBw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
+          out_w, n, out_chw, c, ratio_h, ratio_w);
+    } else if ("bilinear" == interp_method) {
      KeBilinearInterpBw<
-          T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
-          d_input, in_h, in_w, batch_size, in_chw, d_output, out_h, out_w,
-          batch_size, out_chw, channels, ratio_h, ratio_w);
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
+          out_w, n, out_chw, c, ratio_h, ratio_w);
    }
  }
};
@@ -201,7 +284,9 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
}  // namespace paddle

namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(bilinear_interp,
-                        ops::BilinearInterpOpCUDAKernel<float>);
-REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad,
-                        ops::BilinearInterpGradOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(interpolate, ops::InterpolateOpCUDAKernel<float>,
+                        ops::InterpolateOpCUDAKernel<double>,
+                        ops::InterpolateOpCUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(interpolate_grad,
+                        ops::InterpolateGradOpCUDAKernel<float>,
+                        ops::InterpolateGradOpCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Tensor = framework::Tensor;
template <typename T>
static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
const float ratio_h, const float ratio_w,
const int n, const int c,
const int out_h, const int out_w) {
auto input_t = EigenTensor<T, 4>::From(input);
auto output_t = EigenTensor<T, 4>::From(*output);
for (int k = 0; k < out_h; k++) { // loop for images
int in_k = static_cast<int>(ratio_h * k + 0.5);
for (int l = 0; l < out_w; l++) {
int in_l = static_cast<int>(ratio_w * l + 0.5);
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
output_t(i, j, k, l) = input_t(i, j, in_k, in_l);
}
}
}
}
}
template <typename T>
static void BilinearInterpolation(const Tensor& input, Tensor* output,
const float ratio_h, const float ratio_w,
const int in_h, const int in_w, const int n,
const int c, const int out_h,
const int out_w) {
auto input_t = EigenTensor<T, 4>::From(input);
auto output_t = EigenTensor<T, 4>::From(*output);
for (int k = 0; k < out_h; k++) { // loop for images
int y_n = static_cast<int>(ratio_h * k);
int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
float d_n = ratio_h * k - y_n;
float d_s = 1.f - d_n;
for (int l = 0; l < out_w; l++) {
int x_w = static_cast<int>(ratio_w * l);
int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
float d_w = ratio_w * l - x_w;
float d_e = 1.f - d_w;
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
// bilinear interpolation
output_t(i, j, k, l) = input_t(i, j, y_n, x_w) * d_s * d_e +
input_t(i, j, y_s, x_w) * d_n * d_e +
input_t(i, j, y_n, x_e) * d_s * d_w +
input_t(i, j, y_s, x_e) * d_n * d_w;
}
}
}
}
}
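In the notation of the function above, with y_n = ⌊r_h k⌋, y_s = min(y_n + 1, H_in - 1), d_n = r_h k - y_n, d_s = 1 - d_n (and x_w, x_e, d_w, d_e defined the same way along the width), the interpolated value is

\[ Out(k, l) = X(y_n, x_w)\,d_s d_e + X(y_s, x_w)\,d_n d_e + X(y_n, x_e)\,d_s d_w + X(y_s, x_e)\,d_n d_w, \]

a convex combination whose weights sum to one, which is why the gradient function below can simply scatter each output gradient back to the same four corners with the same four weights.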
template <typename T>
static void NearestNeighborInterpolateGrad(const Tensor& output_grad,
Tensor* input_grad,
const float ratio_h,
const float ratio_w, const int n,
const int c, const int out_h,
const int out_w) {
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int k = 0; k < out_h; k++) { // loop for images
int in_k = static_cast<int>(ratio_h * k + 0.5);
for (int l = 0; l < out_w; l++) {
int in_l = static_cast<int>(ratio_w * l + 0.5);
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l);
}
}
}
}
}
template <typename T>
static void BilinearInterpolationGrad(const Tensor& output_grad,
Tensor* input_grad, const float ratio_h,
const float ratio_w, const int in_h,
const int in_w, const int n, const int c,
const int out_h, const int out_w) {
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int k = 0; k < out_h; k++) { // loop for images
int y_n = static_cast<int>(ratio_h * k);
int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
float d_n = ratio_h * k - y_n;
float d_s = 1.f - d_n;
for (int l = 0; l < out_w; l++) {
int x_w = static_cast<int>(ratio_w * l);
int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
float d_w = ratio_w * l - x_w;
float d_e = 1.f - d_w;
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
// bilinear interpolation grad
const T grad = output_grad_t(i, j, k, l);
input_grad_t(i, j, y_n, x_w) += static_cast<T>(grad * d_s * d_e);
input_grad_t(i, j, y_s, x_w) += static_cast<T>(grad * d_n * d_e);
input_grad_t(i, j, y_n, x_e) += static_cast<T>(grad * d_s * d_w);
input_grad_t(i, j, y_s, x_e) += static_cast<T>(grad * d_n * d_w);
}
}
}
}
}
template <typename T>
class InterpolateKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
std::string interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_size = ctx.Input<Tensor>("OutSize");
if (out_size != nullptr) {
auto out_size_data = out_size->data<int>();
out_h = out_size_data[0];
out_w = out_size_data[1];
}
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
auto& device_ctx =
ctx.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> zero;
zero(device_ctx, output, static_cast<T>(0.0));
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if ("bilinear" == interp_method) {
BilinearInterpolation<T>(*input, output, ratio_h, ratio_w, in_h, in_w, n,
c, out_h, out_w);
} else if ("nearest" == interp_method) {
NearestNeighborInterpolate<T>(*input, output, ratio_h, ratio_w, n, c,
out_h, out_w);
}
}
};
template <typename T>
class InterpolateGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
std::string interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_size = ctx.Input<Tensor>("OutSize");
if (out_size != nullptr) {
auto out_size_data = out_size->data<int>();
out_h = out_size_data[0];
out_w = out_size_data[1];
}
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
auto& device_ctx =
ctx.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> zero;
zero(device_ctx, input_grad, static_cast<T>(0.0));
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
return;
}
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if ("bilinear" == interp_method) {
BilinearInterpolationGrad<T>(*output_grad, input_grad, ratio_h, ratio_w,
in_h, in_w, n, c, out_h, out_w);
} else if ("nearest" == interp_method) {
NearestNeighborInterpolateGrad<T>(*output_grad, input_grad, ratio_h,
ratio_w, n, c, out_h, out_w);
}
}
};
} // namespace operators
} // namespace paddle
@@ -36,7 +36,7 @@ namespace operators {

void RunServer(std::shared_ptr<distributed::RPCServer> service) {
  service->StartServer();
-  VLOG(4) << "RunServer thread end";
+  VLOG(40) << "RunServer thread end";
}
static void split(const std::string &str, char sep,
                  std::vector<std::string> *pieces) {
@@ -66,8 +66,8 @@ static void ParallelExecuteBlocks(
    fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
      int run_block = idx;  // thread local
      try {
-        VLOG(3) << "running server block: " << run_block
-                << "pointer: " << prepared[run_block].get();
+        VLOG(30) << "running server block: " << run_block
+                 << "pointer: " << prepared[run_block].get();
        executor->RunPreparedContext(prepared[run_block].get(), scope);
      } catch (const std::exception &e) {
        LOG(FATAL) << "run sub program:" << idx << " error " << e.what();
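ParallelExecuteBlocks fans each block id out to framework::Async and then waits on the returned futures (the join happens in the caller, outside this hunk), so every optimize block finishes before the sync loop continues. The same shape with std::async standing in for framework::Async (a simplified sketch):

#include <future>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> parallel_blkids = {1, 2, 3};
  std::vector<std::future<void>> fs;
  for (int idx : parallel_blkids) {
    fs.push_back(std::async(std::launch::async, [idx] {
      // stand-in for executor->RunPreparedContext(prepared[idx].get(), scope)
      std::cout << "running server block: " << idx << "\n";
    }));
  }
  for (auto& f : fs) f.wait();  // join point before the loop continues
  return 0;
}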
@@ -108,7 +108,7 @@ void ListenAndServOp::RunSyncLoop(
    framework::Scope *recv_scope, platform::DeviceContext *dev_ctx,
    const std::vector<int> &prefetch_block_id_list,
    const int checkpoint_point_block_id) const {
-  VLOG(2) << "RunSyncLoop";
+  VLOG(20) << "RunSyncLoop";
  size_t num_blocks = program->Size();
  auto optimize_blocks =
      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
@@ -167,7 +167,7 @@ void ListenAndServOp::RunSyncLoop(
    }
    ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
                          recv_scope);
-    VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
+    VLOG(20) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";

    ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());
@@ -183,11 +183,11 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
  for (auto &varname : sparse_vars_) {
    auto var = recv_scope->FindVar(varname);
    if (var == nullptr) {
-      VLOG(2) << "can not find var " << varname << " in received scope";
+      VLOG(20) << "can not find var " << varname << " in received scope";
      continue;
    }
    if (var->IsType<framework::SelectedRows>()) {
-      VLOG(3) << "reset sparse var: " << varname;
+      VLOG(30) << "reset sparse var: " << varname;
      var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
    } else {
      PADDLE_THROW("The type of sparse var should be SelectedRows");
@@ -197,7 +197,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
  for (auto &varname : dense_vars_) {
    auto var = recv_scope->FindVar(varname);
    if (var == nullptr) {
-      VLOG(2) << "can not find var " << varname << " in received scope";
+      VLOG(20) << "can not find var " << varname << " in received scope";
      continue;
    }
    if (var->IsType<framework::LoDTensor>()) {
@@ -216,7 +216,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                   framework::ProgramDesc *program,
                                   framework::Scope *recv_scope) const {
-  VLOG(2) << "RunAsyncLoop";
+  VLOG(20) << "RunAsyncLoop";
  auto grad_to_block_id_str =
      Attr<std::vector<std::string>>("grad_to_block_id");
  DoubleFindMap<std::string, int32_t> grad_to_block_id;
@@ -225,7 +225,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                 const std::string &grad_and_id) {
    std::vector<std::string> pieces;
    split(grad_and_id, ':', &pieces);
-    VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
+    VLOG(30) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
    PADDLE_ENFORCE_EQ(pieces.size(), 2);
    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0);
@@ -270,7 +270,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
  while (true) {
    if (rpc_service_->IsExit()) {
-      VLOG(4) << "get exit!rpc_processor break!";
+      VLOG(40) << "get exit!rpc_processor break!";
      break;
    }
@@ -332,9 +332,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  std::string endpoint = Attr<std::string>("endpoint");
int checkpoint_block_id = Attr<int>(kCheckpointBlockId); int checkpoint_block_id = Attr<int>(kCheckpointBlockId);
VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in VLOG(40) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
<< ", end_point:" << endpoint << ", end_point:" << endpoint
<< ", checkpoint_block_id: " << checkpoint_block_id; << ", checkpoint_block_id: " << checkpoint_block_id;
rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
...@@ -383,8 +383,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -383,8 +383,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
prefetch_var_name_to_block_id_str) { prefetch_var_name_to_block_id_str) {
std::vector<std::string> pieces; std::vector<std::string> pieces;
split(prefetch_var_name_and_id, ':', &pieces); split(prefetch_var_name_and_id, ':', &pieces);
VLOG(3) << "after split, prefetch_var = " << pieces[0] VLOG(30) << "after split, prefetch_var = " << pieces[0]
<< ", id=" << pieces[1]; << ", id=" << pieces[1];
PADDLE_ENFORCE_EQ(pieces.size(), 2); PADDLE_ENFORCE_EQ(pieces.size(), 2);
int block_id = std::stoi(pieces[1]); int block_id = std::stoi(pieces[1]);
...@@ -415,7 +415,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -415,7 +415,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
// start the server listening after all member initialized. // start the server listening after all member initialized.
server_thread_.reset(new std::thread(RunServer, rpc_service_)); server_thread_.reset(new std::thread(RunServer, rpc_service_));
VLOG(3) << "wait server thread to become ready..."; VLOG(30) << "wait server thread to become ready...";
rpc_service_->WaitServerReady(); rpc_service_->WaitServerReady();
// register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
......
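A pattern worth noting for the rest of this diff: nearly every hunk rescales a glog verbosity level from VLOG(n) to VLOG(n * 10). glog emits a VLOG(level) message only when the active verbosity (set via the --v flag or the GLOG_v environment variable) is at least level, so multiplying levels by ten opens room for finer-grained levels between the old coarse ones without changing their relative ordering. A minimal sketch of that gating behaviour (assumes glog is available; not part of this commit):

#include <glog/logging.h>

int main(int argc, char* argv[]) {
  google::InitGoogleLogging(argv[0]);
  // Emitted only when verbosity >= 30, e.g. GLOG_v=30 ./a.out
  VLOG(30) << "running server block";
  // Still compiled in, but needs GLOG_v >= 40 to appear.
  VLOG(40) << "RunServer thread end";
  return 0;
}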
@@ -30,9 +30,9 @@ class LoDRankTableOp : public framework::OperatorBase {
    auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
    auto *out =
        scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
-    VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
+    VLOG(100) << "Level = " << static_cast<size_t>(Attr<int>("level"));
    out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
-    VLOG(10) << Input("X") << "'s lod information is " << *out;
+    VLOG(100) << Input("X") << "'s lod information is " << *out;
  }
};
...
@@ -134,13 +134,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
    auto attr = op_desc.GetAttr("is_sparse");
    bool is_sparse = boost::get<bool>(attr);
    if (is_sparse) {
-      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
-              << " is set to SelectedRows";
+      VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W")
+               << " is set to SelectedRows";
      block->Var(out_var_name)
          ->SetType(framework::proto::VarType::SELECTED_ROWS);
    } else {
-      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
-              << " is set to LoDTensor";
+      VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W")
+               << " is set to LoDTensor";
      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
    }
    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
...
@@ -96,8 +96,8 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
  }
  auto et = GetCurrentUS();
-  VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat
-          << " us, tgt takes: " << (mt - st) / repeat;
+  VLOG(30) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat
+           << " us, tgt takes: " << (mt - st) / repeat;
  for (int i = 0; i < n; ++i) {
    EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
  }
...
@@ -87,7 +87,7 @@ TEST(JitKernel, vrelu) {
      vrelu_intri8(d, x_data, zref_data);
    }
    auto si1 = GetCurrentUS();
-    VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+    VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
  }
#endif
  auto ttgts = GetCurrentUS();
@@ -95,8 +95,9 @@ TEST(JitKernel, vrelu) {
    ker->Compute(x_data, ztgt_data);
  }
  auto ttgte = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-          << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
+           << " us, tgt takes: " << (ttgte - ttgts) / repeat;
  for (int i = 0; i < d; ++i) {
    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
  }
@@ -132,8 +133,9 @@ TEST(JitKernel, vaddbias) {
  }
  auto ttgte = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-          << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
+           << " us, tgt takes: " << (ttgte - ttgts) / repeat;
  for (int i = 0; i < d; ++i) {
    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
  }
@@ -183,13 +185,14 @@ TEST(JitKernel, vexp) {
  }
  auto ttgte = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
#ifdef PADDLE_WITH_MKLML
           << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
#else
           << " us, "
#endif
           << "tgt takes: " << (ttgte - ttgts) / repeat;
  for (int i = 0; i < d; ++i) {
    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
  }
@@ -254,9 +257,10 @@ TEST(JitKernel, vsigmoid) {
  }
  auto ttgte = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-          << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-          << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
+           << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
+           << " us, tgt takes: " << (ttgte - ttgts) / repeat;
  for (int i = 0; i < d; ++i) {
    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
  }
@@ -321,9 +325,10 @@ TEST(JitKernel, vtanh) {
  }
  auto ttgte = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-          << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
-          << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
+           << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
+           << " us, tgt takes: " << (ttgte - ttgts) / repeat;
  for (int i = 0; i < d; ++i) {
    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
  }
@@ -441,9 +446,10 @@ TEST(JitKernel, lstm) {
    ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
  }
  auto ttgte = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-          << " us, better(jit) takes: " << (tmkle - tmkls) / repeat
-          << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
+           << " us, better(jit) takes: " << (tmkle - tmkls) / repeat
+           << " us, tgt takes: " << (ttgte - ttgts) / repeat;
  }
}
@@ -525,8 +531,8 @@ TEST(JitKernel, vscal) {
      vscal_inp_intri8(d, a, y_data);
    }
    auto si3 = GetCurrentUS();
-    VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
-            << " us, inplace: " << (si3 - si2) / repeat;
+    VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
+             << " us, inplace: " << (si3 - si2) / repeat;
  }
#endif
@@ -540,15 +546,17 @@ TEST(JitKernel, vscal) {
    ker->Compute(&a, y_data, y_data, d);
  }
  auto ttgte1 = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-          << " us, inplace takes: " << (trefe1 - trefs1) / repeat
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
+           << " us, inplace takes: " << (trefe1 - trefs1) / repeat
#ifdef PADDLE_WITH_MKLML
-          << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, "
+           << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat
+           << " us, "
#else
           << " us, "
#endif
           << "tgt takes: " << (ttgte - ttgts) / repeat
           << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat;
  for (int i = 0; i < d; ++i) {
    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
  }
@@ -611,7 +619,7 @@ TEST(JitKernel, vmul) {
      vmul_intri8(d, x_data, y_data, zref_data);
    }
    auto si1 = GetCurrentUS();
-    VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+    VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
  }
#endif
@@ -621,13 +629,14 @@ TEST(JitKernel, vmul) {
  }
  auto ttgte = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
#ifdef PADDLE_WITH_MKLML
           << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
#else
           << " us, "
#endif
           << "tgt takes: " << (ttgte - ttgts) / repeat;
  for (int i = 0; i < d; ++i) {
    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
  }
@@ -690,7 +699,7 @@ TEST(JitKernel, vadd) {
      vadd_intri8(d, x_data, y_data, zref_data);
    }
    auto si1 = GetCurrentUS();
-    VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+    VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
  }
#endif
@@ -700,13 +709,14 @@ TEST(JitKernel, vadd) {
  }
  auto ttgte = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
#ifdef PADDLE_WITH_MKLML
           << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
#else
           << " us, "
#endif
           << "tgt takes: " << (ttgte - ttgts) / repeat;
  for (int i = 0; i < d; ++i) {
    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
  }
@@ -761,9 +771,10 @@ TEST(JitKernel, vaddrelu) {
    ker->Compute(x_data, y_data, ztgt_data, d);
  }
  auto ttgte = GetCurrentUS();
-  VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
-          << " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
-          << "tgt takes: " << (ttgte - ttgts) / repeat;
+  VLOG(30) << "Vec size " << d
+           << ": refer takes: " << (trefe - trefs) / repeat
+           << " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
+           << "tgt takes: " << (ttgte - ttgts) / repeat;
  for (int i = 0; i < d; ++i) {
    EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
  }
...
@@ -270,7 +270,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
      const std::vector<const framework::SelectedRows*>& inputs,
      framework::SelectedRows* output) {
    if (inputs.size() == 0) {
-      VLOG(3) << "no input! return";
+      VLOG(30) << "no input! return";
      return;
    }
    const framework::SelectedRows* has_value_input = nullptr;
@@ -281,7 +281,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
      }
    }
    if (has_value_input == nullptr) {
-      VLOG(3) << "no input has value! just return" << std::endl;
+      VLOG(30) << "no input has value! just return" << std::endl;
      return;
    }
    auto input_width = has_value_input->value().dims()[1];
...
@@ -314,7 +314,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
      const std::vector<const framework::SelectedRows*>& inputs,
      framework::SelectedRows* output) {
    if (inputs.size() == 0) {
-      VLOG(3) << "no input! return";
+      VLOG(30) << "no input! return";
      return;
    }
    const framework::SelectedRows* has_value_input = nullptr;
@@ -325,7 +325,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
      }
    }
    if (has_value_input == nullptr) {
-      VLOG(3) << "no input has value! just return" << std::endl;
+      VLOG(30) << "no input has value! just return" << std::endl;
      return;
    }
    auto input_width = has_value_input->value().dims()[1];
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mean_op.h"
+#include <string>
namespace paddle {
namespace operators {
@@ -42,6 +42,14 @@ Mean Operator calculates the mean of all elements in X.
  }
};
+class MeanOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
class MeanGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -50,6 +58,14 @@ class MeanGradOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
    ctx->ShareLoD("X", framework::GradVarName("X"));
  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
};
class MeanGradMaker : public framework::SingleGradOpDescMaker {
@@ -71,7 +87,8 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
}  // namespace paddle
namespace ops = paddle::operators;
-REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
+REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
+                  ops::MeanGradMaker);
REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
REGISTER_OP_CPU_KERNEL(
    mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
...
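The two additions above follow a pattern this commit applies repeatedly (the same InferVarType class is added for mul and for pool2d/pool3d further down): PassInDtypeAndVarTypeToOutput propagates the variable type and data type of input X to output Out, while the explicit GetExpectedKernelType pins the grad kernel's dtype to input X. A hedged sketch of how a hypothetical operator would opt in (names are illustrative, mirroring the diff above):

class MyOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
      const override {
    // Output "Out" inherits both var type and dtype from input "X".
    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
  }
};

// Registration mirrors the REGISTER_OPERATOR(mean, ...) line above.
REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker, ops::MyOpInferVarType,
                  ops::MyOpGradMaker);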
@@ -346,7 +346,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
        // sparse update maybe empty.
        if (grad->rows().size() == 0) {
-          VLOG(3) << "Grad SelectedRows contains no data!";
+          VLOG(30) << "Grad SelectedRows contains no data!";
          return;
        }
        auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
...
@@ -38,9 +38,9 @@ class MulOp : public framework::OperatorWithKernel {
    int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
    int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");
-    VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims
-            << " x_num_col_dims=" << x_num_col_dims
-            << " y_num_col_dims=" << y_num_col_dims;
+    VLOG(30) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims
+             << " x_num_col_dims=" << x_num_col_dims
+             << " y_num_col_dims=" << y_num_col_dims;
    PADDLE_ENFORCE_GT(
        x_dims.size(), x_num_col_dims,
@@ -126,6 +126,14 @@ or not. But the output only shares the LoD information with input $X$.
  }
};
+class MulOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
class MulGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -178,7 +186,8 @@ class MulOpGradMaker : public framework::SingleGradOpDescMaker {
}  // namespace paddle
namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGradMaker);
+REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpInferVarType,
+                  ops::MulOpGradMaker);
REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
REGISTER_OP_CPU_KERNEL(
    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
...
@@ -63,16 +63,16 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
    // device id
    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(gpu_id);
-    VLOG(3) << "gpu : "
-            << " invoke allreduce. send " << x->numel() << " recv "
-            << out->numel();
+    VLOG(30) << "gpu : "
+             << " invoke allreduce. send " << x->numel() << " recv "
+             << out->numel();
    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
        x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
        NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
        ctx.cuda_device_context().stream()));
-    VLOG(3) << "gpu : "
-            << " finished allreduce. send " << x->numel() << " recv "
-            << out->numel();
+    VLOG(30) << "gpu : "
+             << " finished allreduce. send " << x->numel() << " recv "
+             << out->numel();
  }
};
@@ -109,14 +109,14 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
    } else {
      out->Resize(framework::make_ddim({0}));
    }
-    VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
-            << " recv " << out->numel();
+    VLOG(30) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
+             << " recv " << out->numel();
    PADDLE_ENFORCE(platform::dynload::ncclReduce(
        x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
        reduction_op_, root, comm->comms().at(idx),
        ctx.cuda_device_context().stream()));
-    VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
-            << " recv " << out->numel();
+    VLOG(30) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
+             << " recv " << out->numel();
  }
};
@@ -133,21 +133,22 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
    int idx = comm->GetCommId(gpu_id);
    if (idx == root) {
      auto* x = ctx.Input<LoDTensor>("X");
-      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
+      VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
      PADDLE_ENFORCE(platform::dynload::ncclBcast(
          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
          ctx.cuda_device_context().stream()));
-      VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
+      VLOG(30) << "gpu : " << gpu_id << " finished Bcast.";
    } else {
      auto* out = ctx.Output<LoDTensor>("Out");
-      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
-              << framework::product(out->dims());
+      VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
+               << framework::product(out->dims());
      PADDLE_ENFORCE(platform::dynload::ncclBcast(
          out->mutable_data<T>(ctx.GetPlace()), out->numel(),
          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
          ctx.cuda_device_context().stream()));
-      VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel();
+      VLOG(30) << "gpu : " << gpu_id << " finished Bcast. recv "
+               << out->numel();
    }
  }
};
...
@@ -86,9 +86,9 @@ class NCCLTester : public ::testing::Test {
    (*p_scopes).resize(gpu_list_.size());
    auto op = f::OpRegistry::CreateOp(*op1);
-    VLOG(1) << "invoke NCCLInitOp.";
+    VLOG(10) << "invoke NCCLInitOp.";
    op->Run(g_scope_, cpu_place);
-    VLOG(1) << "NCCLInitOp finished.";
+    VLOG(10) << "NCCLInitOp finished.";
  }
  int GetGPUData(int gpu_id) { return gpu_id + 42; }
@@ -109,7 +109,7 @@ class NCCLTester : public ::testing::Test {
      std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
      paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
-      VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
+      VLOG(10) << "Send Tensor filled with elements " << send_tensor->numel();
    }
    lk.unlock();
@@ -119,11 +119,11 @@ class NCCLTester : public ::testing::Test {
    auto op = f::OpRegistry::CreateOp(*op1);
-    VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
-    VLOG(1) << " send_tensor : " << send_tensor->numel()
-            << " recv_tensor : " << recv_tensor->numel();
+    VLOG(10) << "Device : " << gpu_id << " invoke " << op_desc.Type();
+    VLOG(10) << " send_tensor : " << send_tensor->numel()
+             << " recv_tensor : " << recv_tensor->numel();
    op->Run(*scope, place);
-    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
+    VLOG(10) << "Device : " << gpu_id << " finished " << op_desc.Type();
  }
 public:
...
@@ -48,7 +48,7 @@ static void SplitTensorAndMoveTensorToScopes(
  auto lod_tensors = tensor.SplitLoDTensor(places);
  for (auto &lod : lod_tensors) {
-    VLOG(3) << lod.dims();
+    VLOG(30) << lod.dims();
  }
  if (num_sub_scopes == 0) {
    num_sub_scopes = lod_tensors.size();
@@ -263,7 +263,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
      if (s == framework::kEmptyVarName) {
        continue;
      }
-      VLOG(3) << "Moving " << s;
+      VLOG(30) << "Moving " << s;
      CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
    }
    WaitOnPlaces(places);
@@ -277,7 +277,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
      if (s == framework::kEmptyVarName) {
        continue;
      }
-      VLOG(3) << "Accumulating " << s;
+      VLOG(30) << "Accumulating " << s;
      if (s == framework::kEmptyVarName) continue;
      std::string tmp_name;
      auto *tmp = sub_scopes[0]->Var(&tmp_name);
@@ -289,7 +289,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
      auto sum_op = framework::OpRegistry::CreateOp(
          "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
          framework::AttributeMap{{"use_mkldnn", {false}}});
-      VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
+      VLOG(100) << sum_op->DebugStringEx(sub_scopes[0]);
      sum_op->Run(*sub_scopes[0], places[0]);
      WaitOnPlace(places[0]);
    }
@@ -316,7 +316,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
    auto *grad = new framework::OpDesc();
    grad->SetType("parallel_do_grad");
    for (auto &input_param : this->InputNames()) {
-      VLOG(3) << input_param;
+      VLOG(30) << input_param;
      grad->SetInput(input_param, this->Input(input_param));
      if (input_param != kPlaces) {
        grad->SetOutput(framework::GradVarName(input_param),
...
@@ -40,7 +40,7 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
  return output_size;
}
-void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
+void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "Out(Output) of Pooling should not be null.");
@@ -81,7 +81,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
}
framework::OpKernelType PoolOp::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
+    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library_{framework::LibraryType::kPlain};
  std::string data_format = ctx.Attr<std::string>("data_format");
  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
@@ -104,7 +104,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
      layout_, library_);
}
-void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
+void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
  PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                 "Input(X@GRAD) should not be null.");
@@ -112,7 +112,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
}
framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
+    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library_{framework::LibraryType::kPlain};
  std::string data_format = ctx.Attr<std::string>("data_format");
  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
@@ -262,6 +262,14 @@ Example:
  )DOC");
}
+class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
void Pool3dOpMaker::Make() {
  AddInput("X",
           "(Tensor) The input tensor of pooling operator. "
@@ -372,6 +380,7 @@ Example:
namespace ops = paddle::operators;
REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
+                  ops::PoolOpInferVarType,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad);
@@ -383,6 +392,7 @@ REGISTER_OP_CPU_KERNEL(
    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
+                  ops::PoolOpInferVarType,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
...
@@ -48,12 +48,12 @@ class PrefetchOp : public framework::OperatorBase {
    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
-                << outs[i] << " back";
+        VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get "
+                 << outs[i] << " back";
        rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope,
                                                    ins[i], outs[i]));
      } else {
-        VLOG(3) << "don't send no-initialied variable: " << ins[i];
+        VLOG(30) << "don't send no-initialied variable: " << ins[i];
      }
    }
    for (size_t i = 0; i < rets.size(); i++) {
...
@@ -155,8 +155,8 @@ class RandomCropKernel : public framework::OpKernel<T> {
        seed = *cpu_seed.data<int64_t>();
      }
    } else {
-      VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
-                 "'startup_seed' instead.";
+      VLOG(50) << "WARNING: The input 'Seed' is not initialized, use attribute "
+                  "'startup_seed' instead.";
      seed = ctx.Attr<int>("startup_seed");
    }
    auto shape = ctx.Attr<std::vector<int>>("shape");
...
@@ -42,7 +42,7 @@ class BlockingQueue {
    std::unique_lock<std::mutex> lock(mutex_);
    send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
    if (closed_) {
-      VLOG(5)
+      VLOG(50)
          << "WARNING: Sending an element to a closed reader::BlokcingQueue.";
      return false;
    }
@@ -56,7 +56,7 @@ class BlockingQueue {
    std::unique_lock<std::mutex> lock(mutex_);
    send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
    if (closed_) {
-      VLOG(5)
+      VLOG(50)
          << "WARNING: Sending an element to a closed reader::BlokcingQueue.";
      return false;
    }
...
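The two hunks above only change log levels, but the surrounding code shows the queue's send protocol: block on send_cv_ until there is capacity or the queue is closed, and refuse the element when closed. A self-contained sketch of that wait-predicate pattern (simplified stand-in, not the Paddle class):

#include <condition_variable>
#include <deque>
#include <mutex>

template <typename T>
class BoundedQueue {
 public:
  explicit BoundedQueue(size_t capacity) : capacity_(capacity) {}

  // Returns false instead of enqueueing once the queue is closed,
  // mirroring the warning branch in reader::BlockingQueue above.
  bool Send(T v) {
    std::unique_lock<std::mutex> lock(mutex_);
    send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
    if (closed_) return false;
    queue_.push_back(std::move(v));
    recv_cv_.notify_one();
    return true;
  }

  bool Receive(T* v) {
    std::unique_lock<std::mutex> lock(mutex_);
    recv_cv_.wait(lock, [&] { return !queue_.empty() || closed_; });
    if (queue_.empty()) return false;  // closed and fully drained
    *v = std::move(queue_.front());
    queue_.pop_front();
    send_cv_.notify_one();
    return true;
  }

  void Close() {
    std::lock_guard<std::mutex> lock(mutex_);
    closed_ = true;
    send_cv_.notify_all();
    recv_cv_.notify_all();
  }

 private:
  size_t capacity_;
  bool closed_ = false;
  std::deque<T> queue_;
  std::mutex mutex_;
  std::condition_variable send_cv_, recv_cv_;
};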
@@ -26,7 +26,7 @@ class ShuffleReader : public framework::DecoratedReader {
  ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
                size_t seed = 0)
      : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
-    VLOG(10) << "Create shuffle reader of " << reader_;
+    VLOG(100) << "Create shuffle reader of " << reader_;
    if (seed_ == 0) {
      std::random_device device;
      seed_ = device();
@@ -37,7 +37,7 @@ class ShuffleReader : public framework::DecoratedReader {
  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
    out->clear();
    if (iteration_pos_ >= buffer_.size()) {
-      VLOG(10) << "Resetting shuffle buffer";
+      VLOG(100) << "Resetting shuffle buffer";
      ReloadBuffer();
      if (buffer_.empty()) {
        return;
@@ -73,7 +73,7 @@ class ShuffleReader : public framework::DecoratedReader {
    std::mt19937 g(seed_);
    std::shuffle(buffer_.begin(), buffer_.end(), g);
    seed_ = g();  // update seed_;
-    VLOG(10) << "random buffer size = " << buffer_.size();
+    VLOG(100) << "random buffer size = " << buffer_.size();
  }
  size_t buffer_size_;
...
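ShuffleReader's strategy is visible across these hunks: read up to buffer_size_ elements from the decorated reader, shuffle them with a std::mt19937 seeded from seed_, then draw a fresh seed from the engine so the next buffer uses a different permutation. A compact standalone sketch of that reshuffle step (simplified free function, not the member):

#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

// One ReloadBuffer-style pass: shuffle in place, then advance the seed,
// matching the "seed_ = g();  // update seed_" line above.
void ReloadBuffer(std::vector<int>* buffer, size_t* seed) {
  std::mt19937 g(*seed);
  std::shuffle(buffer->begin(), buffer->end(), g);
  *seed = g();
}

int main() {
  std::vector<int> buf = {1, 2, 3, 4, 5};
  size_t seed = 1;
  ReloadBuffer(&buf, &seed);  // consecutive calls permute differently
  for (int v : buf) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}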
@@ -160,7 +160,7 @@ class RecurrentBase : public framework::OperatorBase {
                             Callback callback) {
    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
    for (size_t i = 0; i < dst_vars.size(); ++i) {
-      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i];
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
    }
  }
@@ -176,7 +176,7 @@ class RecurrentBase : public framework::OperatorBase {
                             Callback callback) {
    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
    for (size_t i = 0; i < dst_vars.size(); ++i) {
-      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i];
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
    }
  }
@@ -230,7 +230,7 @@ class RecurrentOp : public RecurrentBase {
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
    auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
-    VLOG(3) << "Static RNN input sequence length = " << seq_len;
+    VLOG(30) << "Static RNN input sequence length = " << seq_len;
    StepScopes scopes = CreateStepScopes(scope, seq_len);
    auto reverse = Attr<bool>(kReverse);
@@ -241,7 +241,7 @@ class RecurrentOp : public RecurrentBase {
    for (size_t i = 0; i < seq_len; ++i) {
      size_t seq_offset = reverse ? seq_len - i - 1 : i;
-      VLOG(3) << "Recurrent operate at the time step " << seq_offset;
+      VLOG(30) << "Recurrent operate at the time step " << seq_offset;
      auto &cur_scope = scopes.CurScope();
@@ -334,7 +334,7 @@ class RecurrentGradOp : public RecurrentBase {
    for (size_t step_id = 0; step_id < seq_len; ++step_id) {
      size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
-      VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
+      VLOG(30) << "Recurrent backward operate at the time step " << seq_offset;
      auto &cur_scope = scopes.CurScope();
      // Link outside::output_grads --> inside::output_grads
      // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
@@ -348,11 +348,11 @@ class RecurrentGradOp : public RecurrentBase {
          });
      auto og_set = List2Set(Inputs(kOutputGrads));
-      if (VLOG_IS_ON(10)) {
+      if (VLOG_IS_ON(100)) {
        std::ostringstream sout;
        std::copy(og_set.begin(), og_set.end(),
                  std::ostream_iterator<std::string>(sout, ","));
-        VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
+        VLOG(100) << " RNN output gradients = [" << sout.str() << "]";
      }
      // Link states
@@ -374,7 +374,7 @@ class RecurrentGradOp : public RecurrentBase {
            auto &ex_tensor =
                ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
-            VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
+            VLOG(100) << " RNN link " << cur_grad << " from " << ex_grad;
            auto *cur_grad_var = cur_scope.Var(cur_grad);
            auto cur_grad_tensor =
                cur_grad_var->GetMutable<framework::LoDTensor>();
@@ -382,12 +382,12 @@ class RecurrentGradOp : public RecurrentBase {
          }
        }
-      VLOG(5) << "Recurrent memory linking finished ";
+      VLOG(50) << "Recurrent memory linking finished ";
      // Run step block with cur_scope
      executor.Run(*program, &cur_scope, block->ID(),
                   false /*create_local_scope*/);
-      VLOG(5) << "executor.Run finished ";
+      VLOG(50) << "executor.Run finished ";
      auto local_var_names = LocalVarNames(cur_scope);
@@ -436,7 +436,7 @@ class RecurrentGradOp : public RecurrentBase {
          cur_scope.Rename(new_inside_name, inside_grad_name);
        }
      }
-      VLOG(5) << "Accumulate Parameter finished ";
+      VLOG(50) << "Accumulate Parameter finished ";
      // Copy input gradient from inside to outside
      // outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
@@ -455,7 +455,7 @@ class RecurrentGradOp : public RecurrentBase {
            auto dst = outside->Slice(seq_offset, seq_offset + 1);
            framework::TensorCopy(inside, place, dev_ctx, &dst);
          });
-      VLOG(5) << "Link outside gradient finished ";
+      VLOG(50) << "Link outside gradient finished ";
      if (step_id + 1 == seq_len) {  // at_end
        // copy initialize states gradient from inside to outside
@@ -468,7 +468,7 @@ class RecurrentGradOp : public RecurrentBase {
              outside->mutable_data(place, inside.type());
              framework::TensorCopy(inside, place, dev_ctx, outside);
            });
-        VLOG(5) << "Link initialize state gradient finished ";
+        VLOG(50) << "Link initialize state gradient finished ";
      }
      scopes.Next();
    }
...
@@ -47,7 +47,7 @@ class RecvOp : public framework::OperatorBase {
    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < outs.size(); i++) {
-      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+      VLOG(30) << "getting " << outs[i] << " from " << epmap[i];
      rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]));
    }
    if (sync_mode) {
...
@@ -93,7 +93,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
                   in_grad_var_name);
    if (out_grad_var == nullptr) {
-      VLOG(5) << "Using fill constant 0 as starting gradient";
+      VLOG(50) << "Using fill constant 0 as starting gradient";
      auto in_var_name = Input("X");
      auto *in_var = scope.FindVar(in_var_name);
      auto &in_var_tensor = in_var->Get<framework::LoDTensor>();
...
@@ -110,7 +110,7 @@ class SaveOp : public framework::OperatorBase {
        lt_var != nullptr,
        "Can not find variable kLookupTablePath for SaveSelectedRows");
    std::string filename = lt_var->data();
-    VLOG(4) << "SaveSelectedRows get File name: " << filename;
+    VLOG(40) << "SaveSelectedRows get File name: " << filename;
    MkDirRecursively(DirName(filename).c_str());
...
@@ -42,12 +42,12 @@ class SendBarrierOp : public framework::OperatorBase {
        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
            Attr<int>("trainer_id"));
-    VLOG(3) << "SendBarrierOp sync";
+    VLOG(30) << "SendBarrierOp sync";
    // need to wait before sending send_barrier message
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
    for (auto& ep : eps) {
-      VLOG(3) << "send barrier, ep: " << ep;
+      VLOG(30) << "send barrier, ep: " << ep;
      rpc_client->AsyncSendBatchBarrier(ep);
    }
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
...
@@ -50,10 +50,10 @@ class SendOp : public framework::OperatorBase {
    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
+        VLOG(30) << "sending " << ins[i] << " to " << epmap[i];
        rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]));
      } else {
-        VLOG(3) << "don't send no-initialied variable: " << ins[i];
+        VLOG(30) << "don't send no-initialied variable: " << ins[i];
      }
    }
    if (sync_send) {
...
@@ -120,7 +120,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
  f::Scope scope;
  p::CPUPlace place;
-  VLOG(4) << "before init tensor";
+  VLOG(40) << "before init tensor";
  if (is_sparse) {
    InitSelectedRowsInScope(place, &scope);
  } else {
@@ -146,7 +146,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
  attrs.insert({"PrefetchBlock", prefetch_block});
  attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
  attrs.insert({"sync_mode", true});
-  VLOG(4) << "before init op";
+  VLOG(40) << "before init op";
  listen_and_serv_op =
      f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
  *initialized = true;
...
@@ -127,7 +127,7 @@ class SequenceMaskKernel : public framework::OpKernel<Tx> {
    auto x_numel = x->numel();
    if (maxlen < 0) {
#ifdef __NVCC__
-      VLOG(10)
+      VLOG(100)
          << "SequenceMaskOp on GPU may be slow when maxlen is not provided.";
      maxlen = static_cast<int>(
          thrust::reduce(thrust::device_pointer_cast(x_data),
...
@@ -98,10 +98,10 @@ class SGDOpKernel : public framework::OpKernel<T> {
      auto param_row_width = param.value().dims()[1];
      auto grad_row_width = grad.value().dims()[1];
-      VLOG(4) << " param rows: " << param.rows().size()
-              << " param memory rows: " << param.value().dims()[0]
-              << " grad rows: " << grad.rows().size()
-              << " grad memory rows: " << grad.value().dims()[0];
+      VLOG(40) << " param rows: " << param.rows().size()
+               << " param memory rows: " << param.value().dims()[0]
+               << " grad rows: " << grad.rows().size()
+               << " grad memory rows: " << grad.value().dims()[0];
      PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
                        "param_row should have the same size with grad_row");
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/similarity_focus_op.h"
namespace paddle {
namespace operators {
class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), a 4-D tensor with shape,"
" [BatchSize, X, Y, Z]");
AddOutput("Out",
"(Tensor, default Tensor<float>), the similarity focus mask"
" with the same shape of input X.");
AddAttr<int>("axis",
"(int32), indicating the dimension to be select. It can"
" only be 1, 2, or 3.");
AddAttr<std::vector<int>>("indexes",
"(std::vector<int32>), indicating the indexes"
" of the selected dimension.");
AddComment(R"DOC(
SimilarityFocus Operator.
Generate a similarity focus mask with the same shape of input using the following method:
1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
to the axis according to the indexes. For example, if axis=1 and indexes=[a],
it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
2. For each index, find the largest numbers in the tensor T, so that the same
row and same column has at most one number(what it means is that if the
largest number has been found in the i-th row and the j-th column, then
the numbers in the i-th row or j-th column will be skipped. And then the
next largest number will be selected from the remaining numbers. Obviously
there will be min(B, C) numbers), and mark the corresponding position of the
3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for
each index.
3. Broadcast the 3-D similarity focus mask to the same shape of input X.
Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
)DOC");
}
};
class SimilarityFocusOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 4, "Input(X)'s rank should be 4.");
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(similarity_focus, ops::SimilarityFocusOp,
ops::SimilarityFocusOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(similarity_focus, ops::SimilarityFocusKernel<float>,
ops::SimilarityFocusKernel<double>);
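
The greedy, conflict-free selection described in step 2 of the operator comment above is easiest to see in isolation. Below is a minimal NumPy sketch of that step for a single (B, C) slice T = X[b, index, :, :] with axis = 1; the helper name is illustrative only and not part of the operator:

import numpy as np

def focus_mask_2d(t):
    # t: one (B, C) slice, e.g. T = X[b, index, :, :] for axis == 1.
    # Visit values from largest to smallest; keep a value only if neither
    # its row nor its column was used before, so min(B, C) cells are chosen.
    mask = np.zeros_like(t)
    used_rows, used_cols = set(), set()
    for flat in np.argsort(t, axis=None)[::-1]:
        i, j = np.unravel_index(flat, t.shape)
        if i in used_rows or j in used_cols:
            continue  # row or column already taken, skip this value
        mask[i, j] = 1
        used_rows.add(i)
        used_cols.add(j)
        if len(used_rows) == min(t.shape):
            break
    return mask

Step 3 then broadcasts this mask along the selected axis, and masks obtained for different indexes are merged with elementwise-or.
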
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cstring>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class SimilarityFocusKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Tensor* out = context.Output<Tensor>("Out");
const Tensor* x = context.Input<Tensor>("X");
T* out_data = out->mutable_data<T>(context.GetPlace());
const T* x_data = x->data<T>();
int axis = context.Attr<int>("axis");
std::vector<int> indexes = context.Attr<std::vector<int>>("indexes");
int64_t batch_size = x->dims()[0];
int64_t dim[4];
for (int i = 1; i <= 3; ++i) {
dim[i] = x->dims()[i];
}
if (indexes.size() < 1) {
PADDLE_THROW("Indexes' size can not be 0.");
}
for (auto index : indexes) {
if (dim[axis] < index) {
PADDLE_THROW("Index exceeds tensor shape limit.");
}
}
int64_t array_size = 1;
for (int i = 1; i <= 3; ++i) {
if (i != axis) {
array_size *= dim[i];
}
}
std::vector<std::pair<T, int64_t>> array(array_size);
bool (*cmp)(std::pair<T, int64_t>, std::pair<T, int64_t>) = [](
std::pair<T, int64_t> x, std::pair<T, int64_t> y) {
return x.first > y.first;
};
int64_t (*compute_index)(int64_t*, int, int, int, int) = [](
int64_t* dim, int d1, int d2, int d3, int d4) {
return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] +
d3 * dim[3] + d4;
};
memset(out_data, 0, sizeof(T) * batch_size * dim[1] * dim[2] * dim[3]);
for (int i = 0; i < batch_size; ++i) {
for (auto index : indexes) {
if (axis == 1) {
for (int j = 0; j < dim[2]; ++j) {
for (int k = 0; k < dim[3]; ++k) {
array[j * dim[3] + k] = std::make_pair(
x_data[compute_index(dim, i, index, j, k)], j * dim[3] + k);
}
}
std::sort(array.begin(), array.end(), cmp);
int tag_num = 0;
std::vector<bool> tag2(dim[2]), tag3(dim[3]);
for (auto x : array) {
int idx2 = x.second / dim[3];
int idx3 = x.second % dim[3];
if (tag2[idx2] || tag3[idx3]) {
continue;
}
tag_num++;
tag2[idx2] = true;
tag3[idx3] = true;
for (int j = 0; j < dim[1]; ++j) {
out_data[compute_index(dim, i, j, idx2, idx3)] = 1;
}
if (tag_num == std::min(dim[2], dim[3])) {
break;
}
}
} else if (axis == 2) {
for (int j = 0; j < dim[1]; ++j) {
for (int k = 0; k < dim[3]; ++k) {
array[j * dim[3] + k] = std::make_pair(
x_data[compute_index(dim, i, j, index, k)], j * dim[3] + k);
}
}
std::sort(array.begin(), array.end(), cmp);
int tag_num = 0;
std::vector<bool> tag1(dim[1]), tag3(dim[3]);
for (auto x : array) {
int idx1 = x.second / dim[3];
int idx3 = x.second % dim[3];
if (tag1[idx1] || tag3[idx3]) {
continue;
}
tag_num++;
tag1[idx1] = true;
tag3[idx3] = true;
for (int j = 0; j < dim[2]; ++j) {
out_data[compute_index(dim, i, idx1, j, idx3)] = 1;
}
if (tag_num == std::min(dim[1], dim[3])) {
break;
}
}
} else if (axis == 3) {
for (int j = 0; j < dim[1]; ++j) {
for (int k = 0; k < dim[2]; ++k) {
array[j * dim[2] + k] = std::make_pair(
x_data[compute_index(dim, i, j, k, index)], j * dim[2] + k);
}
}
std::sort(array.begin(), array.end(), cmp);
int tag_num = 0;
std::vector<bool> tag1(dim[1]), tag2(dim[2]);
for (auto x : array) {
int idx1 = x.second / dim[2];
int idx2 = x.second % dim[2];
if (tag1[idx1] || tag2[idx2]) {
continue;
}
tag_num++;
tag1[idx1] = true;
tag2[idx2] = true;
for (int j = 0; j < dim[3]; ++j) {
out_data[compute_index(dim, i, idx1, idx2, j)] = 1;
}
if (tag_num == std::min(dim[1], dim[2])) {
break;
}
}
} else {
PADDLE_THROW("Axis must be 1 or 2 or 3");
}
}
}
}
};
} // namespace operators
} // namespace paddle
@@ -124,6 +124,14 @@ For each row $i$ and each column $j$ in the matrix, we have:
  }
};

+class SoftmaxOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
class SoftmaxOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -196,7 +204,7 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
-                 ops::SoftmaxOpGradMaker);
+                 ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker);
REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
REGISTER_OP_CPU_KERNEL(
    softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
...
@@ -32,7 +32,7 @@ class SplitByrefOpKernel : public framework::OpKernel<T> {
    for (size_t i = 0; i < outs.size(); ++i) {
      // NOTE: no need to call mutable_data here to allocate memory.
      auto* out = outs[i];
-     VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0];
+     VLOG(30) << "spliting by ref: " << row_offset << " " << out->dims()[0];
      *out = in->Slice(row_offset, row_offset + out->dims()[0]);
      row_offset += out->dims()[0];
    }
...
@@ -44,7 +44,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
      for (size_t i = 0; i < ids_tensors.size(); ++i) {
        batch_size += ids_tensors[i]->dims()[0];
      }
-     VLOG(4) << "Get Total BatchSize is: " << batch_size;
+     VLOG(40) << "Get Total BatchSize is: " << batch_size;
      std::vector<T> all_ids(batch_size);
      int offset = 0;
...
@@ -186,7 +186,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      }
      if (in_dim.empty()) {
-       VLOG(3) << "WARNING: all the inputs are empty";
+       VLOG(30) << "WARNING: all the inputs are empty";
        in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
      } else {
        in_dim[0] = static_cast<int64_t>(first_dim);
...
@@ -45,7 +45,7 @@ class SumOp : public framework::OperatorWithKernel {
    size_t N = x_dims.size();
    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
    if (N == 1) {
-     VLOG(3) << "Warning: sum have only one input, may waste memory";
+     VLOG(30) << "Warning: sum have only one input, may waste memory";
    }

    framework::DDim in_dim({0});
@@ -157,8 +157,8 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
    auto& inputs = op_desc.Input("X");
    auto var_type = framework::proto::VarType::SELECTED_ROWS;
    for (auto& name : op_desc.Input("X")) {
-     VLOG(10) << name << " "
-              << block->FindRecursiveOrCreateVar(name).GetType();
+     VLOG(100) << name << " "
+               << block->FindRecursiveOrCreateVar(name).GetType();
    }

    bool any_input_is_lod_tensor = std::any_of(
...
@@ -34,8 +34,8 @@ class WriteToArrayOp : public ArrayOp {
    auto *out =
        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
    if (offset >= out->size()) {
-     VLOG(10) << "Resize " << Output("Out") << " from " << out->size()
-              << " to " << offset + 1;
+     VLOG(100) << "Resize " << Output("Out") << " from " << out->size()
+               << " to " << offset + 1;
      out->resize(offset + 1);
    }
    auto *out_tensor = &out->at(offset);
@@ -47,9 +47,9 @@ class WriteToArrayOp : public ArrayOp {
      TensorCopy(x_tensor, place, dev_ctx, out_tensor);
    } else {
-     VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
-                 "nothing has been written to output array["
-              << offset << "].";
+     VLOG(100) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
+                  "nothing has been written to output array["
+               << offset << "].";
    }
  }
};
@@ -104,7 +104,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
                  framework::BlockDesc *block) const override {
    auto x_name = op_desc.Input("X")[0];
    auto out_name = op_desc.Output("Out")[0];
-   VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
+   VLOG(100) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
    auto &out = block->FindRecursiveOrCreateVar(out_name);
    out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
    auto *x = block->FindVarRecursive(x_name);
@@ -139,7 +139,7 @@ class ReadFromArrayOp : public ArrayOp {
      framework::TensorCopy(x_array[offset], place, dev_ctx, out_tensor);
      out_tensor->set_lod(x_array[offset].lod());
    } else {
-     VLOG(10) << "offset " << offset << " >= " << x_array.size();
+     VLOG(100) << "offset " << offset << " >= " << x_array.size();
    }
  }
};
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace operators {
using framework::Tensor;
void LodTensorArray2LodTensorVector(const framework::Scope &scope,
const std::string &base_name,
const std::string &lod_tensor_array_name,
std::vector<std::string> *res_names) {
auto &inx =
scope.FindVar(lod_tensor_array_name)->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < inx.size(); i++) {
std::string var_name = base_name + std::to_string(i);
framework::Variable *g_feed_value =
const_cast<framework::Scope &>(scope).Var(var_name);
auto &feed_input =
*(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
feed_input.ShareDataWith(inx[i]);
res_names->push_back(var_name);
}
}
void LodTensorVectorResizeFromLodTensorArray(
const framework::Scope &scope, const std::string &base_name,
const std::string &lod_tensor_array_name,
std::vector<std::string> *res_names) {
auto &inx =
scope.FindVar(lod_tensor_array_name)->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < inx.size(); i++) {
std::string var_name = base_name + std::to_string(i);
framework::Variable *g_feed_value =
const_cast<framework::Scope &>(scope).Var(var_name);
auto &feed_input =
*(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
auto dims = inx[i].dims();
feed_input.Resize(dims);
res_names->push_back(var_name);
}
}
void LodTensorArrayCreateFromLodTensorArray(
const framework::Scope &scope,
const std::string &input_lod_tensor_array_name,
const std::string &output_lod_tensor_array_name) {
auto &inx = scope.FindVar(input_lod_tensor_array_name)
->Get<framework::LoDTensorArray>();
auto &grad_inx = *scope.FindVar(output_lod_tensor_array_name)
->GetMutable<framework::LoDTensorArray>();
for (size_t i = 0; i < inx.size(); i++) {
std::string var_name = output_lod_tensor_array_name + std::to_string(i);
framework::Variable *g_feed_value =
const_cast<framework::Scope &>(scope).Var(var_name);
auto &feed_input =
*(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
grad_inx.push_back(feed_input);
}
}
class LoDTensorArray2TensorOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto axis = Attr<int>("axis");
framework::AttributeMap attrs;
attrs["axis"] = axis;
auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto &out_inx =
*scope.FindVar(Output("OutIndex"))->GetMutable<framework::LoDTensor>();
const size_t n = inx.size();
PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0.");
std::string base_name = Inputs("X")[0];
std::vector<std::string> names;
// get the input tensorarray items' dim in out_inx
auto out_inx_dim = out_inx.dims();
out_inx_dim[0] = inx.size();
out_inx.Resize(out_inx_dim);
std::string var_name = "out_index";
framework::Variable *tmp_index_var =
const_cast<framework::Scope &>(scope).Var(var_name);
auto &tmp_index_tensor =
*(tmp_index_var->GetMutable<paddle::framework::LoDTensor>());
tmp_index_tensor.Resize(out_inx_dim);
int *tmp_index_data =
tmp_index_tensor.mutable_data<int>(platform::CPUPlace());
auto out_dims = inx[0].dims();
size_t out_dim_sum = 0;
for (size_t index = 0; index < inx.size(); index++) {
auto inx_dims = inx[index].dims();
out_dim_sum += inx_dims[axis];
tmp_index_data[index] = inx_dims[axis];
}
out_inx.ShareDataWith(tmp_index_tensor);
// get input array items' dims
out_dims[axis] = out_dim_sum;
out.Resize(out_dims);
LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
    // Invoke concat Op to do the actual concatenation along `axis`
auto concat_op = framework::OpRegistry::CreateOp(
"concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);
concat_op->Run(scope, place);
}
};
class LoDTensorArray2TensorOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input LoDTensorArray of tensor_array_to_tensor operator.");
AddOutput("Out", "Output tensor of tensor_array_to_tensor operator.");
AddOutput("OutIndex",
"Output input LoDTensorArray items' dims of "
"tensor_array_to_tensor operator.");
AddAttr<int>("axis",
"The axis along which the input tensors will be concatenated.")
.SetDefault(0);
AddComment(R"DOC(
tensor_array_to_tensor Operator.
Concatenate the input LoDTensorArray along dimension axis to the output Tensor.
Examples:
Input = {[1,2], [3,4], [5,6]}
axis = 0
Output = [[1,2],
[3,4],
[5,6]]
OutputIndex = [1,1,1]
)DOC");
}
};
class LoDTensorArray2TensorOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {}
};
class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {}
};
class LoDTensorArray2TensorGradInferVarType
: public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) {
block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
}
}
};
class LoDTensorArray2TensorGradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto axis = Attr<int>("axis");
framework::AttributeMap attrs;
attrs["axis"] = axis;
auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
const size_t n = inx.size();
PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0.");
std::string base_name = Inputs("X")[0];
std::vector<std::string> names;
LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
// grad
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
std::vector<std::string> grad_names;
LodTensorVectorResizeFromLodTensorArray(scope, "grad_name", Input("X"),
&grad_names);
auto concat_grad_op = framework::OpRegistry::CreateOp(
"concat_grad", {{"X", names}, {"Out@GRAD", {dout_name}}},
{{"X@GRAD", grad_names}}, attrs);
concat_grad_op->Run(scope, place);
LodTensorArrayCreateFromLodTensorArray(scope, Input("X"), dx_name);
auto &grad_inx =
*scope.FindVar(dx_name)->GetMutable<framework::LoDTensorArray>();
for (size_t i = 0; i < grad_names.size(); i++) {
std::string var_name = grad_names[i];
auto &feed_input = scope.FindVar(var_name)->Get<framework::LoDTensor>();
grad_inx[i].ShareDataWith(feed_input);
}
}
};
} // namespace operators
} // namespace paddle
USE_OP(concat);
namespace ops = paddle::operators;
REGISTER_OPERATOR(tensor_array_to_tensor, ops::LoDTensorArray2TensorOp,
ops::LoDTensorArray2TensorOpMaker,
ops::LoDTensorArray2TensorOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(tensor_array_to_tensor_grad, ops::LoDTensorArray2TensorGradOp,
ops::LoDTensorArray2TensorGradInferShape,
ops::LoDTensorArray2TensorGradInferVarType);
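
At the Python level this operator is exposed as fluid.layers.tensor_array_to_tensor (added to tensor.py further down in this change). A hedged usage sketch, assuming the wrapper returns both the concatenated Out tensor and the OutIndex tensor, as the op's outputs suggest:

import paddle.fluid as fluid

# Write the same tensor twice into a LoDTensorArray, then concatenate the
# array along axis 0; out_index holds each item's extent along that axis.
x = fluid.layers.data(name='x', shape=[2], dtype='float32')
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
array = fluid.layers.array_write(x, i)
i = fluid.layers.increment(i)
fluid.layers.array_write(x, i, array=array)
out, out_index = fluid.layers.tensor_array_to_tensor(input=array, axis=0)
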
@@ -34,7 +34,7 @@ namespace operators {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;

-namespace {
+namespace {  // NOLINT

TRT_DT FluidDataType2TRT(FluidDT type) {
  switch (type) {
@@ -60,7 +60,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
  return nvinfer1::DimsCHW(shape[1], 1, 1);
}

-}  // namespace
+}  // namespace  // NOLINT

using inference::Singleton;
using inference::tensorrt::TRT_EngineManager;
@@ -127,9 +127,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    // Convert output tensor from engine to fluid
    int output_index = 0;
-   VLOG(4) << "TensorRT Engine Op Outputs:";
+   VLOG(40) << "TensorRT Engine Op Outputs:";
    for (const auto& y : context.Outputs("Ys")) {
-     VLOG(4) << y;
+     VLOG(40) << y;
      // convert output and copy to fluid.
      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
      auto dims = trt_t->getDimensions();
@@ -167,7 +167,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
 protected:
  void Prepare(const framework::ExecutionContext& context) const {
-   VLOG(4) << "Prepare engine";
+   VLOG(40) << "Prepare engine";
    // Get the ProgramDesc and pass to convert.
    framework::proto::BlockDesc block_desc;
    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
@@ -192,12 +192,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    engine->InitNetwork();

    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
-   VLOG(4) << "parsed var size " << block.AllVars().size();
+   VLOG(40) << "parsed var size " << block.AllVars().size();
    // Add inputs
-   VLOG(4) << "declare inputs";
+   VLOG(40) << "declare inputs";
    for (auto& input : context.Inputs("Xs")) {
      if (parameters.count(input)) continue;
-     VLOG(4) << "declare input " << input;
+     VLOG(40) << "declare input " << input;
      auto* var = block.FindVar(input);
      // TensorRT engine need to create parameters. The parameter's description
      // should be set in
...
@@ -129,15 +129,15 @@ class WhileGradOp : public framework::OperatorBase {
    for (auto cur_scope_iter = step_scopes->rbegin();
         cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
-     VLOG(3) << "Start backward at time_step "
-             << cur_scope_iter - step_scopes->rbegin();
+     VLOG(30) << "Start backward at time_step "
+              << cur_scope_iter - step_scopes->rbegin();
      framework::Scope &cur_scope = **cur_scope_iter;
      // Link OG from outside to inside
      for (size_t i = 0; i < outside_og_names.size(); ++i) {
        auto outside_og_name = outside_og_names[i];
        auto inside_og_name = inside_og_names[i];
-       VLOG(8) << "Linking outside " << outside_og_name << " --> inside "
-               << inside_og_name;
+       VLOG(80) << "Linking outside " << outside_og_name << " --> inside "
+                << inside_og_name;
        if (scope.FindVar(outside_og_name) == nullptr) {
          continue;
        }
@@ -159,11 +159,11 @@ class WhileGradOp : public framework::OperatorBase {
          auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
          auto &inside_array =
              detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
-         VLOG(8) << outside_og_name << " size = " << outside_array.size();
+         VLOG(80) << outside_og_name << " size = " << outside_array.size();
          inside_array.resize(outside_array.size());
          for (size_t j = 0; j < inside_array.size(); ++j) {
-           VLOG(8) << j << " " << outside_array[j].numel();
+           VLOG(80) << j << " " << outside_array[j].numel();
            if (outside_array[j].numel() != 0) {
              inside_array[j].set_lod(outside_array[j].lod());
              inside_array[j].ShareDataWith(outside_array[j]);
@@ -289,7 +289,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
    auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
    for (auto &each_ig : igs) {
      if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) {
-       VLOG(8) << "Ignore " << each_ig;
+       VLOG(80) << "Ignore " << each_ig;
        each_ig = framework::kEmptyVarName;
      }
    }
@@ -353,8 +353,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
      auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
      auto *g_var = block->FindVarRecursive(pg_ig_names[i]);
      if (g_var != nullptr) {  // Gradient could be @EMPTY@
-       VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
-               << " type: " << p_var.GetType();
+       VLOG(50) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
+                << " type: " << p_var.GetType();
        g_var->SetType(p_var.GetType());
        g_var->SetDataType(p_var.GetDataType());
      }
...
@@ -207,7 +207,10 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
                            << "." << (driver_version_ % 100) / 10
                            << ", Runtime Version: " << runtime_version_ / 1000
                            << "." << (runtime_version_ % 100) / 10;
+   size_t cudnn_dso_ver = dynload::cudnnGetVersion();
+   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
+                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
+                           << (cudnn_dso_ver % 100) / 10 << ".";
  callback_manager_.reset(new StreamCallbackManager(stream_));
}
...
@@ -203,7 +203,7 @@ class DeviceTracerImpl : public DeviceTracer {
  void AddCPURecords(const std::string &anno, uint64_t start_ns,
                     uint64_t end_ns, int64_t device_id, int64_t thread_id) {
    if (anno.empty()) {
-     VLOG(1) << "Empty timeline annotation.";
+     VLOG(10) << "Empty timeline annotation.";
      return;
    }
    std::lock_guard<std::mutex> l(trace_mu_);
@@ -216,7 +216,7 @@ class DeviceTracerImpl : public DeviceTracer {
                uint32_t correlation_id, uint64_t bytes) {
    // 0 means timestamp information could not be collected for the kernel.
    if (start_ns == 0 || end_ns == 0) {
-     VLOG(3) << name << " cannot be traced";
+     VLOG(30) << name << " cannot be traced";
      return;
    }
    std::lock_guard<std::mutex> l(trace_mu_);
@@ -228,7 +228,7 @@ class DeviceTracerImpl : public DeviceTracer {
                int64_t stream_id, uint32_t correlation_id) {
    // 0 means timestamp information could not be collected for the kernel.
    if (start == 0 || end == 0) {
-     VLOG(3) << correlation_id << " cannot be traced";
+     VLOG(30) << correlation_id << " cannot be traced";
      return;
    }
    std::lock_guard<std::mutex> l(trace_mu_);
@@ -347,7 +347,7 @@ class DeviceTracerImpl : public DeviceTracer {
        tracer->AddAnnotation(cbInfo->correlationId, anno);
      }
    } else {
-     VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
+     VLOG(10) << "Unhandled API Callback for " << domain << " " << cbid;
    }
  }
  CUpti_SubscriberHandle subscriber_;
...
@@ -65,51 +65,54 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 * include all needed cudnn functions in HPPL
 * different cudnn version has different interfaces
 **/
#define CUDNN_DNN_ROUTINE_EACH(__macro)                    \
  __macro(cudnnSetTensor4dDescriptor);                     \
  __macro(cudnnSetTensor4dDescriptorEx);                   \
  __macro(cudnnSetTensorNdDescriptor);                     \
  __macro(cudnnGetTensorNdDescriptor);                     \
  __macro(cudnnGetConvolutionNdForwardOutputDim);          \
  __macro(cudnnGetConvolutionForwardAlgorithm);            \
  __macro(cudnnCreateTensorDescriptor);                    \
  __macro(cudnnDestroyTensorDescriptor);                   \
  __macro(cudnnCreateFilterDescriptor);                    \
  __macro(cudnnSetFilter4dDescriptor);                     \
  __macro(cudnnSetFilterNdDescriptor);                     \
  __macro(cudnnGetFilterNdDescriptor);                     \
  __macro(cudnnSetPooling2dDescriptor);                    \
  __macro(cudnnSetPoolingNdDescriptor);                    \
  __macro(cudnnGetPoolingNdDescriptor);                    \
  __macro(cudnnDestroyFilterDescriptor);                   \
  __macro(cudnnCreateConvolutionDescriptor);               \
  __macro(cudnnCreatePoolingDescriptor);                   \
  __macro(cudnnDestroyPoolingDescriptor);                  \
  __macro(cudnnSetConvolution2dDescriptor);                \
  __macro(cudnnDestroyConvolutionDescriptor);              \
  __macro(cudnnSetConvolutionNdDescriptor);                \
  __macro(cudnnGetConvolutionNdDescriptor);                \
  __macro(cudnnDeriveBNTensorDescriptor);                  \
  __macro(cudnnCreateSpatialTransformerDescriptor);        \
  __macro(cudnnSetSpatialTransformerNdDescriptor);         \
  __macro(cudnnDestroySpatialTransformerDescriptor);       \
  __macro(cudnnSpatialTfGridGeneratorForward);             \
  __macro(cudnnSpatialTfGridGeneratorBackward);            \
  __macro(cudnnSpatialTfSamplerForward);                   \
  __macro(cudnnSpatialTfSamplerBackward);                  \
  __macro(cudnnCreate);                                    \
  __macro(cudnnDestroy);                                   \
  __macro(cudnnSetStream);                                 \
  __macro(cudnnActivationForward);                         \
  __macro(cudnnConvolutionForward);                        \
  __macro(cudnnConvolutionBackwardBias);                   \
  __macro(cudnnGetConvolutionForwardWorkspaceSize);        \
  __macro(cudnnTransformTensor);                           \
  __macro(cudnnPoolingForward);                            \
  __macro(cudnnPoolingBackward);                           \
  __macro(cudnnSoftmaxBackward);                           \
  __macro(cudnnSoftmaxForward);                            \
  __macro(cudnnGetVersion);                                \
+ __macro(cudnnFindConvolutionForwardAlgorithmEx);         \
+ __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx);  \
+ __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);    \
  __macro(cudnnGetErrorString);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
...
@@ -72,8 +72,8 @@ static inline std::string join(const std::string& part1,
static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
                                                int dynload_flags) {
- VLOG(3) << "Try to find library: " << dso_path
-         << " from default system path.";
+ VLOG(30) << "Try to find library: " << dso_path
+          << " from default system path.";
  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
  // and /usr/local/lib path
  void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
...
@@ -124,8 +124,8 @@ size_t GpuMaxChunkSize() {
  size_t available = 0;
  GpuMemoryUsage(&available, &total);
- VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-          << total / 1024 / 1024 << "M";
+ VLOG(100) << "GPU Usage " << available / 1024 / 1024 << "M/"
+           << total / 1024 / 1024 << "M";
  size_t reserving = static_cast<size_t>(0.05 * total);
  // If available less than minimum chunk size, no usable memory exists.
  available =
...
@@ -48,7 +48,7 @@ void InitGflags(std::vector<std::string> argv) {
      line += ' ';
    }
    google::ParseCommandLineFlags(&argc, &arr, true);
-   VLOG(1) << "Init commandline: " << line;
+   VLOG(10) << "Init commandline: " << line;
  });
}
...
@@ -112,7 +112,7 @@ struct NCCLContextMap {
      NCCLGroupGuard gurad;
      for (auto &gpu_id : order_) {
        int rank = trainer_id * order_.size() + gpu_id;
-       VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks;
+       VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks;
        PADDLE_ENFORCE(cudaSetDevice(gpu_id));
        PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
            comms.get() + gpu_id, nranks, *nccl_id, rank));
...
@@ -61,9 +61,9 @@ struct variant_caster<V<Ts...>> {
    if (std::is_same<T, std::vector<float>>::value) {
      auto caster_ints = make_caster<std::vector<int64_t>>();
      if (caster_ints.load(src, convert)) {
-       VLOG(4) << "This value are floats and int64_ts satisfy "
-                  "simultaneously, will set it's type to "
-                  "std::vector<int64_t>";
+       VLOG(40) << "This value are floats and int64_ts satisfy "
+                   "simultaneously, will set it's type to "
+                   "std::vector<int64_t>";
        value = cast_op<std::vector<int64_t>>(caster_ints);
        return true;
      }
...
@@ -40,7 +40,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::unique_ptr<paddle::framework::ProgramDesc> Load(
    paddle::framework::Executor* executor, const std::string& model_filename) {
- VLOG(3) << "loading model from " << model_filename;
+ VLOG(30) << "loading model from " << model_filename;
  std::string program_desc_str;
  ReadBinaryFile(model_filename, &program_desc_str);
...
@@ -614,7 +614,24 @@ EOF
        CMD='"true"'
    fi

-   cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+   if [ "$1" == "cp35-cp35m" ]; then
+       cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+   ADD python/dist/*.whl /
+   # run paddle version to install python packages first
+   RUN apt-get update && ${NCCL_DEPS}
+   RUN apt-get install -y wget python3 python3-pip libgtk2.0-dev dmidecode python3-tk && \
+       pip3 install opencv-python && pip3 install /*.whl; apt-get install -f -y && \
+       apt-get clean -y && \
+       rm -f /*.whl && \
+       ${PADDLE_VERSION} && \
+       ldconfig
+   ${DOCKERFILE_CUDNN_DSO}
+   ${DOCKERFILE_CUBLAS_DSO}
+   ${DOCKERFILE_GPU_ENV}
+   ENV NCCL_LAUNCH_MODE PARALLEL
+EOF
+   else
+       cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
    ADD python/dist/*.whl /
    # run paddle version to install python packages first
    RUN apt-get update && ${NCCL_DEPS}
@@ -629,6 +646,8 @@ EOF
    ${DOCKERFILE_GPU_ENV}
    ENV NCCL_LAUNCH_MODE PARALLEL
EOF
+   fi

    if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
    ADD go/cmd/pserver/pserver /usr/bin/
@@ -703,7 +722,7 @@ function main() {
      build)
        cmake_gen ${PYTHON_ABI:-""}
        build
-       gen_dockerfile
+       gen_dockerfile ${PYTHON_ABI:-""}
        ;;
      build_android)
        build_android
@@ -730,7 +749,7 @@ function main() {
        gen_html
        ;;
      dockerfile)
-       gen_dockerfile
+       gen_dockerfile ${PYTHON_ABI:-""}
        ;;
      capi)
        cmake_gen ${PYTHON_ABI:-""}
...
@@ -118,7 +118,7 @@ void generateSequenceStartPositions(size_t batchSize,
      }
      buf[i] = pos;
      pos += len;
-     VLOG(1) << " len=" << len;
+     VLOG(10) << " len=" << len;
    }
    buf[numSeqs] = batchSize;
}
...
@@ -34,6 +34,7 @@ from . import regularizer
from . import average
from . import metrics
from . import transpiler
+from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
@@ -126,7 +127,8 @@ def __bootstrap__():
    if core.is_compiled_with_cuda():
        read_env_flags += [
-           'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
+           'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
+           'conv_workspace_size_limit', 'cudnn_exhaustive_search'
        ]
        core.init_gflags([sys.argv[0]] +
                         ["--tryfromenv=" + ",".join(read_env_flags)])
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
LOOKUP_TABLE_TYPE = "lookup_table"
def find_distributed_lookup_table(program):
"""
    Find the distributed lookup table in a program.
    We only support one distributed table now.
    :param program: the Program to search
    :return: table_name or None
"""
table_name = None
for op in program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE:
if op.attr('is_distributed') is True:
if table_name is None:
table_name = op.input("W")[0]
if table_name != op.input("W")[0]:
raise RuntimeError("all distributed lookup_table_ops"
" should have only one table")
else:
if table_name is not None:
assert op.input("W")[0] != table_name
return table_name
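
A hedged usage sketch for the new helper; the surrounding program setup is illustrative:

import paddle.fluid as fluid
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table

# Inspect the default main program for a distributed lookup table; the
# helper returns the table's parameter name, or None if there is none.
table_name = find_distributed_lookup_table(fluid.default_main_program())
if table_name is not None:
    print("distributed lookup table: %s" % table_name)
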
@@ -27,6 +27,7 @@ from .tensor import concat
from . import utils
from .. import unique_name
from functools import reduce
+from .. import core

__all__ = [
    'fc',
@@ -101,6 +102,7 @@ __all__ = [
    'image_resize',
    'image_resize_short',
    'resize_bilinear',
+   'resize_nearest',
    'gather',
    'scatter',
    'sequence_scatter',
@@ -158,6 +160,7 @@ __all__ = [
    'affine_grid',
    'sequence_reverse',
    'affine_channel',
+   'similarity_focus',
    'hash',
    'grid_sampler',
    'log_loss',
@@ -1665,6 +1668,20 @@ def conv2d(input,
    pre_bias = helper.create_variable_for_type_inference(dtype)

+   if use_cudnn:
+       helper.create_variable(
+           name="kCUDNNFwdAlgoCache",
+           persistable=True,
+           type=core.VarDesc.VarType.RAW)
+       helper.create_variable(
+           name="kCUDNNBwdDataAlgoCache",
+           persistable=True,
+           type=core.VarDesc.VarType.RAW)
+       helper.create_variable(
+           name="kCUDNNBwdFilterAlgoCache",
+           persistable=True,
+           type=core.VarDesc.VarType.RAW)
+
    helper.append_op(
        type=l_type,
        inputs={
@@ -1678,7 +1695,7 @@ def conv2d(input,
            'dilations': dilation,
            'groups': groups,
            'use_cudnn': use_cudnn,
-           'use_mkldnn': False
+           'use_mkldnn': False,
        })

    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -5640,7 +5657,8 @@ def image_resize(input,
                 out_shape=None,
                 scale=None,
                 name=None,
-                resample='BILINEAR'):
+                resample='BILINEAR',
+                actual_shape=None):
    """
    **Resize a Batch of Images**
@@ -5650,6 +5668,7 @@ def image_resize(input,
    Supporting resample methods:

        'BILINEAR' : Bilinear interpolation
+       'NEAREST' : Nearest neighbor interpolation

    Args:
        input (Variable): The input tensor of image resize layer,
@@ -5664,25 +5683,51 @@ def image_resize(input,
                           Default: None
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.
-       resample(str): The resample method. It can only be 'BILINEAR' currently.
+       resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST'
+                      currently.
                       Default: 'BILINEAR'
+       actual_shape(Variable): An optional input to specify output shape
+                               dynamically. If provided, image resize
+                               according to this given shape rather than
+                               :attr:`out_shape` and :attr:`scale` specifying
+                               shape. That is to say actual_shape has the
+                               highest priority. It is recommended to use
+                               actual_shape instead of :attr:`out_shape` if you
+                               want to specify output shape dynamically. When
+                               using actual_shape to specify output shape, one of
+                               :attr:`out_shape` and :attr:`scale` should also be
+                               set, otherwise errors would be occured in graph
+                               constructing stage.
+                               Default: None

    Returns:
        Variable: The output is a 4-D tensor of the shape
        (num_batches, channls, out_h, out_w).

+   Raises:
+       TypeError: out_shape should be a list or tuple or Variable.
+       TypeError: actual_shape should either be Variable or None.
+       ValueError: The 'resample' of image_resize can only be 'BILINEAR'
+                   or 'NEAREST' currently.
+       ValueError: One of out_shape and scale must not be None.
+       ValueError: out_shape length should be 2.

    Examples:
        .. code-block:: python

            out = fluid.layers.image_resize(input, out_shape=[12, 12])
    """
-   resample_methods = {'BILINEAR': 'bilinear_interp'}
+   resample_methods = {
+       'BILINEAR': 'bilinear',
+       'NEAREST': 'nearest',
+   }
    if resample not in resample_methods:
        raise ValueError(
-           "The 'resample' of image_resize can only be 'BILINEAR' currently.")
+           "The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
+       )
    if out_shape is None and scale is None:
-       raise ValueError("One of out_shape and scale must not be None")
-   helper = LayerHelper('bilinear_interp', **locals())
+       raise ValueError("One of out_shape and scale must not be None.")
+   helper = LayerHelper('interpolate', **locals())
    dtype = helper.input_dtype()

    def _is_list_or_turple_(data):
@@ -5692,33 +5737,106 @@ def image_resize(input,
    out_w = 0
    inputs = {"X": input}
    if out_shape is not None:
-       if not (_is_list_or_turple_(out_shape) and
-               len(out_shape) == 2) and not isinstance(out_shape, Variable):
-           raise ValueError('out_shape should be a list or tuple or variable')
-       if _is_list_or_turple_(out_shape):
-           out_shape = list(map(int, out_shape))
-           out_h = out_shape[0]
-           out_w = out_shape[1]
-       else:
+       if isinstance(out_shape, Variable):
+           warnings.warn("out_shape as Variable type is deprecated, \
+                   it is recommended to use actual_shape instead of \
+                   out_shape to specify output shape dynamically.")
            inputs['OutSize'] = out_shape
+       elif not (_is_list_or_turple_(out_shape)):
+           raise TypeError("out_shape should be a list or tuple or Variable.")
+       elif len(out_shape) != 2:
+           raise ValueError("out_shape length should be 2.")
+       out_shape = list(map(int, out_shape))
+       out_h = out_shape[0]
+       out_w = out_shape[1]
    else:
        out_h = int(input.shape[2] * scale)
        out_w = int(input.shape[3] * scale)

+   if isinstance(actual_shape, Variable):
+       inputs["OutSize"] = actual_shape
+   elif actual_shape is not None:
+       raise TypeError("actual_shape should either be Variable or None.")

    out = helper.create_variable_for_type_inference(dtype)
    helper.append_op(
-       type=resample_methods[resample],
+       type='interpolate',
        inputs=inputs,
        outputs={"Out": out},
-       attrs={"out_h": out_h,
-              "out_w": out_w})
+       attrs={
+           "out_h": out_h,
+           "out_w": out_w,
+           "interp_method": resample_methods[resample]
+       })
    return out


-@templatedoc(op_type="bilinear_interp")
-def resize_bilinear(input, out_shape=None, scale=None, name=None):
+@templatedoc(op_type="interpolate")
+def resize_bilinear(input,
+                    out_shape=None,
+                    scale=None,
+                    name=None,
+                    actual_shape=None):
    """
-   ${comment}
+   Resize input by performing bilinear interpolation based on given
output shape which specified by actual_shape, out_shape and scale
in priority order.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this op) on a rectilinear 2D grid. The key idea is
to perform linear interpolation first in one direction, and then
again in the other direction.
For details of bilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation
Args:
input(${x_type}): ${x_comment}.
out_shape(${out_size_type}): ${out_size_comment}.
scale(float|None): The multiplier for the input height or width. At
least one of out_shape or scale must be set. And out_shape has
a higher priority than scale. Default: None.
name(str|None): The output variable name.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
according to this given shape rather than
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
actual_shape instead of :attr:`out_shape` if you
want to specify output shape dynamically. When
using actual_shape to specify output shape, one of
:attr:`out_shape` and :attr:`scale` should also be
set, otherwise errors would be occured in graph
constructing stage.
Default: None
Returns:
${out_comment}.
"""
return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape)
@templatedoc(op_type="interpolate")
def resize_nearest(input,
out_shape=None,
scale=None,
name=None,
actual_shape=None):
"""
Resize input by performing nearest neighbor interpolation in both the
3rd dimention(in height direction) and the 4th dimention(in width
direction) based on given output shape which specified by actual_shape,
out_shape and scale in priority order.
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
    Args:
        input(${x_type}): ${x_comment}.
@@ -5730,12 +5848,25 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
            a higher priority than scale. Default: None.
        name(str|None): The output variable name.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
according to this given shape rather than
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
actual_shape instead of :attr:`out_shape` if you
want to specify output shape dynamically. When
using actual_shape to specify output shape, one of
:attr:`out_shape` and :attr:`scale` should also be
set, otherwise errors would be occured in graph
constructing stage.
Default: None
    Returns:
        ${out_comment}.
    """
-   return image_resize(input, out_shape, scale, name, 'BILINEAR')
+   return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape)
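
Both wrappers forward to image_resize and differ only in the interp_method handed to the interpolate op. A hedged sketch of the priority rules described above (actual_shape over out_shape over scale); shapes and names are illustrative:

import paddle.fluid as fluid

data = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')

# fixed output shape: bilinear resize to 12 x 12
out1 = fluid.layers.resize_bilinear(data, out_shape=[12, 12])

# scale factor: nearest-neighbor resize to 64 x 64
out2 = fluid.layers.resize_nearest(data, scale=2.0)

# dynamic shape: actual_shape wins at runtime, but one of
# out_shape/scale must still be given for graph construction
shape_t = fluid.layers.data(name='shape', shape=[2], dtype='int32')
out3 = fluid.layers.resize_nearest(
    data, out_shape=[12, 12], actual_shape=shape_t)
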


def image_resize_short(input, out_short_len, resample='BILINEAR'):
@@ -7803,6 +7934,118 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
    return out

def similarity_focus(input, axis, indexes, name=None):
"""
SimilarityFocus Operator
Generate a similarity focus mask with the same shape as the input, using the following method:
1. Extract the 3-D tensor (here the first dimension is BatchSize) corresponding
to the axis according to the indexes. For example, if axis=1 and indexes=[a],
it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
2. For each index, find the largest numbers in the tensor T, so that each
row and each column contains at most one number (that is, once the
largest number has been found in the i-th row and the j-th column, the
remaining numbers in that row and column are skipped, and the next
largest number is selected from the rest; obviously min(B, C) numbers
will be selected in total). Mark the corresponding positions of the
3-D similarity focus mask as 1, otherwise as 0. Then do an element-wise
OR over the masks of all indexes.
3. Broadcast the 3-D similarity focus mask to the same shape as input X.
Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
.. code-block:: text
* Example :
Given a 4-D tensor x with the shape (BatchSize, C, A, B), where C is
the number of channels and the shape of feature map is (A, B):
x.shape = (2, 3, 2, 2)
x.data = [[[[0.8, 0.1],
[0.4, 0.5]],
[[0.9, 0.7],
[0.9, 0.9]],
[[0.8, 0.9],
[0.1, 0.2]]],
[[[0.2, 0.5],
[0.3, 0.4]],
[[0.9, 0.7],
[0.8, 0.4]],
[[0.0, 0.2],
[0.4, 0.7]]]]
Given axis: 1 (the axis of the channel)
Given indexes: [0]
then we get a 4-D tensor out with the same shape of input x:
out.shape = (2, 3, 2, 2)
out.data = [[[[1.0, 0.0],
[0.0, 1.0]],
[[1.0, 0.0],
[0.0, 1.0]],
[[1.0, 0.0],
[0.0, 1.0]]],
[[[0.0, 1.0],
[1.0, 0.0]],
[[0.0, 1.0],
[1.0, 0.0]],
[[0.0, 1.0],
[1.0, 0.0]]]]
Args:
input(Variable): The input tensor variable(default float). It should
be a 4-D tensor with shape [BatchSize, A, B, C].
axis(int): Indicating the dimension to be selected. It can only be
1, 2 or 3.
indexes(list): Indicating the indexes of the selected dimension.
Returns:
Variable: A tensor variable with the same shape and same type
as the input.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[2, 3, 2, 2], dtype='float32')
x = fluid.layers.similarity_focus(input=data, axis=1, indexes=[0])
"""
helper = LayerHelper('similarity_focus', **locals())
# check attrs
if not isinstance(axis, int):
raise TypeError("axis must be int type.")
if not isinstance(indexes, list):
raise TypeError("indexes must be list type.")
if axis not in (1, 2, 3):
raise ValueError("axis must be 1, 2 or 3.")
if len(indexes) == 0:
raise ValueError("indexes can not be empty.")
if name is None:
out = helper.create_variable_for_type_inference(dtype=input.dtype)
else:
out = helper.create_variable(
name=name, dtype=input.dtype, persistable=False)
helper.append_op(
type='similarity_focus',
inputs={'X': input},
outputs={'Out': out},
attrs={"axis": axis,
"indexes": indexes})
return out
def hash(input, hash_size, num_hash=1, name=None):
"""
Hash the input to an integer whose value is less than the given hash size.
......
...@@ -24,10 +24,10 @@ from .layer_function_generator import templatedoc
import numpy
__all__ = [
'create_tensor', 'create_parameter', 'create_global_var', 'cast',
'tensor_array_to_tensor', 'concat', 'sums', 'assign',
'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite'
]
...@@ -193,6 +193,60 @@ def concat(input, axis=0, name=None):
return out
def tensor_array_to_tensor(input, axis=1, name=None):
"""
This function concatenates the items of the input LoDTensorArray along
the given axis and returns the result as the output.
A simple example is shown below:
.. code-block:: text
Given:
input.data = {[[0.6, 0.1, 0.3],
[0.5, 0.3, 0.2]],
[[1.3],
[1.8]],
[[2.3, 2.1],
[2.5, 2.4]]}
axis = 1
Then:
output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1],
[0.5, 0.3, 0.2, 1.8, 2.5, 2.4]]
output_index.data = [3, 1, 2]
Args:
input(list): Input LodTensorArray
axis(int): Integer axis along which the tensors will be concatenated
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: Output variable of the concatenation
Variable: The input LodTensorArray items' dims along the axis
Examples:
.. code-block:: python
output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array)
"""
helper = LayerHelper('tensor_array_concat', **locals())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
out_index = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op(
type='tensor_array_concat',
inputs={'X': input},
outputs={'Out': [out],
'OutIndex': [out_index]},
attrs={'axis': axis})
return out, out_index
def sums(input, out=None):
"""
This function performs the sum operation on the input and returns the
......
...@@ -13,21 +13,23 @@
# limitations under the License.
from __future__ import print_function
import re
import sys
from collections import defaultdict
from contextlib import contextmanager
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from . import framework
from . import layers
from . import unique_name
from .backward import append_backward
from .clip import append_gradient_clip_ops, error_clip_callback
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
from .regularizer import append_regularization_ops
from .layers import ops
__all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
...@@ -85,7 +87,7 @@ class Optimizer(object):
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='float32' if self._dtype is None else self._dtype,
persistable=True)
def _global_learning_rate(self, program=None):
...@@ -245,6 +247,50 @@ class Optimizer(object):
end = len(global_block.ops)
return global_block._slice_ops(start, end)
def _process_distribute_lookuptable(self, param_grads, loss,
startup_program):
"""
Because distribute lookup table only support SGD optimizer for now, not support
other optimizer and regularization, so we should find the table parameter out,
and avoid to add regularization and other op for it, and add sgd optimize op
for it independently.
:param param_grads(list((Var, Var))): list of (param, grad) pair.
:param loss: the loss variable.
:param startup_program: the startup program
"""
program = loss.block.program
table_name = find_distributed_lookup_table(program)
table_param = None
table_grad = None
new_param_grads = []
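# Separate the distributed lookup table's (param, grad) pair from the
# ordinary pairs; the table is optimized by a dedicated SGD op appended below.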
for p, g in param_grads:
if p.name == table_name:
if table_param is not None:
raise RuntimeError(
"multi dist table var found, only support one now!")
table_param = p
table_grad = g
else:
new_param_grads.append((p, g))
sgd_op = None
if table_param is not None:
with program_guard(program, startup_program):
param_and_grad = [table_param, table_grad]
with table_param.block.program._optimized_guard(param_and_grad), \
framework.name_scope("optimizer"):
self._create_global_learning_rate()
# create the optimize op
sgd_op = loss.block.append_op(
type='sgd',
inputs={
"Param": table_param,
"Grad": table_grad,
"LearningRate":
self._create_param_lr(param_and_grad)
},
outputs={"ParamOut": param_and_grad[0]})
return new_param_grads, (table_param, table_grad), sgd_op
def minimize(self,
loss,
startup_program=None,
...@@ -260,6 +306,9 @@ class Optimizer(object):
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads, loss, startup_program)
params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any
...@@ -268,6 +317,9 @@ class Optimizer(object):
optimize_ops = self._create_optimization_pass(params_grads, loss,
startup_program)
if table_optimize_op is not None:
optimize_ops.append(table_optimize_op)
params_grads.append(table_param_and_grad)
return optimize_ops, params_grads
......
...@@ -38,7 +38,7 @@ depth = 8
mix_hidden_lr = 1e-3
IS_SPARSE = True
PASS_NUM = 1
BATCH_SIZE = 10
embedding_name = 'emb'
......
...@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
def setUp(self):
self.op_type = "conv2d"
self.use_cudnn = False
self.exhaustive_search = False
self.use_cuda = False
self.use_mkldnn = False
self.data_format = "AnyLayout"
...@@ -98,7 +99,8 @@ class TestConv2dOp(OpTest):
'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'data_format': self.data_format,
'exhaustive_search': self.exhaustive_search
}
self.outputs = {'Output': output}
...@@ -361,6 +363,12 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp):
self.op_type = "depthwise_conv2d"
class TestCUDNNExhaustiveSearch(TestConv2dOp):
def init_kernel_type(self):
self.use_cudnn = True
self.exhaustive_search = True
# Please Don't remove the following code.
# Currently, CI use cudnn V5.0 which not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation):
......
...@@ -335,6 +335,12 @@ class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
self.check_output_with_place(place, atol=2e-2)
class TestCUDNNExhaustiveSearch(TestCUDNN):
def init_kernel_type(self):
self.use_cudnn = True
self.exhaustive_search = True
# FIXME(typhoonzero): find a way to determine if
# using cudnn > 6 in python
# class TestWithDilationCUDNN(TestWithDilation):
......
...@@ -567,7 +567,6 @@ class TestDistLookupTable(TestDistLookupTableBase):
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'uniform_random',
'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
'fake_init'
...@@ -639,7 +638,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
# 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer, trainer_startup = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
...@@ -653,6 +652,16 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
'recv', 'concat'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
startup_ops = [
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'uniform_random',
'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
'fake_init'
]
self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
startup_ops)
class TestDistLookupTableSliceSize(TestDistLookupTableBase):
......
...@@ -20,10 +20,44 @@ from op_test import OpTest
import paddle.fluid.core as core
def nearest_neighbor_interp_np(X,
out_h,
out_w,
out_size=None,
actual_shape=None):
"""nearest neighbor interpolation implement in shape [N, C, H, W]"""
if out_size is not None:
out_h = out_size[0]
out_w = out_size[1]
if actual_shape is not None:
out_h = actual_shape[0]
out_w = actual_shape[1]
n, c, in_h, in_w = X.shape
ratio_h = ratio_w = 0.0
if out_h > 1:
ratio_h = (in_h - 1.0) / (out_h - 1.0)
if out_w > 1:
ratio_w = (in_w - 1.0) / (out_w - 1.0)
out = np.zeros((n, c, out_h, out_w))
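# map each output pixel to its nearest source pixel: adding 0.5 before
# int() truncation rounds ratio * index to the nearest integer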
for i in range(out_h):
in_i = int(ratio_h * i + 0.5)
for j in range(out_w):
in_j = int(ratio_w * j + 0.5)
out[:, :, i, j] = X[:, :, in_i, in_j]
return out.astype(X.dtype)
def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
"""bilinear interpolation implement in shape [N, C, H, W]"""
if out_size is not None:
out_h = out_size[0]
out_w = out_size[1]
if actual_shape is not None:
out_h = actual_shape[0]
out_w = actual_shape[1]
batch_size, channel, in_h, in_w = input.shape
if out_h > 1:
ratio_h = (in_h - 1.0) / (out_h - 1.0)
...@@ -53,18 +87,32 @@ def bilinear_interp_np(input, out_h, out_w, out_size):
return out.astype(input.dtype)
INTERPOLATE_FUNCS = {
'bilinear': bilinear_interp_np,
'nearest': nearest_neighbor_interp_np,
}
class TestInterpolateOp(OpTest):
def setUp(self):
self.out_size = None
self.actual_shape = None
self.init_test_case()
self.op_type = "interpolate"
input_np = np.random.random(self.input_shape).astype("float32")
output_np = INTERPOLATE_FUNCS[self.interp_method](
input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
self.inputs = {'X': input_np}
if self.out_size is not None:
self.inputs['OutSize'] = self.out_size
if self.actual_shape is not None:
self.inputs['OutSize'] = self.actual_shape
self.attrs = {
'out_h': self.out_h,
'out_w': self.out_w,
'interp_method': self.interp_method
}
self.outputs = {'Out': output_np}
def test_check_output(self):
...@@ -74,90 +122,209 @@ class TestBilinearInterpOp(OpTest):
self.check_grad(['X'], 'Out', in_place=True)
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [2, 3, 4, 4]
self.out_h = 2
self.out_w = 2
self.out_size = np.array([3, 3]).astype("int32")
class TestBilinearInterpCase1(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [4, 1, 7, 8]
self.out_h = 1
self.out_w = 1
class TestBilinearInterpCase2(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [3, 3, 9, 6]
self.out_h = 12
self.out_w = 12
class TestBilinearInterpCase3(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [1, 1, 128, 64]
self.out_h = 64
self.out_w = 128
class TestBilinearInterpCase4(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [4, 1, 7, 8]
self.out_h = 1
self.out_w = 1
self.out_size = np.array([2, 2]).astype("int32")
class TestBilinearInterpCase5(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [3, 3, 9, 6]
self.out_h = 12
self.out_w = 12
self.out_size = np.array([11, 11]).astype("int32")
class TestBilinearInterpCase6(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [1, 1, 128, 64]
self.out_h = 64
self.out_w = 128
self.out_size = np.array([65, 129]).astype("int32")
class TestBilinearInterpActualShape(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [3, 2, 32, 16]
self.out_h = 64
self.out_w = 32
self.out_size = np.array([66, 40]).astype("int32")
class TestBilinearInterpBigScale(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [4, 4, 64, 32]
self.out_h = 100
self.out_w = 50
self.out_size = np.array([101, 51]).astype('int32')
class TestInterpolateOpUint8(OpTest):
def setUp(self):
self.out_size = None
self.actual_shape = None
self.init_test_case()
self.op_type = "interpolate"
input_np = np.random.randint(
low=0, high=256, size=self.input_shape).astype("uint8")
output_np = INTERPOLATE_FUNCS[self.interp_method](
input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
self.inputs = {'X': input_np}
if self.out_size is not None:
self.inputs['OutSize'] = self.out_size
self.attrs = {
'out_h': self.out_h,
'out_w': self.out_w,
'interp_method': self.interp_method
}
self.outputs = {'Out': output_np}
def test_check_output(self):
self.check_output_with_place(place=core.CPUPlace(), atol=1)
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [1, 3, 9, 6]
self.out_h = 10
self.out_w = 9
class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [2, 3, 128, 64]
self.out_h = 120
self.out_w = 50
class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [4, 1, 7, 8]
self.out_h = 5
self.out_w = 13
self.out_size = np.array([6, 15]).astype("int32")
class TestNearestNeighborInterpCase1(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [4, 1, 7, 8]
self.out_h = 1
self.out_w = 1
class TestNearestNeighborInterpCase2(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [3, 3, 9, 6]
self.out_h = 12
self.out_w = 12
class TestNearestNeighborInterpCase3(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [1, 1, 128, 64]
self.out_h = 64
self.out_w = 128
class TestNearestNeighborInterpCase4(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [4, 1, 7, 8]
self.out_h = 1
self.out_w = 1
self.out_size = np.array([2, 2]).astype("int32")
class TestNearestNeighborInterpCase5(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [3, 3, 9, 6]
self.out_h = 12
self.out_w = 12
self.out_size = np.array([11, 11]).astype("int32")
class TestNearestNeighborInterpCase6(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [1, 1, 128, 64]
self.out_h = 64
self.out_w = 128
self.out_size = np.array([65, 129]).astype("int32")
class TestNearestNeighborInterpActualShape(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [3, 2, 32, 16]
self.out_h = 64
self.out_w = 32
self.out_size = np.array([66, 40]).astype("int32")
class TestNearestNeighborInterpBigScale(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [4, 4, 64, 32]
self.out_h = 100
self.out_w = 50
self.out_size = np.array([101, 51]).astype('int32')
class TestNearestNeighborInterpCase1Uint8(TestInterpolateOpUint8):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [2, 3, 128, 64]
self.out_h = 120
self.out_w = 50
class TestNearestNeighborInterpCase2Uint8(TestInterpolateOpUint8):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [4, 1, 7, 8]
self.out_h = 5
self.out_w = 13
......
...@@ -496,6 +496,16 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_resize_nearest(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
output = layers.resize_nearest(x, out_shape=[12, 12])
self.assertIsNotNone(output)
output = layers.resize_nearest(x, scale=3)
self.assertIsNotNone(output)
print(str(program))
def test_polygon_box_transform(self):
program = Program()
with program_guard(program):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
class TestSimilarityFocusOp(OpTest):
def setUp(self):
self.op_type = "similarity_focus"
batch_size = 2
x_dim, y_dim, z_dim = 3, 2, 2
self.inputs = {
'X': np.array([[[[0.8, 0.1], [0.4, 0.5]], [[0.9, 0.7], [0.9, 0.9]],
[[0.8, 0.9], [0.1, 0.2]]],
[[[0.2, 0.5], [0.3, 0.4]], [[0.9, 0.7], [0.8, 0.4]],
[[0.0, 0.2], [0.4, 0.7]]]]),
}
self.attrs = {
'axis': 1,
'indexes': [0],
}
output = None
for batch in range(batch_size):
res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
for index in self.attrs['indexes']:
channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy(
)
tag1 = [0 for i in range(y_dim)]
tag2 = [0 for i in range(z_dim)]
cnt = 0
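# greedily pick the largest remaining value whose row and column are
# both unused, until min(y_dim, z_dim) positions have been marked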
for i in range(channel.size):
index = channel.argmax()
idx1 = index // z_dim
idx2 = index % z_dim
if tag1[idx1] + tag2[idx2] == 0:
tag1[idx1] = 1
tag2[idx2] = 1
res[index] = 1
cnt += 1
if cnt == min(y_dim, z_dim):
break
channel[index] = -1
res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0)
res = res.reshape(1, x_dim, y_dim, z_dim)
if output is not None:
output = np.concatenate((output, res), axis=0)
else:
output = res
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
class TestSimilarityFocusOp_axis1(OpTest):
def setUp(self):
self.op_type = "similarity_focus"
batch_size = 3
x_dim, y_dim, z_dim = 4, 5, 6
self.inputs = {
'X': np.random.random(
(batch_size, x_dim, y_dim, z_dim)).astype("float32"),
}
self.attrs = {
'axis': 1,
'indexes': [0, 3],
}
output = None
for batch in range(batch_size):
res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
for index in self.attrs['indexes']:
channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy(
)
tag1 = [0 for i in range(y_dim)]
tag2 = [0 for i in range(z_dim)]
cnt = 0
for i in range(channel.size):
index = channel.argmax()
idx1 = index // z_dim
idx2 = index % z_dim
if tag1[idx1] + tag2[idx2] == 0:
tag1[idx1] = 1
tag2[idx2] = 1
res[index] = 1
cnt += 1
if cnt == min(y_dim, z_dim):
break
channel[index] = -1
res = res.reshape(1, y_dim, z_dim)
res = res.repeat([x_dim], axis=0)
res = res.reshape(1, x_dim, y_dim, z_dim)
if output is not None:
output = np.concatenate((output, res), axis=0)
else:
output = res
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
class TestSimilarityFocusOp_axis2(OpTest):
def setUp(self):
self.op_type = "similarity_focus"
batch_size = 6
x_dim, y_dim, z_dim = 7, 8, 9
self.inputs = {
'X': np.random.random(
(batch_size, x_dim, y_dim, z_dim)).astype("float32"),
}
self.attrs = {
'axis': 2,
'indexes': [0, 3, 5],
}
output = None
for batch in range(batch_size):
res = np.zeros((x_dim, 1, z_dim)).astype("float32").reshape(-1)
for index in self.attrs['indexes']:
channel = self.inputs['X'][batch, :, index, :].reshape(-1).copy(
)
tag1 = [0 for i in range(x_dim)]
tag2 = [0 for i in range(z_dim)]
cnt = 0
for i in range(channel.size):
index = channel.argmax()
idx1 = index // z_dim
idx2 = index % z_dim
if tag1[idx1] + tag2[idx2] == 0:
tag1[idx1] = 1
tag2[idx2] = 1
res[index] = 1
cnt += 1
if cnt == min(x_dim, z_dim):
break
channel[index] = -1
res = res.reshape(x_dim, 1, z_dim)
res = res.repeat([y_dim], axis=1)
res = res.reshape(1, x_dim, y_dim, z_dim)
if output is not None:
output = np.concatenate((output, res), axis=0)
else:
output = res
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
class TestSimilarityFocusOp_axis3(OpTest):
def setUp(self):
self.op_type = "similarity_focus"
batch_size = 64
x_dim, y_dim, z_dim = 48, 48, 13
self.inputs = {
'X': np.random.random(
(batch_size, x_dim, y_dim, z_dim)).astype("float32"),
}
self.attrs = {
'axis': 3,
'indexes': [0, 2, 7, 9],
}
output = None
for batch in range(batch_size):
res = np.zeros((x_dim, y_dim, 1)).astype("float32").reshape(-1)
for index in self.attrs['indexes']:
channel = self.inputs['X'][batch, :, :, index].reshape(-1).copy(
)
tag1 = [0 for i in range(x_dim)]
tag2 = [0 for i in range(y_dim)]
cnt = 0
for i in range(channel.size):
index = channel.argmax()
idx1 = index // y_dim
idx2 = index % y_dim
if tag1[idx1] + tag2[idx2] == 0:
tag1[idx1] = 1
tag2[idx2] = 1
res[index] = 1
cnt += 1
if cnt == min(x_dim, y_dim):
break
channel[index] = -1
res = res.reshape(x_dim, y_dim, 1)
res = res.repeat([z_dim], axis=2)
res = res.reshape(1, x_dim, y_dim, z_dim)
if output is not None:
output = np.concatenate((output, res), axis=0)
else:
output = res
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
class TestLoDTensorArrayConcat(unittest.TestCase):
def setUp(self):
self.op_type = "tensor_array_to_tensor"
self.attrs = {"axis": 0}
self.outputs = ["Out"]
def test_get_set(self):
scope = core.Scope()
program = fluid.Program()
block = program.global_block()
input_arr = block.create_var(
name="tmp_lod_tensor_array",
type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
input_arr.persistable = True
input_arr_var = scope.var('tmp_lod_tensor_array')
input_tensor_array = input_arr_var.get_lod_tensor_array()
self.assertEqual(0, len(input_tensor_array))
cpu = core.CPUPlace()
for i in range(10):
t = core.LoDTensor()
if i == 0:
t.set(numpy.array([[i], [i]], dtype='float32'), cpu)
else:
t.set(numpy.array([[i]], dtype='float32'), cpu)
input_tensor_array.append(t)
self.assertEqual(10, len(input_tensor_array))
random_grad = numpy.random.random_sample([11]).astype(numpy.float32)
y_out = block.create_var(name="Out")
y_out.persistable = True
y_out_index = block.create_var(name="OutIndex")
y_out_index.persistable = True
y_grad_arr = block.create_var(
name='Out@GRAD', dtype='float32', shape=[11])
y_grad_arr.persistable = True
y_grad = scope.var('Out@GRAD')
y_grad_tensor = y_grad.get_tensor()
y_grad_tensor.set(random_grad, cpu)
op = block.append_op(
type=self.op_type,
inputs={"X": input_arr},
outputs={"Out": y_out,
"OutIndex": y_out_index},
attrs=self.attrs)
out_grad = block.create_var(
name="tmp_lod_tensor_array@GRAD",
type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
out_grad.persistable = True
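# manually build and append the grad op for the concat op above, then
# infer var types and shapes so the backward pass can run in this program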
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
set(), [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
for var_name in grad_op_desc.output_arg_names():
block.desc.var(var_name.encode("ascii"))
grad_op_desc.infer_var_type(block.desc)
grad_op_desc.infer_shape(block.desc)
for arg in grad_op_desc.output_arg_names():
grad_var = block.desc.find_var(arg.encode("ascii"))
grad_var.set_dtype(core.VarDesc.VarType.FP32)
fetch_list = []
fetch_list.append(block.var('Out'))
fetch_list.append(block.var('OutIndex'))
exe = fluid.Executor(fluid.CPUPlace())
out = exe.run(program, fetch_list=fetch_list, scope=scope)
#print ("index: ", numpy.array(out[1]))
# test forward
tensor_res = numpy.array(out[0])
tensor_res_out_idx = numpy.array(out[1])
tensor_gt = numpy.array(
[0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float32')
self.assertEqual(len(tensor_res), len(tensor_gt))
self.assertEqual(len(tensor_res_out_idx), 10)
for i in range(len(tensor_res)):
self.assertEqual(tensor_res[i], tensor_gt[i])
for i in range(len(tensor_res_out_idx)):
if i == 0:
self.assertEqual(tensor_res_out_idx[i], 2)
else:
self.assertEqual(tensor_res_out_idx[i], 1)
# test backward
grad_tensor = scope.var('tmp_lod_tensor_array@GRAD')
grad_tensor_array = grad_tensor.get_lod_tensor_array()
self.assertEqual(10, len(grad_tensor_array))
for i in range(len(grad_tensor_array)):
if i == 0:
self.assertEqual(
numpy.array(grad_tensor_array[i])[0],
numpy.array(random_grad[i]))
self.assertEqual(
numpy.array(grad_tensor_array[i])[1],
numpy.array(random_grad[i + 1]))
if i == 1:
self.assertEqual(
numpy.array(grad_tensor_array[i]),
numpy.array(random_grad[i + 1]))
if __name__ == '__main__':
unittest.main()
...@@ -31,18 +31,17 @@ Steps to transpile pserver:
"""
import math
import sys
import numpy as np
import collections
import six
import logging
from .ps_dispatcher import RoundRobin, PSDispatcher
from .. import core, framework, unique_name
from ..framework import Program, default_main_program, \
default_startup_program, Block, \
Parameter, grad_var_name
from .details import *
from ..distribute_lookup_table import find_distributed_lookup_table
from functools import reduce
LOOKUP_TABLE_TYPE = "lookup_table"
...@@ -292,7 +291,8 @@ class DistributeTranspiler(object):
self.optimize_ops, self.params_grads = self._get_optimize_pass()
ps_dispatcher = self.config.split_method(self.pserver_endpoints)
self.table_name = find_distributed_lookup_table(self.origin_program)
self.has_distributed_lookup_table = self.table_name is not None
self.param_name_to_grad_name = dict()
self.grad_name_to_param_name = dict()
for param_var, grad_var in self.params_grads:
...@@ -966,28 +966,6 @@ to transpile() call.")
# ====================== private transpiler functions =====================
def _has_distributed_lookup_table(self):
# process lookup_table_op
# 1. check all lookup_table_op is distributed
# 2. check all lookup_table_op share the same table.
distributed_lookup_table_ops = []
# support only one distributed_lookup_table now
self.table_name = None
for op in self.origin_program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE:
if op.attr('is_distributed') is True:
if self.table_name is None:
self.table_name = op.input("W")[0]
if self.table_name != op.input("W")[0]:
raise RuntimeError("all distributed lookup_table_ops"
" should have only one table")
distributed_lookup_table_ops.append(op)
else:
if self.table_name is not None:
assert op.input("W")[0] != self.table_name
return len(distributed_lookup_table_ops) > 0
def _update_dist_lookup_table_vars(self, param_list, grad_list,
params_grads):
# TODO(wuyi): find a way to put dist lookup table stuff all together.
...@@ -1341,7 +1319,6 @@ to transpile() call.")
"""
create a new block to handle save checkpoint.
"""
import os
pserver_program.global_block().create_var(
name="kLookupTablePath",
......