提交 d231e550 编写于 作者: S sneaxiy

merge develop

test=develop
......@@ -118,9 +118,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon',
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None))
paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
......@@ -178,6 +179,7 @@ paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], vara
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
......@@ -200,6 +202,7 @@ paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'],
paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,))
......
......@@ -18,8 +18,8 @@ namespace framework {
void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
Tensor *out) {
VLOG(3) << "DeviceTransform in, src_place " << in.place()
<< " dst_place: " << dst_place;
VLOG(30) << "DeviceTransform in, src_place " << in.place()
<< " dst_place: " << dst_place;
PADDLE_ENFORCE_NE(
in.place().which(), dst_place.which(),
......
......@@ -49,10 +49,10 @@ class TestOpWithKernel : public OperatorWithKernel {
OpKernelType GetExpectedKernelType(
const ExecutionContext& ctx) const override {
if (Attr<bool>("use_gpu")) {
VLOG(3) << "force use gpu kernel";
VLOG(30) << "force use gpu kernel";
return OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0));
} else {
VLOG(3) << "use default kernel";
VLOG(30) << "use default kernel";
return OpKernelType(proto::VarType::FP32,
ctx.Input<Tensor>("input")->place());
}
......@@ -148,7 +148,7 @@ TEST(Operator, CPUtoGPU) {
// get output
auto* output2 = scope.Var("OUT2");
gpu_op->Run(scope, cuda_place);
VLOG(3) << "after gpu_op run";
VLOG(30) << "after gpu_op run";
// auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
paddle::platform::DeviceContextPool& pool =
......
......@@ -60,7 +60,7 @@ void BroadcastOpHandle::BroadcastOneVar(
PADDLE_ENFORCE_NOT_NULL(in_var);
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
if (UNLIKELY(!in_tensor.IsInitialized())) {
VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
VLOG(30) << "in var " << in_var_handle.name_ << "not inited, return!";
return;
}
......
......@@ -45,8 +45,8 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view);
compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free);
if (is_lock_and_record_event_free) {
VLOG(10) << "Set is_lock_and_record_event_free be true in op "
<< compute_op->DebugString();
VLOG(100) << "Set is_lock_and_record_event_free be true in op "
<< compute_op->DebugString();
}
}
return ir_graph;
......
......@@ -399,7 +399,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
for (size_t i = 0; i < backward_vars.size(); i += 2) {
auto &p_name = backward_vars[i];
auto &g_name = backward_vars[i + 1];
VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
VLOG(100) << "Bcast " << g_name << " for parameter " << p_name;
switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce:
......@@ -809,8 +809,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
VLOG(10) << "send grad " << input_var_names[0] << " origin "
<< send_param_grad[1] << " place: " << op_dev_id;
VLOG(100) << "send grad " << input_var_names[0] << " origin "
<< send_param_grad[1] << " place: " << op_dev_id;
for (auto &varname : input_var_names) {
sharded_var_device->emplace(varname, op_dev_id);
}
......@@ -826,9 +826,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
if (recv_param_grad.size() == 2U) {
op_dev_id =
GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device);
VLOG(10) << "recv param " << recv_param_grad[0]
<< " get grad place: " << recv_param_grad[1]
<< " place: " << op_dev_id;
VLOG(100) << "recv param " << recv_param_grad[0]
<< " get grad place: " << recv_param_grad[1]
<< " place: " << op_dev_id;
} else {
op_dev_id = GetAppropriateDeviceID(output_var_names);
}
......
......@@ -140,8 +140,8 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
if (next_compute_op != nullptr) {
if (compute_ref_cnt_map.count(next_compute_op)) {
compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
VLOG(5) << "Add reference count of " << var_name << " to Operator "
<< next_compute_op->Name();
VLOG(50) << "Add reference count of " << var_name << " to Operator "
<< next_compute_op->Name();
} else {
// Create new reference_count_op_handle
ir::Node *ref_cnt_node = graph->CreateEmptyNode(
......
......@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(10) << place_ << "RUN Scale loss grad op";
VLOG(100) << place_ << "RUN Scale loss grad op";
});
#endif
}
......
......@@ -94,8 +94,8 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
op_node_list[i - 1]->outputs.push_back(dep_var);
dep_var->outputs.push_back(op_node_list[i]);
dep_var->inputs.push_back(op_node_list[i - 1]);
VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
<< " and " << op_node_list[i]->Name();
VLOG(100) << "Add dependencies between " << op_node_list[i - 1]->Name()
<< " and " << op_node_list[i]->Name();
}
return graph;
}
......
......@@ -210,16 +210,16 @@ void ThreadedSSAGraphExecutor::RunOp(
details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] {
try {
if (VLOG_IS_ON(10)) {
VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
if (VLOG_IS_ON(100)) {
VLOG(100) << op << " " << op->Name() << " : " << op->DebugString();
}
if (LIKELY(!strategy_.dry_run_)) {
op->Run(strategy_.use_cuda_);
}
VLOG(10) << op << " " << op->Name() << " Done ";
VLOG(100) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted";
VLOG(100) << op << " " << op->Name() << "Signal posted";
} catch (...) {
exception_holder_.Catch(std::current_exception());
}
......
......@@ -43,7 +43,7 @@ ExecutorPrepareContext::ExecutorPrepareContext(
}
ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext";
VLOG(50) << "destroy ExecutorPrepareContext";
}
template <typename RefCntMap>
......@@ -60,7 +60,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
if ((it->second)-- == 1) {
auto* var = scope.FindVar(name);
if (var != nullptr) {
VLOG(10) << "Erase tensor \'" << name << "\'";
VLOG(100) << "Erase tensor \'" << name << "\'";
if (var->IsType<LoDTensor>()) {
erase_tensors.insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>()) {
......@@ -141,21 +141,21 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
if (var->Persistable()) {
auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
VLOG(30) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
VLOG(30) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : global_block.AllVars()) {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
}
......@@ -286,7 +286,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
int i = 0;
for (auto& feed_target : (*feed_targets)) {
std::string var_name = feed_target.first;
VLOG(3) << "feed target's name: " << var_name;
VLOG(30) << "feed target's name: " << var_name;
// prepend feed op
auto* op = global_block->PrependOp();
......@@ -309,7 +309,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
int i = 0;
for (auto& fetch_target : (*fetch_targets)) {
std::string var_name = fetch_target.first;
VLOG(3) << "fetch target's name: " << var_name;
VLOG(30) << "fetch target's name: " << var_name;
// append fetch op
auto* op = global_block->AppendOp();
......@@ -459,7 +459,7 @@ void Executor::RunPreparedContext(
void Executor::EnableMKLDNN(const ProgramDesc& program) {
#ifdef PADDLE_WITH_MKLDNN
VLOG(3) << "use_mkldnn=True";
VLOG(30) << "use_mkldnn=True";
for (size_t bid = 0; bid < program.Size(); ++bid) {
auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
for (auto* op : block->AllOps()) {
......
......@@ -25,7 +25,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index) {
// If var_name Variable is not found in GlobalScope, a new variable will
// be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
VLOG(30) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
if (index >= feed_inputs.size()) {
......@@ -47,8 +47,8 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
typeid(FeedFetchList).name());
auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
auto& tensor = fetch_outputs[index];
VLOG(3) << "Fetch " << var_name << " with index " << index
<< " shape= " << tensor.dims();
VLOG(30) << "Fetch " << var_name << " with index " << index
<< " shape= " << tensor.dims();
PADDLE_ENFORCE_LT(index, fetch_outputs.size());
return tensor;
}
......
......@@ -147,19 +147,19 @@ void PrepareParameters(Graph* graph, const Param& param) {
scope->Var(param.LSTMX)->GetMutable<LoDTensor>();
scope->Var(param.LSTMOUT)->GetMutable<LoDTensor>();
#define GATE_W(name__) \
auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \
auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \
auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \
CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \
VLOG(4) << #name__ "_w0" \
<< " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_w1" \
<< " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_b0" \
<< " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>(); \
auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>(); \
#define GATE_W(name__) \
auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \
auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \
auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \
CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \
VLOG(40) << #name__ "_w0" \
<< " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
VLOG(40) << #name__ "_w1" \
<< " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
VLOG(40) << #name__ "_b0" \
<< " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>(); \
auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>(); \
auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();
GATE_W(forget);
......@@ -208,7 +208,7 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
int D = W_forget_w0.dims()[0];
int M = W_forget_w1.dims()[0];
out->Resize(make_ddim({D + M, 4 * D}));
VLOG(3) << "LSTMWeight resized to " << out->dims();
VLOG(30) << "LSTMWeight resized to " << out->dims();
float* out_data = out->mutable_data<float>(platform::CPUPlace());
std::array<const float*, 4> tensors(
......
......@@ -57,7 +57,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
int found_conv_bias_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvBias fuse";
VLOG(40) << "handle ConvBias fuse";
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
conv_bias_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp
......@@ -74,7 +74,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
VLOG(3) << "do not perform conv+bias fuse";
VLOG(30) << "do not perform conv+bias fuse";
return;
}
......
......@@ -121,7 +121,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
int found_conv_bn_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvBN fuse";
VLOG(40) << "handle ConvBN fuse";
// conv, batch_norm,
// conv_weight, conv_out,
......@@ -133,7 +133,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
if (fuse_option == DO_NOT_FUSE) {
VLOG(3) << "do not perform conv+bn fuse";
VLOG(30) << "do not perform conv+bn fuse";
return;
}
......@@ -241,7 +241,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
int found_conv_bn_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvBN fuse";
VLOG(40) << "handle ConvBN fuse";
// conv, batch_norm,
// conv_weight, conv_out,
......
......@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
int found_conv_relu_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvReLU fuse";
VLOG(40) << "handle ConvReLU fuse";
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
conv_relu_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp
......@@ -48,7 +48,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
FuseOptions fuse_option = FindFuseOption(*conv, *relu);
if (fuse_option == DO_NOT_FUSE) {
VLOG(3) << "do not perform conv+relu fuse";
VLOG(30) << "do not perform conv+relu fuse";
return;
}
......
......@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
int found_depthwise_conv_mkldnn_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
VLOG(30) << "handle DepthwiseConvMKLDNN fuse";
GET_NODE(depthwise_conv, (*pattern));
depthwise_conv->Op()->SetType("conv2d");
found_depthwise_conv_mkldnn_count++;
......
......@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle FC fuse";
VLOG(40) << "handle FC fuse";
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
......
......@@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(4) << "handle FuseElewiseAddAct fuse";
VLOG(40) << "handle FuseElewiseAddAct fuse";
GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
elewise_add_act_pattern);
......@@ -77,10 +77,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n);
VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
<< ele_add->Name() << " -> " << ele_out_n << "\n"
<< "\t " << ele_out_n << " -> " << act->Name() << " -> "
<< act_out_n;
VLOG(40) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
<< ele_add->Name() << " -> " << ele_out_n << "\n"
<< "\t " << ele_out_n << " -> " << act->Name() << " -> "
<< act_out_n;
ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node);
found_elewise_add_act_count++;
......@@ -113,7 +113,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(4) << "handle FuseElewiseAddAct fuse";
VLOG(40) << "handle FuseElewiseAddAct fuse";
GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
......@@ -129,9 +129,9 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n);
VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
<< "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
<< ele_add->Name() << " -> " << elewise_add_out_n;
VLOG(40) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
<< "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
<< ele_add->Name() << " -> " << elewise_add_out_n;
ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node);
found_elewise_add_act_count++;
......@@ -165,7 +165,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(4) << "handle FuseElewiseAddActGrad1 fuse";
VLOG(40) << "handle FuseElewiseAddActGrad1 fuse";
GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out,
......@@ -208,10 +208,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
auto fused_node = g->CreateOpNode(&desc);
VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
<< act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
<< d_itermediate_out_n << " and " << act_out_n << " -> "
<< ele_add_grad->Name() << " -> " << d_itermediate_out_n;
VLOG(40) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
<< act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
<< d_itermediate_out_n << " and " << act_out_n << " -> "
<< ele_add_grad->Name() << " -> " << d_itermediate_out_n;
ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node);
found_elewise_add_act_count++;
......
......@@ -92,7 +92,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
const ProgramDesc &program) {
VLOG(3) << "block in program:" << program_.Size();
VLOG(30) << "block in program:" << program_.Size();
std::unordered_map<std::string, VarDesc *> all_vars;
// var nodes for each var name, will have multiple versions in SSA
std::map<std::string, std::vector<ir::Node *>> var_nodes;
......@@ -160,7 +160,7 @@ void Graph::ResolveHazard(
auto it_old = versions.rbegin();
++it_old;
for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
VLOG(3) << "deal with var: " << (*it_new)->Name();
VLOG(30) << "deal with var: " << (*it_new)->Name();
ir::Node *write_op =
(*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
const auto &read_ops = (*it_old)->outputs;
......
......@@ -89,7 +89,7 @@ class Graph {
attr_name);
attrs_[attr_name] = attr;
attr_dels_[attr_name] = [attr, attr_name]() {
VLOG(3) << "deleting " << attr_name;
VLOG(30) << "deleting " << attr_name;
delete attr;
};
}
......
......@@ -33,8 +33,9 @@ void SortHelper(
}
}
VLOG(3) << "topology sort insert: " << node->Name()
<< reinterpret_cast<void *>(node) << " input " << node->inputs.size();
VLOG(30) << "topology sort insert: " << node->Name()
<< reinterpret_cast<void *>(node) << " input "
<< node->inputs.size();
ret->push_back(node);
}
......@@ -103,9 +104,9 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
for (auto &var : n->inputs) {
for (auto &adj_n : var->inputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
adj_list[n].insert(adj_n);
}
}
......@@ -163,10 +164,10 @@ size_t GraphNum(const Graph &graph) {
graph_nodes.emplace_back(g_nodes);
}
if (VLOG_IS_ON(10)) {
VLOG(10) << "graph_num: " << graph_nodes.size();
if (VLOG_IS_ON(100)) {
VLOG(100) << "graph_num: " << graph_nodes.size();
for (auto &g_n : graph_nodes) {
VLOG(10) << "graph_nodes: " << g_n.size();
VLOG(100) << "graph_nodes: " << g_n.size();
if (g_n.size() < 10) {
std::stringstream out;
for (auto &node : g_n) {
......@@ -180,7 +181,7 @@ size_t GraphNum(const Graph &graph) {
}
out << "]";
}
VLOG(10) << out.str();
VLOG(100) << out.str();
}
}
}
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <array>
#include <string>
#include <vector>
......@@ -91,19 +92,19 @@ void GraphPatternDetector::operator()(Graph *graph,
PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size());
int id = 0;
for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
VLOG(30) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
VLOG(3) << "mark pdnodes in graph";
VLOG(30) << "mark pdnodes in graph";
if (graph.Nodes().empty()) return false;
for (auto &node : GraphTraits::DFS(graph)) {
for (const auto &pdnode : pattern_.nodes()) {
if (pdnode->Tell(&node)) {
VLOG(4) << "pdnode " << pdnode->name() << " marked";
VLOG(40) << "pdnode " << pdnode->name() << " marked";
pdnodes2nodes_[pdnode.get()].insert(&node);
}
}
......@@ -111,7 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
// Check to early stop if some PDNode can't find matched Node.
for (auto &pdnode : pattern_.nodes()) {
if (!pdnodes2nodes_.count(pdnode.get())) {
VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
VLOG(40) << pdnode->name() << " can't find matched Node, early stop";
// return false;
}
}
......@@ -120,7 +121,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
GetMarkedNodes(const_cast<Graph *>(&graph)).insert(n);
}
}
VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
VLOG(30) << pdnodes2nodes_.size() << " nodes marked";
return !pdnodes2nodes_.empty();
}
......@@ -213,7 +214,7 @@ GraphPatternDetector::DetectPatterns() {
// Extend a PDNode to subgraphs by deducing the connection relations defined
// in edges of PDNodes.
for (const auto &edge : pattern_.edges()) {
VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
VLOG(40) << "check " << edge.first->name() << " -> " << edge.second->name();
// TODO(Superjomn) Fix bug here, the groups might be duplicate here.
// Each role has two PDNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
......@@ -224,7 +225,7 @@ GraphPatternDetector::DetectPatterns() {
// source -> target
for (Node *source : pdnodes2nodes_[edge.first]) {
for (Node *target : pdnodes2nodes_[edge.second]) {
VLOG(8) << "check " << source->id() << " -- " << target->id();
VLOG(80) << "check " << source->id() << " -- " << target->id();
// TODO(Superjomn) add some prune strategies.
for (const auto &group : pre_groups) {
HitGroup new_group = group;
......@@ -240,12 +241,13 @@ GraphPatternDetector::DetectPatterns() {
}
}
}
VLOG(3) << "step " << step << " get records: " << cur_groups.size();
VLOG(30) << "step " << step << " get records: " << cur_groups.size();
for (auto &group : cur_groups) {
for (auto &item : group.roles) {
VLOG(4) << "node " << item.second->id() << " as " << item.first->name();
VLOG(40) << "node " << item.second->id() << " as "
<< item.first->name();
}
VLOG(4) << "=========================================================";
VLOG(40) << "=========================================================";
}
}
......
......@@ -41,7 +41,7 @@ std::string FormatName(const Node* node) {
std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
VLOG(3) << "draw IR graph viz to " << graph_viz_path;
VLOG(30) << "draw IR graph viz to " << graph_viz_path;
std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
PADDLE_ENFORCE(fout->good());
std::ostream& sout = *fout;
......
......@@ -20,7 +20,7 @@ namespace ir {
std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Aplies MKL-DNN placement strategy.";
VLOG(30) << "Aplies MKL-DNN placement strategy.";
for (const Node* n : graph->Nodes()) {
if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
n->Op()->SetAttr("use_mkldnn", true);
......
......@@ -62,7 +62,7 @@ VarDesc UpdateGradVarDesc(
string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat);
VarDesc repeated_var = CopyVarDesc(var_desc);
repeated_var.SetName(new_gname);
VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat;
VLOG(30) << "update " << var_desc->Name() << " to repeat " << repeat;
return repeated_var;
}
return *var_desc;
......@@ -78,7 +78,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
auto origin_nodes = graph->ReleaseNodes();
VLOG(3) << "origin nodes count: " << origin_nodes.size();
VLOG(30) << "origin nodes count: " << origin_nodes.size();
ir::Graph& result = *graph;
// 1. record op nodes of different roles
......@@ -137,8 +137,8 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
"%s.repeat.%d", repeated_op.Input("Variance")[0], i);
bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]);
bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]);
VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to "
<< new_mean_name;
VLOG(30) << "renaming " << repeated_op.Input("Mean")[0] << " to "
<< new_mean_name;
repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name);
repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name);
repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0],
......
......@@ -76,7 +76,7 @@ class Pass {
attr_name);
attrs_[attr_name] = attr;
attr_dels_[attr_name] = [attr, attr_name]() {
VLOG(3) << "deleting " << attr_name;
VLOG(30) << "deleting " << attr_name;
delete attr;
};
}
......
......@@ -12,10 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
#include <set>
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
......@@ -159,10 +162,7 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) {
std::set<std::string> acts({"sigmoid", "tanh", "relu", "identity"});
PDNode* act = pattern->NewNode(
[=](Node* x) {
return x && x->IsOp() && acts.count(x->Op()->Type());
},
[=](Node* x) { return x && x->IsOp() && acts.count(x->Op()->Type()); },
"act");
PDNode* fc_out = pattern->NewNode(
......@@ -196,7 +196,7 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "get one concat pattern";
VLOG(40) << "get one concat pattern";
// fc
GET_NODE(fc_w, detector.pattern());
GET_NODE(fc_bias, detector.pattern());
......
......@@ -60,7 +60,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle SeqConv EltAdd Relu fuse";
VLOG(40) << "handle SeqConv EltAdd Relu fuse";
GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
......
......@@ -31,7 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
TableItem item;
item.index = i;
item.length = vec[i + 1] - vec[i];
VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
VLOG(100) << "Add item to rank table " << item.index << " " << item.length;
items_.emplace_back(item);
}
// NOTE(yuyang18):
......
......@@ -51,7 +51,7 @@ TEST(mixed_vector, InitWithCount) {
TEST(mixed_vector, ForEach) {
vec<int> tmp;
for (auto& v : tmp) {
VLOG(3) << v;
VLOG(30) << v;
}
}
......
......@@ -71,7 +71,7 @@ void NaiveExecutor::Prepare(Scope *parent_scope,
void NaiveExecutor::Run() {
for (auto &op : ops_) {
VLOG(4) << "run " << op->Type();
VLOG(40) << "run " << op->Type();
op->Run(*scope_, place_);
}
}
......@@ -95,21 +95,21 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
if (var->Persistable()) {
auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
VLOG(30) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else { // Create temporary variables in local scope.
auto *ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
VLOG(30) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto &var : global_block.AllVars()) {
auto *ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
}
......
......@@ -82,7 +82,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
if (in_var->GetType() != proto::VarType::LOD_TENSOR) {
VLOG(3) << "input " << in << " is not LodTensor";
VLOG(30) << "input " << in << " is not LodTensor";
return;
}
out_var->SetLoDLevel(in_var->GetLoDLevel());
......@@ -241,32 +241,32 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
const proto::OpProto::Attr &attr = GetProtoAttr(name);
switch (attr.type()) {
case proto::AttrType::BOOLEANS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BOOLEANS";
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BOOLEANS";
this->attrs_[name] = std::vector<bool>();
break;
}
case proto::AttrType::INTS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to INTS";
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to INTS";
this->attrs_[name] = std::vector<int>();
break;
}
case proto::AttrType::FLOATS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to FLOATS";
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to FLOATS";
this->attrs_[name] = std::vector<float>();
break;
}
case proto::AttrType::STRINGS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to STRINGS";
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to STRINGS";
this->attrs_[name] = std::vector<std::string>();
break;
}
case proto::AttrType::BLOCKS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BLOCKS";
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BLOCKS";
this->SetBlocksAttr(name, std::vector<BlockDesc *>());
return;
}
......@@ -499,13 +499,13 @@ void OpDesc::CheckAttrs() {
}
void OpDesc::InferShape(const BlockDesc &block) const {
VLOG(3) << "CompileTime infer shape on " << Type();
VLOG(30) << "CompileTime infer shape on " << Type();
InitInferShapeFuncs();
auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
PADDLE_ENFORCE(static_cast<bool>(infer_shape),
"%s's infer_shape has not been registered", this->Type());
CompileTimeInferShapeContext ctx(*this, block);
if (VLOG_IS_ON(10)) {
if (VLOG_IS_ON(100)) {
std::ostringstream sout;
auto inames = this->InputArgumentNames();
sout << " From [";
......@@ -516,7 +516,7 @@ void OpDesc::InferShape(const BlockDesc &block) const {
std::copy(onames.begin(), onames.end(),
std::ostream_iterator<std::string>(sout, ", "));
sout << "]";
VLOG(10) << sout.str();
VLOG(100) << sout.str();
}
infer_shape(&ctx);
}
......@@ -607,7 +607,7 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto shape = var->GetShape();
res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
VLOG(50) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
return res;
......@@ -624,7 +624,7 @@ std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
}
} catch (...) {
VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
VLOG(50) << "GetRepeatedDim of variable " << name << " error.";
std::rethrow_exception(std::current_exception());
}
return res;
......
......@@ -46,9 +46,9 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
const proto::OpDesc& op_desc) {
VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
"used in unit tests. Use CreateOp(const OpDesc& op_desc) "
"instead.";
VLOG(10) << "CreateOp directly from OpDesc is deprecated. It should only be"
"used in unit tests. Use CreateOp(const OpDesc& op_desc) "
"instead.";
VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
AttributeMap attrs;
......
......@@ -140,7 +140,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(4) << place << " " << DebugStringEx(&scope);
VLOG(40) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place);
......@@ -160,7 +160,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
} else {
RunImpl(scope, place);
}
VLOG(3) << place << " " << DebugStringEx(&scope);
VLOG(30) << place << " " << DebugStringEx(&scope);
}
bool OperatorBase::HasInputs(const std::string& name) const {
......@@ -259,6 +259,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
if (row_size >= 0) {
ss << "[row_size=" << row_size << "]";
}
std::string dtype = GetDtype(*scope, output.second[i]);
ss << ":" << dtype;
ss << "[" << GetDims(*scope, var_name, true) << "]";
ss << "(" << GetLoD(*scope, var_name) << ")";
}
......@@ -715,14 +717,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto expected_kernel_key =
this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
VLOG(30) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
VLOG(30) << "missing MKLDNN kernel: fallbacking to PLAIN one";
expected_kernel_key.library_type_ = LibraryType::kPlain;
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
......@@ -774,7 +776,8 @@ void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const {
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
VLOG(30) << "share inplace var " + var_name +
" back to it's original scope";
auto* original_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name));
auto* var = transfer_scope.FindVar(var_name);
......@@ -815,8 +818,8 @@ Scope* OperatorWithKernel::TryTransferData(
transfered_inplace_vars->emplace_back(var_name);
}
VLOG(3) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
VLOG(30) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
if (new_scope == nullptr) {
new_scope = &scope.NewScope();
......
......@@ -199,7 +199,7 @@ void ParallelExecutor::BCastParamsToDevices(
auto &main_tensor = main_var->Get<LoDTensor>();
if (!main_tensor.IsInitialized()) {
VLOG(3) << "one in var not inited, return!";
VLOG(30) << "one in var not inited, return!";
continue;
}
auto &dims = main_tensor.dims();
......
......@@ -149,7 +149,7 @@ Variable* Scope::VarInternal(const std::string& name) {
v = new Variable();
vars_[name].reset(v);
VLOG(3) << "Create variable " << name;
VLOG(30) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first);
return v;
}
......
......@@ -176,7 +176,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
PADDLE_ENFORCE(value->IsInitialized(),
"The value tensor should be initialized.");
if (ids.numel() == 0) {
VLOG(3) << "keys is empty, please check data!";
VLOG(30) << "keys is empty, please check data!";
} else {
int64_t value_width = value_->numel() / value_->dims()[0];
PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
......
......@@ -23,8 +23,8 @@ namespace framework {
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst) {
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
VLOG(30) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
src.check_memory_size();
dst->Resize(src.dims());
......@@ -38,8 +38,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
VLOG(30) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......@@ -78,8 +78,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
VLOG(30) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
......@@ -115,8 +115,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
<< " to " << dst_place;
VLOG(30) << "TensorCopySync " << src.dims() << " from " << src.place()
<< " to " << dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
......@@ -126,8 +126,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
VLOG(30) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......@@ -147,8 +147,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
VLOG(30) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
......
......@@ -39,7 +39,7 @@ void ThreadPool::Init() {
int num_threads = std::thread::hardware_concurrency();
if (FLAGS_dist_threadpool_size > 0) {
num_threads = FLAGS_dist_threadpool_size;
VLOG(1) << "set dist_threadpool_size to " << num_threads;
VLOG(10) << "set dist_threadpool_size to " << num_threads;
}
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool_.reset(new ThreadPool(num_threads));
......
......@@ -61,10 +61,10 @@ size_t VarDesc::GetTensorDescNum() const {
void VarDesc::SetShapes(
const std::vector<std::vector<int64_t>> &multiple_dims) {
if (multiple_dims.size() != GetTensorDescNum()) {
VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
VLOG(30) << "WARNING: The number of given shapes(" << multiple_dims.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
SetTensorDescNum(multiple_dims.size());
}
std::vector<proto::VarType::TensorDesc *> tensors = mutable_tensor_descs();
......@@ -94,11 +94,11 @@ void VarDesc::SetDataType(proto::VarType::Type data_type) {
void VarDesc::SetDataTypes(
const std::vector<proto::VarType::Type> &multiple_data_type) {
if (multiple_data_type.size() != GetTensorDescNum()) {
VLOG(3) << "WARNING: The number of given data types("
<< multiple_data_type.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
VLOG(30) << "WARNING: The number of given data types("
<< multiple_data_type.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
SetTensorDescNum(multiple_data_type.size());
}
std::vector<proto::VarType::TensorDesc *> tensor_descs =
......@@ -139,11 +139,11 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
if (multiple_lod_level.size() != GetTensorDescNum()) {
VLOG(3) << "WARNING: The number of given lod_levels("
<< multiple_lod_level.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
VLOG(30) << "WARNING: The number of given lod_levels("
<< multiple_lod_level.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
SetTensorDescNum(multiple_lod_level.size());
}
switch (desc_.type().type()) {
......
......@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/type_defs.h"
namespace paddle {
......@@ -24,5 +27,27 @@ class VarTypeInference {
virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
};
class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const final {
auto in_out_var_names = this->GetInputOutputWithSameType();
for (auto& i_o_n : in_out_var_names) {
auto& x_name = op_desc.Input(i_o_n.first).at(0);
auto& out_name = op_desc.Output(i_o_n.second).at(0);
auto& x = block->FindRecursiveOrCreateVar(x_name);
auto& out = block->FindRecursiveOrCreateVar(out_name);
out.SetType(x.GetType());
out.SetDataType(x.GetDataType());
}
}
protected:
virtual std::unordered_map<std::string, std::string>
GetInputOutputWithSameType() const = 0;
};
} // namespace framework
} // namespace paddle
......@@ -60,7 +60,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
private:
void AddPass(const std::string& name, AnalysisPass* pass) {
VLOG(3) << "Adding pass " << name;
VLOG(30) << "Adding pass " << name;
Register(name, pass);
AddGraphvizDebugerPass(pass);
}
......@@ -104,7 +104,7 @@ void Analyzer::Run(Argument* argument) {
passes.push_back("graph_viz_pass"); // add graphviz for debug.
#ifdef PADDLE_WITH_MKLDNN
if (use_mkldnn_) {
VLOG(3) << "Adding MKL-DNN placement pass";
VLOG(30) << "Adding MKL-DNN placement pass";
passes.push_back("mkldnn_placement_pass");
}
#endif
......@@ -113,7 +113,9 @@ void Analyzer::Run(Argument* argument) {
passes.push_back("infer_clean_graph_pass");
passes.push_back("graph_viz_pass"); // add graphviz for debug.
for (auto& pass : ir_passes_) {
if (!disabled_ir_passes_.count(pass)) {
// skip mkldnn pass when use_mkldnn_ = false;
bool skip_pass = (!use_mkldnn_) && pass.find("mkldnn") != std::string::npos;
if (!disabled_ir_passes_.count(pass) && !skip_pass) {
passes.push_back(pass);
passes.push_back("graph_viz_pass"); // add graphviz for debug.
}
......
......@@ -68,8 +68,8 @@ struct Argument {
key);
attrs_[key] = data;
attr_deleters_[key] = [data, key]() {
VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
VLOG(3) << "argument delete attr: " << key;
VLOG(30) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
VLOG(30) << "argument delete attr: " << key;
delete data;
};
}
......
......@@ -132,7 +132,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
Node *x{nullptr};
if (ir_node->IsOp()) {
PADDLE_ENFORCE(ir_node->Op());
VLOG(4) << "get op " << ir_node << " " << ir_node->Name();
VLOG(40) << "get op " << ir_node << " " << ir_node->Name();
x = nodes.Create(Node::Type::kFunction);
x->attr("ir_node").Pointer() = ir_node;
PADDLE_ENFORCE(ir_node->Op()->Proto());
......@@ -141,7 +141,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
} else if (ir_node->IsVar()) {
// Not create a Node for IR ControlDepVar, considering Inference currently
// just used in single thread scenerio.
VLOG(4) << "get var " << ir_node->Name();
VLOG(40) << "get var " << ir_node->Name();
x = nodes.Create(Node::Type::kValue);
x->attr("ir_node").Pointer() = ir_node;
x->SetName(ir_node->Name());
......@@ -151,9 +151,9 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
}
ir_node_map.emplace(ir_node, x);
}
VLOG(4) << "finish creating Nodes";
VLOG(40) << "finish creating Nodes";
VLOG(4) << "to create edge";
VLOG(40) << "to create edge";
// Create links
for (auto *ir_node : graph.Nodes()) {
auto it = ir_node_map.find(ir_node);
......@@ -175,7 +175,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
"Can't deduce any inputs from the graph, Is the graph empty?");
ir_graph = &graph;
VLOG(3) << "finished build from IR";
VLOG(30) << "finished build from IR";
}
void DataFlowGraph::Clean() {
......
......@@ -239,9 +239,10 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
framework::BlockDesc block_desc(nullptr, &proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
VLOG(4) << "origin variable size: "
<< argument_->origin_program_desc->blocks(0).vars().size();
VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size();
VLOG(40) << "origin variable size: "
<< argument_->origin_program_desc->blocks(0).vars().size();
VLOG(40) << "transformed variable size: "
<< block_desc.Proto()->vars().size();
// copy ops.
for (auto *node : block_node->subgraph) {
......
......@@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
std::string message;
VLOG(3) << "draw to " << png_path;
VLOG(30) << "draw to " << png_path;
ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
}
......
......@@ -29,7 +29,7 @@ void FluidToIrPass::EnableParamModify(const std::string &model_dir,
PADDLE_ENFORCE(argument_);
argument_->Set(framework::ir::kParamScopeAttr, new framework::Scope);
// Load parameters.
VLOG(3) << "Loading parameters from " << model_dir;
VLOG(30) << "Loading parameters from " << model_dir;
LoadParams(&argument_->Get<framework::Scope>(framework::ir::kParamScopeAttr),
model_dir, prog_file, param_file);
}
......
......@@ -35,21 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) {
std::stringstream ss;
// NOTE these commands only works on linux.
ss << "mkdir -p " << *argument_->model_output_store_path;
VLOG(3) << "run command: " << ss.str();
VLOG(30) << "run command: " << ss.str();
PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
ss.str("");
ss << "cp " << *argument_->fluid_model_dir << "/*"
<< " " << *argument_->model_output_store_path;
VLOG(3) << "run command: " << ss.str();
VLOG(30) << "run command: " << ss.str();
PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
// Store program
PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
"program desc is not transformed, should call "
"DataFlowGraphToFluidPass first.");
VLOG(3) << "store analyzed program to "
<< *argument_->model_output_store_path;
VLOG(30) << "store analyzed program to "
<< *argument_->model_output_store_path;
const std::string program_output_path =
*argument_->model_output_store_path + "/__model__";
std::ofstream file(program_output_path, std::ios::binary);
......
......@@ -23,7 +23,7 @@ namespace analysis {
bool PassManager::Initialize(Argument* argument) {
argument_ = argument;
for (auto& pass : data_) {
VLOG(3) << "Initializing pass [" << pass->repr() << "]";
VLOG(30) << "Initializing pass [" << pass->repr() << "]";
if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false;
......@@ -34,7 +34,7 @@ bool PassManager::Initialize(Argument* argument) {
void DfgPassManager::RunAll() {
PADDLE_ENFORCE(argument_);
VLOG(3) << "Total " << data_.size() << " Analysys passes";
VLOG(30) << "Total " << data_.size() << " Analysys passes";
for (auto& pass : data_) {
string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]",
pass->repr());
......
......@@ -232,7 +232,7 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
BriefNode *brief_node = itr.second;
if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
VLOG(40) << brief_node->node->id() << " node not a trt candicate.";
continue;
}
......
......@@ -25,9 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
VLOG(4) << "debug info "
<< graph->HumanReadableInfo(false /*show_values*/,
true /*show_functions*/);
VLOG(40) << "debug info "
<< graph->HumanReadableInfo(false /*show_values*/,
true /*show_functions*/);
}
} // namespace analysis
......
......@@ -38,7 +38,7 @@ using contrib::AnalysisConfig;
bool AnalysisPredictor::Init(
const std::shared_ptr<framework::Scope> &parent_scope,
const std::shared_ptr<framework::ProgramDesc> &program) {
VLOG(3) << "Predictor::init()";
VLOG(30) << "Predictor::init()";
#if !defined(_WIN32)
if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance";
......@@ -89,7 +89,7 @@ bool AnalysisPredictor::Init(
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
VLOG(3) << "Predictor::predict";
VLOG(30) << "Predictor::predict";
inference::Timer timer;
timer.tic();
// set feed variable
......@@ -109,7 +109,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG(ERROR) << "fail to get fetches";
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
VLOG(30) << "predict cost: " << timer.toc() << "ms";
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
......@@ -119,7 +119,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::set_feed";
VLOG(30) << "Predictor::set_feed";
if (inputs.size() != feeds_.size()) {
LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
<< inputs.size();
......@@ -184,7 +184,7 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::get_fetch";
VLOG(30) << "Predictor::get_fetch";
outputs->resize(fetchs_.size());
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
......@@ -246,7 +246,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
CHECK(argument_.transformed_program_desc);
VLOG(5) << "to prepare executor";
VLOG(50) << "to prepare executor";
inference_program_.reset(
new framework::ProgramDesc(*argument_.transformed_program_desc));
if (argument_.Has(framework::ir::kParamScopeAttr)) {
......@@ -260,7 +260,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig";
VLOG(30) << "create AnalysisConfig";
if (config.use_gpu) {
// 1. GPU memeroy
PADDLE_ENFORCE_GT(
......@@ -274,7 +274,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
VLOG(30) << "set flag: " << flag;
framework::InitGflags(flags);
}
}
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
......
......@@ -16,7 +16,6 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle_inference_api.h"
namespace paddle {
......
......@@ -157,7 +157,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG(ERROR) << "fail to get fetches";
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
VLOG(30) << "predict cost: " << timer.toc() << "ms";
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
......
......@@ -34,7 +34,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true;
VLOG(3) << "Predictor::init()";
VLOG(30) << "Predictor::init()";
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
......@@ -70,7 +70,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables";
VLOG(50) << "to create variables";
executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
......@@ -114,9 +114,9 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument);
CHECK(argument.transformed_program_desc);
VLOG(5) << "transformed program:\n"
<< argument.transformed_program_desc->SerializeAsString();
VLOG(5) << "to prepare executor";
VLOG(50) << "transformed program:\n"
<< argument.transformed_program_desc->SerializeAsString();
VLOG(50) << "to prepare executor";
inference_program_.reset(
new framework::ProgramDesc(*argument.transformed_program_desc));
}
......@@ -129,7 +129,7 @@ template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
const MixedRTConfig& config) {
VLOG(3) << "create TensorRTSubgraphPredictor";
VLOG(30) << "create TensorRTSubgraphPredictor";
if (config.use_gpu) {
// 1. GPU memeroy
PADDLE_ENFORCE_GT(
......@@ -143,7 +143,7 @@ CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
VLOG(30) << "set flag: " << flag;
framework::InitGflags(flags);
}
}
......
......@@ -45,7 +45,7 @@ void Main() {
config.fraction_of_gpu_memory = 0.1; // set by yourself
predictor = CreatePaddlePredictor<paddle::contrib::MixedRTConfig>(config);
VLOG(3) << "begin to process data";
VLOG(30) << "begin to process data";
// Just a single batch of data.
std::string line;
std::ifstream file(FLAGS_data);
......@@ -60,13 +60,13 @@ void Main() {
PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
input.dtype = PaddleDType::FLOAT32;
VLOG(3) << "run executor";
VLOG(30) << "run executor";
std::vector<PaddleTensor> output;
predictor->Run({input}, &output, 1);
VLOG(3) << "output.size " << output.size();
VLOG(30) << "output.size " << output.size();
auto& tensor = output.front();
VLOG(3) << "output: " << SummaryTensor(tensor);
VLOG(30) << "output: " << SummaryTensor(tensor);
// compare with reference result
CheckOutput(FLAGS_refer, tensor);
......
......@@ -47,7 +47,7 @@ static void split(const std::string& str, char sep,
}
Record ProcessALine(const std::string& line) {
VLOG(3) << "process a line";
VLOG(30) << "process a line";
std::vector<std::string> columns;
split(line, '\t', &columns);
CHECK_EQ(columns.size(), 2UL)
......@@ -65,8 +65,8 @@ Record ProcessALine(const std::string& line) {
for (auto& s : shape_strs) {
record.shape.push_back(std::stoi(s));
}
VLOG(3) << "data size " << record.data.size();
VLOG(3) << "data shape size " << record.shape.size();
VLOG(30) << "data size " << record.data.size();
VLOG(30) << "data shape size " << record.shape.size();
return record;
}
......@@ -78,8 +78,8 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
file.close();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
VLOG(3) << "predictor output numel " << numel;
VLOG(3) << "reference output numel " << refer.data.size();
VLOG(30) << "predictor output numel " << numel;
VLOG(30) << "reference output numel " << refer.data.size();
CHECK_EQ(numel, refer.data.size());
switch (output.dtype) {
case PaddleDType::INT64: {
......
......@@ -49,11 +49,11 @@ void Main(bool use_gpu) {
config.fraction_of_gpu_memory = 0.1; // set by yourself
}
VLOG(3) << "init predictor";
VLOG(30) << "init predictor";
predictor = CreatePaddlePredictor<NativeConfig>(config);
analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
VLOG(3) << "begin to process data";
VLOG(30) << "begin to process data";
// Just a single batch of data.
std::string line;
std::ifstream file(FLAGS_data);
......@@ -68,13 +68,13 @@ void Main(bool use_gpu) {
PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
input.dtype = PaddleDType::FLOAT32;
VLOG(3) << "run executor";
VLOG(30) << "run executor";
std::vector<PaddleTensor> output, analysis_output;
predictor->Run({input}, &output, 1);
VLOG(3) << "output.size " << output.size();
VLOG(30) << "output.size " << output.size();
auto& tensor = output.front();
VLOG(3) << "output: " << SummaryTensor(tensor);
VLOG(30) << "output: " << SummaryTensor(tensor);
// compare with reference result
CheckOutput(FLAGS_refer, tensor);
......
......@@ -26,7 +26,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
// parameter.
if (var_name == "feed" || var_name == "fetch") continue;
if (var->Type() == typeid(framework::LoDTensorArray)) {
VLOG(4) << "collect " << var_name;
VLOG(40) << "collect " << var_name;
arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
}
}
......@@ -34,7 +34,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
CollectTensorArrays(kid);
}
VLOG(3) << "Collect " << arrays_.size() << " arrays";
VLOG(30) << "Collect " << arrays_.size() << " arrays";
flag_ = false;
}
}
......
......@@ -16,13 +16,14 @@
#include <glog/logging.h>
#include <sys/time.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
#include "paddle_inference_api.h"
namespace paddle {
namespace inference {
......
......@@ -59,7 +59,8 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
bool IsPersistable(const framework::VarDesc* var) {
if (var->Persistable() &&
var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
var->GetType() != framework::proto::VarType::FETCH_LIST) {
var->GetType() != framework::proto::VarType::FETCH_LIST &&
var->GetType() != framework::proto::VarType::RAW) {
return true;
}
return false;
......@@ -77,7 +78,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
for (auto* var : global_block.AllVars()) {
if (IsPersistable(var)) {
VLOG(3) << "persistable variable's name: " << var->Name();
VLOG(30) << "persistable variable's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->GetShape());
......@@ -120,7 +121,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
std::string program_desc_str;
VLOG(3) << "loading model from " << model_filename;
VLOG(30) << "loading model from " << model_filename;
ReadBinaryFile(model_filename, &program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program(
......
......@@ -25,7 +25,7 @@ class ConcatOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
VLOG(40) << "convert a fluid mul op to tensorrt mul layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
......
......@@ -25,7 +25,7 @@ class DropoutOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a fluid dropout op to tensorrt dropout layer";
VLOG(40) << "convert a fluid dropout op to tensorrt dropout layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
......
......@@ -52,7 +52,7 @@ class FcOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
VLOG(40) << "convert a fluid fc op to tensorrt fc layer without bias";
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
......
......@@ -25,7 +25,7 @@ class MulOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
VLOG(40) << "convert a fluid mul op to tensorrt mul layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
......
......@@ -25,7 +25,7 @@ class PadOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a fluid transpose op to tensorrt tranpose layer";
VLOG(40) << "convert a fluid transpose op to tensorrt tranpose layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
......
......@@ -25,7 +25,7 @@ class Pool2dOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4)
VLOG(40)
<< "convert a fluid pool2d op to tensorrt pool2d layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
......
......@@ -25,7 +25,7 @@ class SoftMaxOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4)
VLOG(40)
<< "convert a fluid softmax op to tensorrt softmax layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
......
......@@ -134,7 +134,7 @@ class TensorRTEngine : public EngineBase {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map;
// TODO: (NHZLX)
// TODO(NHZLX)
// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv
......
......@@ -27,7 +27,7 @@ struct Record {
};
Record ProcessALine(const std::string &line) {
VLOG(3) << "process a line";
VLOG(30) << "process a line";
std::vector<std::string> columns;
split(line, '\t', &columns);
CHECK_EQ(columns.size(), 2UL)
......@@ -45,8 +45,8 @@ Record ProcessALine(const std::string &line) {
for (auto &s : shape_strs) {
record.shape.push_back(std::stoi(s));
}
VLOG(3) << "data size " << record.data.size();
VLOG(3) << "data shape size " << record.shape.size();
VLOG(30) << "data size " << record.data.size();
VLOG(30) << "data shape size " << record.shape.size();
return record;
}
......
......@@ -32,11 +32,11 @@ BuddyAllocator::BuddyAllocator(
system_allocator_(std::move(system_allocator)) {}
BuddyAllocator::~BuddyAllocator() {
VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these "
"have actually been freed";
VLOG(100) << "BuddyAllocator Disconstructor makes sure that all of these "
"have actually been freed";
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
VLOG(100) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
......@@ -57,12 +57,12 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// acquire the allocator lock
std::lock_guard<std::mutex> lock(mutex_);
VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size "
<< size;
VLOG(100) << "Allocate " << unaligned_size << " bytes from chunk size "
<< size;
// if the allocation is huge, send directly to the system allocator
if (size > max_chunk_size_) {
VLOG(10) << "Allocate from system allocator.";
VLOG(100) << "Allocate from system allocator.";
return SystemAlloc(size);
}
......@@ -77,9 +77,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
return nullptr;
}
} else {
VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
<< " at address "
<< reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
VLOG(100) << "Allocation from existing memory block " << std::get<2>(*it)
<< " at address "
<< reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
}
total_used_ += size;
......@@ -96,10 +96,10 @@ void BuddyAllocator::Free(void* p) {
// Acquire the allocator lock
std::lock_guard<std::mutex> lock(mutex_);
VLOG(10) << "Free from address " << block;
VLOG(100) << "Free from address " << block;
if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
VLOG(10) << "Free directly from system allocator";
VLOG(100) << "Free directly from system allocator";
system_allocator_->Free(block, block->total_size(cache_),
block->index(cache_));
......@@ -116,8 +116,8 @@ void BuddyAllocator::Free(void* p) {
// Trying to merge the right buddy
if (block->has_right_buddy(cache_)) {
VLOG(10) << "Merging this block " << block << " with its right buddy "
<< block->right_buddy(cache_);
VLOG(100) << "Merging this block " << block << " with its right buddy "
<< block->right_buddy(cache_);
auto right_buddy = block->right_buddy(cache_);
......@@ -134,8 +134,8 @@ void BuddyAllocator::Free(void* p) {
// Trying to merge the left buddy
if (block->has_left_buddy(cache_)) {
VLOG(10) << "Merging this block " << block << " with its left buddy "
<< block->left_buddy(cache_);
VLOG(100) << "Merging this block " << block << " with its left buddy "
<< block->left_buddy(cache_);
auto left_buddy = block->left_buddy(cache_);
......@@ -151,8 +151,8 @@ void BuddyAllocator::Free(void* p) {
}
// Dumping this block into pool
VLOG(10) << "Inserting free block (" << block << ", "
<< block->total_size(cache_) << ")";
VLOG(100) << "Inserting free block (" << block << ", "
<< block->total_size(cache_) << ")";
pool_.insert(
IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
......@@ -174,7 +174,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0;
void* p = system_allocator_->Alloc(&index, size);
VLOG(10) << "Allocated " << p << " from system allocator.";
VLOG(100) << "Allocated " << p << " from system allocator.";
if (p == nullptr) return nullptr;
......@@ -200,8 +200,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
if (p == nullptr) return pool_.end();
VLOG(10) << "Creating and inserting new block " << p
<< " from system allocator";
VLOG(100) << "Creating and inserting new block " << p
<< " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr);
......@@ -245,19 +245,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
pool_.erase(it);
VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
<< ") into";
VLOG(100) << "Split block (" << block << ", " << block->total_size(cache_)
<< ") into";
block->split(&cache_, size);
VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
<< ")";
VLOG(100) << "Left block (" << block << ", " << block->total_size(cache_)
<< ")";
block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);
// the rest of memory if exist
if (block->has_right_buddy(cache_)) {
if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", "
<< block->right_buddy(cache_)->total_size(cache_) << ")";
VLOG(100) << "Insert right block (" << block->right_buddy(cache_) << ", "
<< block->right_buddy(cache_)->total_size(cache_) << ")";
pool_.insert(
IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
......@@ -284,7 +284,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
return;
}
VLOG(10) << "Return block " << block << " to fallback allocator.";
VLOG(100) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
......@@ -320,7 +320,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
VLOG(10) << "Return block " << block << " to base allocator.";
VLOG(100) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
......
......@@ -29,7 +29,7 @@ MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
return existing_desc->second;
} else {
auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
VLOG(100) << "Load MemoryBlock::Desc type=" << desc->type;
PADDLE_ASSERT(desc->check_guards());
return *reinterpret_cast<const MemoryBlock::Desc*>(block);
}
......
......@@ -78,7 +78,7 @@ void* Alloc<platform::CPUPlace>(const platform::CPUPlace& place, size_t size) {
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p;
VLOG(100) << " pointer=" << p;
return p;
}
......@@ -111,12 +111,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
VLOG(100) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
});
......
......@@ -317,6 +317,7 @@ op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
op_library(tensor_array_to_tensor_op DEPS concat_op)
op_library(concat_op DEPS concat_and_split)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
......
......@@ -91,16 +91,12 @@ class ActivationOp : public framework::OperatorWithKernel {
}
};
class ActivationOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto x_name = op_desc.Input("X")[0];
auto out_name = op_desc.Output("Out")[0];
auto& x = block->FindRecursiveOrCreateVar(x_name);
auto& out = block->FindRecursiveOrCreateVar(out_name);
out.SetType(x.GetType());
out.SetDataType(x.GetDataType());
class ActivationOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
......
......@@ -95,7 +95,7 @@ class ActivationGradKernel
auto x = framework::EigenVector<T>::Flatten(*X);
functor(*place, x, out, dout, dx);
} else {
VLOG(10) << " Inplace activation ";
VLOG(100) << " Inplace activation ";
auto x = framework::EigenVector<T>::Flatten(*dX);
functor(*place, x, out, dout, dx);
}
......
......@@ -297,7 +297,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto& grad =
Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
if (grad.rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
VLOG(30) << "grad row size is 0!!";
return;
}
......
......@@ -66,9 +66,10 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
for (int j = 0; j < max_length; ++j) {
for (int k = 0; k < half_size; ++k) {
const double val = (half_size > 1)
? j / pow(10000.0, double(k) / (half_size - 1))
: j / 10000.0;
const double val =
(half_size > 1)
? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
: j / 10000.0;
dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
dst_ptr[half_size + k] =
src_ptr[half_size + k] * alpha + cos(val) * beta;
......
......@@ -49,7 +49,7 @@ class ArrayOp : public framework::OperatorBase {
} else {
offset = static_cast<size_t>(*i_tensor.data<int64_t>());
}
VLOG(10) << " Offset = " << offset;
VLOG(100) << " Offset = " << offset;
return offset;
}
};
......
......@@ -148,8 +148,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second;
VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
<< ", " << end_offset << "]";
VLOG(100) << "idx=" << idx << " x_idx=" << x_idx << " ["
<< ", " << end_offset << "]";
// Copy data
PADDLE_ENFORCE_GE(end_offset, start_offset);
size_t len = end_offset - start_offset;
......
......@@ -170,6 +170,15 @@ The required data format for this layer is one of the following:
}
};
class BatchNormOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
}
};
template <typename T>
class BatchNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
......@@ -525,7 +534,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
ops::BatchNormGradMaker);
ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
REGISTER_OP_CPU_KERNEL(
......
......@@ -96,7 +96,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
mode_ = CUDNN_BATCHNORM_SPATIAL;
#endif
VLOG(3) << "Setting descriptors.";
VLOG(30) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
if (data_layout == DataLayout::kNCHW) {
......
......@@ -33,11 +33,11 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:";
VLOG(30) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset:" << i;
VLOG(30) << "offset:" << i;
for (auto &item : selected_items[i]) {
VLOG(3) << ItemToString(item);
VLOG(30) << ItemToString(item);
}
}
......@@ -138,11 +138,11 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
}
result.emplace_back(items);
}
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
VLOG(30) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) {
VLOG(3) << "item set:";
VLOG(30) << "item set:";
for (auto &item : items) {
VLOG(3) << ItemToString(item);
VLOG(30) << ItemToString(item);
}
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class BilinearInterpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input_t = ctx.Input<Tensor>("X"); // float tensor
auto* output_t = ctx.Output<Tensor>("Out"); // float tensor
auto out_dims = output_t->dims();
auto* input = input_t->data<T>();
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_size_t = ctx.Input<Tensor>("OutSize");
if (out_size_t != nullptr) {
auto out_size_data = out_size_t->data<int>();
out_h = out_size_data[0];
out_w = out_size_data[1];
}
auto* output = output_t->mutable_data<T>(
{out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
int batch_size = input_t->dims()[0];
int channels = input_t->dims()[1];
int in_h = input_t->dims()[2];
int in_w = input_t->dims()[3];
int in_hw = in_h * in_w;
int out_hw = out_h * out_w;
int in_chw = channels * in_hw;
int out_chw = channels * out_hw;
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) {
memcpy(output, input, input_t->numel() * sizeof(T));
} else {
for (int k = 0; k < batch_size; ++k) { // loop for batches
for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0;
float h1lambda = ratio_h * i - h;
float h2lambda = 1.f - h1lambda;
for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0;
float w1lambda = ratio_w * j - w;
float w2lambda = 1.f - w1lambda;
// calculate four position for bilinear interpolation
const T* in_pos = &input[k * in_chw + h * in_w + w];
T* out_pos = &output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels
// bilinear interpolation
out_pos[0] = static_cast<T>(
h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
h1lambda * (w2lambda * in_pos[hid * in_w] +
w1lambda * in_pos[hid * in_w + wid]));
in_pos += in_hw;
out_pos += out_hw;
}
}
}
}
}
}
};
template <typename T>
class BilinearInterpGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* d_output = d_output_t->data<T>();
auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
auto& device_ctx =
ctx.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> zero;
zero(device_ctx, d_input_t, static_cast<T>(0.0));
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_size_t = ctx.Input<Tensor>("OutSize");
if (out_size_t != nullptr) {
auto out_size_data = out_size_t->data<int>();
out_h = out_size_data[0];
out_w = out_size_data[1];
}
int batch_size = d_input_t->dims()[0];
int channels = d_input_t->dims()[1];
int in_h = d_input_t->dims()[2];
int in_w = d_input_t->dims()[3];
int in_hw = in_h * in_w;
int out_hw = out_h * out_w;
int in_chw = channels * in_hw;
int out_chw = channels * out_hw;
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) {
memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
} else {
for (int k = 0; k < batch_size; ++k) { // loop for batches
for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0;
float h1lambda = ratio_h * i - h;
float h2lambda = 1 - h1lambda;
for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0;
float w1lambda = ratio_w * j - w;
float w2lambda = 1 - w1lambda;
T* in_pos = &d_input[k * in_chw + h * in_w + w];
const T* out_pos = &d_output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels
in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
in_pos[hid * in_w] +=
static_cast<T>(h1lambda * w2lambda * out_pos[0]);
in_pos[hid * in_w + wid] +=
static_cast<T>(h1lambda * w1lambda * out_pos[0]);
in_pos += in_hw;
out_pos += out_hw;
}
}
}
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -46,8 +46,8 @@ class CheckpointNotifyOp : public framework::OperatorBase {
auto lookup_table_save_dir =
string::Sprintf("%s/%s_%d", dir, lookup_table_name, i);
rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir);
VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
<< " and dir:" << dir << " to " << epmap[i];
VLOG(30) << "checkpoint notify sending lookup table: "
<< lookup_table_name << " and dir:" << dir << " to " << epmap[i];
}
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
}
......
......@@ -37,7 +37,7 @@ class ConcatOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
if (n == 1) {
VLOG(3) << "Warning: concat op have only one input, may waste memory";
VLOG(30) << "Warning: concat op have only one input, may waste memory";
}
auto out_dims = ins[0];
......
......@@ -15,15 +15,22 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(cudnn_deterministic, false,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"true, the algorithm is deterministic.");
DEFINE_uint64(conv_workspace_size_limit, 4096,
"cuDNN convolution workspace limit in MB unit.");
DEFINE_bool(cudnn_exhaustive_search, false,
"Whether enable exhaustive search for cuDNN convolution or "
"not, defalut is False.");
namespace paddle {
namespace operators {
......@@ -36,13 +43,25 @@ using DataLayout = platform::DataLayout;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024;
static constexpr size_t kNUM_CUDNN_FWD_ALGS =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("Input");
......@@ -55,6 +74,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
int groups = ctx.Attr<int>("groups");
int64_t user_workspace_size =
static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
......@@ -120,19 +141,19 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv workspace ---------------------
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
int64_t max_user_size =
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
user_workspace_size);
workspace_size_limit = max_user_size * 1024 * 1024;
}
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
bool half_float = false;
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
......@@ -143,14 +164,66 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
VLOG(5) << "use cudnn_tensor_op_math";
half_float = true;
VLOG(50) << "use cudnn_tensor_op_math";
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
VLOG(5) << "NOT use cudnn_tensor_op_math";
VLOG(50) << "NOT use cudnn_tensor_op_math";
}
#endif
auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
if ((!exhaustive_search) && (!half_float)) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo;
} else if (exhaustive_search && (!half_float)) {
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
algo = algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc,
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace,
workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return fwd_perf_stat[0].algo;
});
VLOG(3) << "choose algo " << algo;
} else {
PADDLE_ENFORCE(half_float,
"cuDNN exhaustive search doesn't support half float.");
}
// get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
......@@ -162,7 +235,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
for (int i = 0; i < groups; i++) {
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
......@@ -180,6 +252,7 @@ template <typename T>
class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto input = ctx.Input<Tensor>("Input");
......@@ -198,6 +271,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
int groups = ctx.Attr<int>("groups");
int64_t user_workspace_size =
static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
if (exhaustive_search && FLAGS_cudnn_deterministic) {
PADDLE_THROW(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time.");
}
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
......@@ -265,14 +345,66 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnnConvolutionBwdFilterAlgo_t filter_algo;
size_t workspace_size_in_bytes = 0, tmp_size = 0;
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
int64_t max_user_size =
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
user_workspace_size);
workspace_size_limit = max_user_size * 1024 * 1024;
}
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
if (input_grad) {
if (!FLAGS_cudnn_deterministic) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) {
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
data_algo_cache =
ctx.scope()
.FindVar(kCUDNNBwdDataAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
} else {
data_algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNBwdDataAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
}
data_algo = data_algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionBwdDataAlgoPerf_t,
kNUM_CUDNN_BWD_DATA_ALGS>
data_perf_stat;
auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::
cudnnFindConvolutionBackwardDataAlgorithmEx(
handle, cudnn_filter_desc, filter_data,
cudnn_output_grad_desc, output_grad_data,
cudnn_conv_desc, cudnn_input_desc, input_grad_data,
kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
data_perf_stat.data(), cudnn_workspace,
workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_bd_data_func,
workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = data_perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return data_perf_stat[0].algo;
});
VLOG(3) << "cuDNN backward data algo " << data_algo;
} else if (FLAGS_cudnn_deterministic) {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else {
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc,
......@@ -285,10 +417,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_input_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
} else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc,
......@@ -297,17 +426,54 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
if (filter_grad) {
if (!FLAGS_cudnn_deterministic) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) {
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
f_algo_cache =
ctx.scope()
.FindVar(kCUDNNBwdFilterAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
} else {
f_algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNBwdFilterAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
}
filter_algo = f_algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
kNUM_CUDNN_BWD_FILTER_ALGS>
filter_perf_stat;
auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithmEx(
handle, cudnn_input_desc, input_data,
cudnn_output_grad_desc, output_grad_data,
cudnn_conv_desc, cudnn_filter_desc,
filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
&returned_algo_count, filter_perf_stat.data(),
cudnn_workspace, workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_bd_f_func,
workspace_size_limit);
return filter_perf_stat[0].algo;
});
VLOG(3) << "cuDNN backward filter algo " << filter_algo;
} else if (FLAGS_cudnn_deterministic) {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else {
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_input_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
} else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
......@@ -317,7 +483,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <unordered_map>
#include <vector>
namespace paddle {
namespace operators {
template <typename TAlgorithm>
class AlgorithmsCache {
public:
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations,
int algorithmFlags, // can set for different data type
std::function<TAlgorithm()> gen_func);
private:
std::unordered_map<int64_t, TAlgorithm> hash_;
std::mutex mutex_;
};
template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
std::lock_guard<std::mutex> lock(mutex_);
int64_t seed = 0;
// Hash all of the inputs, use to try and look up a previously
// discovered algorithm, or fall back to generating a new one.
std::hash<int64_t> hashFn;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for (const auto num : dims1) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
for (const auto num : dims2) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
}
for (const auto num : strides) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 2;
}
for (const auto num : paddings) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 3;
}
for (const auto num : dilations) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 4;
}
seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
(seed << 6) + (seed >> 2) + 5;
if (seed == 0) return gen_func();
if (hash_.find(seed) == hash_.end()) {
TAlgorithm value = gen_func();
hash_[seed] = value;
}
return hash_[seed];
}
} // namespace operators
} // namespace paddle
......@@ -189,6 +189,11 @@ void Conv2DOpMaker::Make() {
"workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully.")
.SetDefault(4096);
AddAttr<bool>("exhaustive_search",
"(bool, default false) cuDNN has many algorithm to calculation "
"convolution, whether enable exhaustive search ",
"for cuDNN convolution or not, defalut is False.")
.SetDefault(false);
AddComment(R"DOC(
Convolution Operator.
......@@ -219,6 +224,15 @@ $$
)DOC");
}
class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{
{"Input", /*->*/ "Output"}};
}
};
void Conv3DOpMaker::Make() {
AddInput(
"Input",
......@@ -283,7 +297,11 @@ void Conv3DOpMaker::Make() {
"workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully.")
.SetDefault(4096);
AddAttr<bool>("exhaustive_search",
"(bool, default false) cuDNN has many algorithm to calculation "
"convolution, whether enable exhaustive search ",
"for cuDNN convolution or not, defalut is False.")
.SetDefault(false);
AddComment(R"DOC(
Convolution3D Operator.
......@@ -356,6 +374,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
ops::ConvOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
......@@ -363,7 +382,9 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
ops::ConvOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include <string>
namespace paddle {
namespace operators {
......@@ -179,6 +180,15 @@ or not. But the output only shares the LoD information with input X.
)DOC");
}
};
class CrossEntropyOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
}
};
} // namespace operators
} // namespace paddle
......@@ -186,6 +196,7 @@ namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
ops::CrossEntropyOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
......
......@@ -133,10 +133,10 @@ void AsyncBRPCServer::StartServer() {
void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); }
void AsyncBRPCServer::WaitServerReady() {
VLOG(3) << "AsyncGRPCServer is wait server ready";
VLOG(30) << "AsyncGRPCServer is wait server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
VLOG(3) << "AsyncGRPCServer WaitSeverReady";
VLOG(30) << "AsyncGRPCServer WaitSeverReady";
}
}; // namespace distributed
......
......@@ -38,7 +38,7 @@ void GRPCClient::SendComplete() {
std::unique_lock<std::mutex> lk(completed_mutex_);
if (!completed_) {
for (auto& it : channels_) {
VLOG(3) << "send complete message to " << it.first;
VLOG(30) << "send complete message to " << it.first;
this->AsyncSendComplete(it.first);
}
PADDLE_ENFORCE(this->Wait(), "internal grpc error");
......@@ -81,7 +81,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
::grpc::ByteBuffer req;
SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
// stub context
s->response_call_back_ = nullptr;
......@@ -142,7 +142,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
::grpc::ByteBuffer buf;
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
// stub context
s->response_call_back_ = ProcGetResponse;
......@@ -190,7 +190,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
::grpc::ByteBuffer req;
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
// stub context
s->response_call_back_ = ProcGetResponse;
......@@ -328,14 +328,14 @@ void GRPCClient::Proceed() {
void* tag = nullptr;
bool ok = false;
VLOG(3) << "GRPCClient Proceed begin";
VLOG(30) << "GRPCClient Proceed begin";
while (!stopped_ && cq_.Next(&tag, &ok)) {
BaseProcessor* c = static_cast<BaseProcessor*>(tag);
GPR_ASSERT(ok);
PADDLE_ENFORCE(c);
if (c->status_.ok()) {
VLOG(3) << c->GetVarHandlePtr()->String() << " process";
VLOG(30) << c->GetVarHandlePtr()->String() << " process";
c->Process();
} else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
// FIXME(gongwb): parse error_details?
......@@ -370,7 +370,7 @@ void GRPCClient::Proceed() {
sync_cond_.notify_all();
}
}
VLOG(3) << "GRPCClient Proceed end";
VLOG(30) << "GRPCClient Proceed end";
}
std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
......
......@@ -98,7 +98,7 @@ class RequestSend final : public RequestBase {
void Process() override {
std::string varname = GetReqName();
VLOG(4) << "RequestSend var_name:" << varname;
VLOG(40) << "RequestSend var_name:" << varname;
auto scope = request_->GetMutableLocalScope();
auto invar = request_->GetVar();
......@@ -135,7 +135,7 @@ class RequestGet final : public RequestBase {
// proc request.
std::string varname = request_.varname();
int trainer_id = request_.trainer_id();
VLOG(4) << "RequestGet " << varname;
VLOG(40) << "RequestGet " << varname;
auto scope = request_handler_->scope();
auto invar = scope->FindVar(varname);
......@@ -182,8 +182,8 @@ class RequestPrefetch final : public RequestBase {
std::string in_var_name = request_->Varname();
std::string out_var_name = request_->OutVarname();
int trainer_id = request_->GetTrainerId();
VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
<< " out_var_name: " << out_var_name;
VLOG(40) << "RequestPrefetch, in_var_name: " << in_var_name
<< " out_var_name: " << out_var_name;
auto scope = request_->GetMutableLocalScope();
auto invar = scope->FindVar(in_var_name);
......@@ -231,8 +231,8 @@ class RequestCheckpointNotify final : public RequestBase {
std::string checkpoint_dir = request_->OutVarname();
int trainer_id = request_->GetTrainerId();
VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
<< ", dir: " << checkpoint_dir;
VLOG(40) << "RequestCheckpointNotify notify: " << checkpoint_notify
<< ", dir: " << checkpoint_dir;
request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr,
trainer_id, checkpoint_dir);
......@@ -246,10 +246,10 @@ class RequestCheckpointNotify final : public RequestBase {
};
void AsyncGRPCServer::WaitServerReady() {
VLOG(4) << "AsyncGRPCServer is wait server ready";
VLOG(40) << "AsyncGRPCServer is wait server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
VLOG(4) << "AsyncGRPCServer WaitSeverReady";
VLOG(40) << "AsyncGRPCServer WaitSeverReady";
}
void AsyncGRPCServer::StartServer() {
......@@ -282,14 +282,15 @@ void AsyncGRPCServer::StartServer() {
reqs.reserve(kRequestBufSize);
for (int i = 0; i < kRequestBufSize; i++) {
VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i;
VLOG(60) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
<< " I: " << i;
TryToRegisterNewOne(rpc_name, i);
}
for (int i = 0; i < threadnum; i++) {
rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
&AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
VLOG(4) << t.first << " creates threads!";
VLOG(40) << t.first << " creates threads!";
}
}
......@@ -306,7 +307,7 @@ void AsyncGRPCServer::StartServer() {
auto& threads = t.second;
for (size_t i = 0; i < threads.size(); ++i) {
threads[i]->join();
VLOG(4) << t.first << " threads ends!";
VLOG(40) << t.first << " threads ends!";
}
}
}
......@@ -314,7 +315,7 @@ void AsyncGRPCServer::StartServer() {
void AsyncGRPCServer::ShutdownQueue() {
for (auto& t : rpc_cq_) {
t.second->Shutdown();
VLOG(4) << t.first << " queue shutdown!";
VLOG(40) << t.first << " queue shutdown!";
}
}
......@@ -323,7 +324,7 @@ void AsyncGRPCServer::ShutDownImpl() {
is_shut_down_ = true;
ShutdownQueue();
VLOG(4) << "server_ shutdown!";
VLOG(40) << "server_ shutdown!";
server_->Shutdown();
}
......@@ -331,12 +332,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
int req_id) {
std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) {
VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
VLOG(40) << "shutdown, do not TryToRegisterNewSendOne";
return;
}
VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
<< " REQ ID: " << req_id;
VLOG(40) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
<< " REQ ID: " << req_id;
auto& reqs = rpc_reqs_[rpc_name];
auto& handler = rpc_call_map_[rpc_name];
......@@ -357,7 +358,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
reqs[req_id] = b;
VLOG(4) << "Create RequestSend status:" << b->Status();
VLOG(40) << "Create RequestSend status:" << b->Status();
}
void AsyncGRPCServer::HandleRequest(
......@@ -367,15 +368,15 @@ void AsyncGRPCServer::HandleRequest(
bool ok = false;
while (true) {
VLOG(4) << "HandleRequest " << rpc_name << " wait next";
VLOG(40) << "HandleRequest " << rpc_name << " wait next";
if (!cq->Next(&tag, &ok)) {
VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!";
VLOG(30) << "CompletionQueue " << rpc_name << " shutdown!";
break;
}
int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
<< " get next";
VLOG(40) << "HandleRequest " << rpc_name << ", req_id:" << req_id
<< " get next";
auto& reqs = rpc_reqs_[rpc_name];
RequestBase* base = nullptr;
......@@ -385,7 +386,7 @@ void AsyncGRPCServer::HandleRequest(
base = reqs[req_id];
}
VLOG(3) << base->Status2String(rpc_name);
VLOG(30) << base->Status2String(rpc_name);
// reference:
// https://github.com/tensorflow/tensorflow/issues/5596
......
......@@ -75,7 +75,7 @@ class VarHandle {
wait_cond_.wait(lk, [this] { return status_ != kDefaultState; });
ret = status_;
}
VLOG(7) << "VarHandle wait:" << ret;
VLOG(70) << "VarHandle wait:" << ret;
return ret != kErrorState;
}
......@@ -84,7 +84,7 @@ class VarHandle {
std::unique_lock<std::mutex> lk(sync_mutex_);
status_ = ok ? kFinishState : kErrorState;
}
VLOG(7) << "VarHandle finish:" << ok;
VLOG(70) << "VarHandle finish:" << ok;
wait_cond_.notify_all();
}
......
......@@ -38,19 +38,19 @@ bool RequestSendHandler::Handle(const std::string& varname,
framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name) {
VLOG(4) << "RequestSendHandler:" << varname;
VLOG(40) << "RequestSendHandler:" << varname;
// Sync
if (varname == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
VLOG(30) << "sync: recv BATCH_BARRIER_MESSAGE";
rpc_server_->IncreaseBatchBarrier(kRequestSend);
} else if (varname == COMPLETE_MESSAGE) {
VLOG(3) << "sync: recv complete message";
VLOG(30) << "sync: recv complete message";
rpc_server_->Complete();
} else {
// Async
if (!sync_mode_) {
VLOG(3) << "async process var: " << varname;
VLOG(30) << "async process var: " << varname;
try {
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
scope);
......@@ -61,7 +61,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
return true;
} else { // sync
rpc_server_->WaitCond(kRequestSend);
VLOG(3) << "sync: processing received var: " << varname;
VLOG(30) << "sync: processing received var: " << varname;
if (invar == nullptr) {
LOG(FATAL) << "sync: Can not find server side var: " << varname;
......@@ -78,10 +78,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name) {
VLOG(4) << "RequestGetHandler:" << varname;
VLOG(40) << "RequestGetHandler:" << varname;
if (sync_mode_) {
if (varname == FETCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv fetch barrier message";
VLOG(30) << "sync: recv fetch barrier message";
rpc_server_->IncreaseBatchBarrier(kRequestGet);
} else {
rpc_server_->WaitCond(kRequestGet);
......@@ -93,13 +93,14 @@ bool RequestGetHandler::Handle(const std::string& varname,
// NOTE: the format is determined by distributed_transpiler.py
std::string param_bak_name =
string::Sprintf("%s.trainer_%d_bak", varname, trainer_id);
VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id;
VLOG(30) << "getting " << param_bak_name << " trainer_id "
<< trainer_id;
auto var = scope_->FindVar(varname);
auto t_orig = var->Get<framework::LoDTensor>();
auto param_bak = scope_->Var(param_bak_name);
auto t = param_bak->GetMutable<framework::LoDTensor>();
t->mutable_data(dev_ctx_->GetPlace(), t_orig.type());
VLOG(3) << "copying " << varname << " to " << param_bak_name;
VLOG(30) << "copying " << varname << " to " << param_bak_name;
framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
}
*outvar = scope_->FindVar(varname);
......@@ -114,7 +115,7 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name) {
VLOG(4) << "RequestPrefetchHandler " << varname;
VLOG(40) << "RequestPrefetchHandler " << varname;
auto var_desc = program_->Block(0).FindVar(out_var_name);
InitializeVariable(*outvar, var_desc->GetType());
......@@ -138,8 +139,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
lt_var->clear();
lt_var->append(out_var_name);
VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
<< out_var_name;
VLOG(40) << "RequestCheckpointHandler update var kLookupTablePath to: "
<< out_var_name;
executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_);
return true;
}
......
......@@ -39,7 +39,7 @@ void RPCServer::SavePort() const {
port_file.open(file_path);
port_file << selected_port_;
port_file.close();
VLOG(4) << "selected port written to " << file_path;
VLOG(40) << "selected port written to " << file_path;
}
void RPCServer::WaitBarrier(const std::string& rpc_name) {
......@@ -49,12 +49,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
exit_flag_.load());
});
VLOG(3) << "batch_barrier_: " << rpc_name << " "
<< barrier_counter_[rpc_name];
VLOG(30) << "batch_barrier_: " << rpc_name << " "
<< barrier_counter_[rpc_name];
}
void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
VLOG(40) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
int b = 0;
std::unique_lock<std::mutex> lock(mutex_);
b = ++barrier_counter_[rpc_name];
......@@ -71,7 +71,7 @@ void RPCServer::Complete() {
client_num_--;
need_reset_all_vars_ = true;
VLOG(4) << "decrease client_num to: " << client_num_;
VLOG(40) << "decrease client_num to: " << client_num_;
if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
barrier_counter_[kRequestGet]--;
}
......@@ -90,7 +90,7 @@ int RPCServer::GetClientNum() {
}
void RPCServer::ResetBarrierCounter() {
VLOG(3) << "RPCServer ResetBarrierCounter ";
VLOG(30) << "RPCServer ResetBarrierCounter ";
std::unique_lock<std::mutex> lock(mutex_);
for (auto& t : barrier_counter_) {
t.second = 0;
......@@ -105,12 +105,12 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
static int cond = -1;
rpc_cond_map_[rpc_name] = ++cond;
VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
<< ", cond:" << rpc_cond_map_[rpc_name];
VLOG(40) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
<< ", cond:" << rpc_cond_map_[rpc_name];
}
void RPCServer::SetCond(const std::string& rpc_name) {
VLOG(3) << "RPCServer SetCond " << rpc_name;
VLOG(30) << "RPCServer SetCond " << rpc_name;
{
std::unique_lock<std::mutex> lock(mutex_);
cur_cond_ = rpc_cond_map_[rpc_name];
......@@ -120,7 +120,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
}
void RPCServer::WaitCond(const std::string& rpc_name) {
VLOG(4) << "RPCServer WaitCond " << rpc_name;
VLOG(40) << "RPCServer WaitCond " << rpc_name;
int cond = 0;
{
std::unique_lock<std::mutex> lock(mutex_);
......
......@@ -50,7 +50,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
size_to_write = length - total_written;
}
// This log is useful to see how long a internal block size is of rpc.
VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
VLOG(70) << "copy " << size_to_write << " data to CUDAPlace";
memory::Copy(boost::get<platform::CUDAPlace>(place),
reinterpret_cast<void*>(p), cpu, data, size_to_write,
gpu_dev_ctx.stream());
......@@ -79,7 +79,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
// TODO(gongwb): can we avoid copy?
platform::CPUPlace cpu;
// This log is useful to see how long a internal block size is of rpc.
VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
VLOG(70) << "copy " << size_to_write << " data to CPUPlace";
memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
p += size_to_write;
......@@ -198,8 +198,8 @@ bool VariableResponse::ProcSerializedField(
#endif
}
VLOG(7) << "ProcSerializedField:" << meta_.varname()
<< ", type:" << meta_.type() << std::endl;
VLOG(70) << "ProcSerializedField:" << meta_.varname()
<< ", type:" << meta_.type() << std::endl;
framework::DDim dims = GetDims(meta_.dims());
if (meta_.type() == sendrecv::LOD_TENSOR) {
PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
......
......@@ -75,16 +75,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
}
};
class ElementwiseOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
auto x_name = op_desc.Input("X")[0];
auto out_name = op_desc.Output("Out")[0];
auto &x = block->FindRecursiveOrCreateVar(x_name);
auto &out = block->FindRecursiveOrCreateVar(out_name);
out.SetType(x.GetType());
out.SetDataType(x.GetDataType());
class ElementwiseOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
......
......@@ -47,8 +47,8 @@ class FeedOp : public framework::OperatorBase {
auto col = Attr<int>("col");
VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var "
<< out_name;
VLOG(30) << "Feed Var " << feed_var_name << "'s " << col
<< " column to var " << out_name;
auto &feed_list = feed_var->Get<framework::FeedFetchList>();
auto &feed_item = feed_list.at(static_cast<size_t>(col));
......
......@@ -43,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase {
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
for (auto& ep : eps) {
VLOG(3) << "fetch barrier, ep: " << ep;
VLOG(30) << "fetch barrier, ep: " << ep;
rpc_client->AsyncSendFetchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
......
......@@ -57,7 +57,7 @@ class FetchOp : public framework::OperatorBase {
TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
dst_item.set_lod(src_item.lod());
VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
VLOG(30) << "Fetch variable " << fetch_var_name << " to " << out_name;
}
};
......
......@@ -64,7 +64,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
for (auto& ep : endpoint_list) {
VLOG(3) << "sending nccl id to " << ep;
VLOG(30) << "sending nccl id to " << ep;
client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
}
client->Wait();
......@@ -72,7 +72,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
client->AsyncSendBatchBarrier(ep);
}
client->Wait();
VLOG(3) << "sending completed...";
VLOG(30) << "sending completed...";
}
void GetIdByServer(framework::Scope* scope,
......@@ -99,11 +99,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
rpc_service->SetCond(distributed::kRequestSend);
VLOG(3) << "start getting nccl id from trainer 0...";
VLOG(30) << "start getting nccl id from trainer 0...";
rpc_service->WaitBarrier(distributed::kRequestSend);
VLOG(3) << "got nccl id and stop server...";
VLOG(30) << "got nccl id and stop server...";
rpc_service->ShutDown();
VLOG(3) << "rpc server stopped";
VLOG(30) << "rpc server stopped";
server_thread.join();
}
};
......
......@@ -63,12 +63,19 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
Tensor ones;
ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
Tensor half_xmax, half_ymax;
half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto half_xmax_t =
EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max);
half_ymax.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto half_ymax_t =
EigenTensor<T, 3>::From(half_ymax).setConstant(0.5 * y_max);
// scale grid to [0, h-1/w-1]
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t.device(place) = 0.5 * ((grid_x_t + ones_t) * x_max);
grid_y_t.device(place) = 0.5 * ((grid_y_t + ones_t) * y_max);
grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t;
grid_y_t.device(place) = (grid_y_t + ones_t) * half_ymax_t;
// calculate coords of 4 corner points
x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
......@@ -9,7 +9,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/bilinear_interp_op.h"
#include "paddle/fluid/operators/interpolate_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
......@@ -18,27 +19,34 @@ namespace operators {
using framework::Tensor;
class BilinearInterpOp : public framework::OperatorWithKernel {
class InterpolateOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of BilinearInterOp should not be null.");
"Input(X) of InterpolateOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of BilinearInterOp should not be null.");
"Output(Out) of InterpolationOp should not be null.");
auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
PADDLE_ENFORCE(
"bilinear" == interp_method || "nearest" == interp_method,
"Interpolation method can only be \"bilinear\" or \"nearest\".");
auto dim_x = ctx->GetInputDim("X"); // NCHW format
int out_h = ctx->Attrs().Get<int>("out_h");
int out_w = ctx->Attrs().Get<int>("out_w");
PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
if (ctx->HasInput("OutSize")) {
if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
auto out_size_dim = ctx->GetInputDim("OutSize");
PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
"OutSize's dimension size must be 1");
PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
ctx->ShareLoD("X", "Out");
return;
}
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
......@@ -52,35 +60,53 @@ class BilinearInterpOp : public framework::OperatorWithKernel {
}
};
class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of bilinear interpolation, "
"This is a 4-D tensor with shape of (N x C x h x w)");
"The input tensor of interpolate operator, "
"This is a 4-D tensor with shape of [N, C, H, w].");
AddInput("OutSize",
"This is a 1-D tensor with two number. "
"This is a 1-D tensor with two numbers to specify output size. "
"The first number is height and the second number is width.")
.AsDispensable();
AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)");
AddOutput("Out",
"The output tensor of interpolate operator, "
"This is a 4-D tensor with shape of [N, C, H, W].");
AddAttr<int>("out_h", "output height of bilinear interpolation op.");
AddAttr<int>("out_w", "output width of bilinear interpolation op.");
AddAttr<int>("out_h", "output height of interpolate op.");
AddAttr<int>("out_w", "output width of interpolate op.");
AddAttr<std::string>(
"interp_method",
"(string), interpolation method, can be \"bilinear\" for "
"bilinear interpolation and \"nearest\" for nearest "
"neighbor interpolation.");
AddComment(R"DOC(
This operator samples input X to given output shape by using specified
interpolation method, the interpolation methods can be \"nearest\"
for nearest neighbor interpolation and \"bilinear\" for bilinear
interpolation.
Nearest neighbor interpolation is to perform nearest neighbor interpolation
in both the 3rd dimention(in height direction) and the 4th dimention(in width
direction) on input tensor.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this op) on a rectilinear 2D grid.
The key idea is to perform linear interpolation first in one
direction, and then again in the other direction.
For details, please refer to Wikipedia:
W-direction in this op) on a rectilinear 2D grid. The key idea is
to perform linear interpolation first in one direction, and then
again in the other direction.
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
For details of bilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation
)DOC");
}
};
class BilinearInterpOpGrad : public framework::OperatorWithKernel {
class InterpolateOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -106,11 +132,11 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
ops::BilinearInterpOpMaker,
REGISTER_OPERATOR(interpolate, ops::InterpolateOp, ops::InterpolateOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
ops::BilinearInterpKernel<uint8_t>);
REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
ops::BilinearInterpGradKernel<float>);
REGISTER_OPERATOR(interpolate_grad, ops::InterpolateOpGrad);
REGISTER_OP_CPU_KERNEL(interpolate, ops::InterpolateKernel<float>,
ops::InterpolateKernel<double>,
ops::InterpolateKernel<uint8_t>);
REGISTER_OP_CPU_KERNEL(interpolate_grad, ops::InterpolateGradKernel<float>,
ops::InterpolateGradKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
......@@ -9,7 +9,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/bilinear_interp_op.h"
#include <string>
#include "paddle/fluid/operators/interpolate_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
......@@ -17,15 +18,72 @@ namespace operators {
using framework::Tensor;
template <typename T>
__global__ void KeNearestNeighborInterpFw(
const T* in, const size_t in_img_h, const size_t in_img_w,
const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
const size_t out_img_w, const size_t output_h, const size_t output_w,
const size_t num_channels, const float ratio_h, const float ratio_w) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (; tid < nthreads; tid += stride) {
int out_id_h = tid / output_w;
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
int out_img_size = output_w / num_channels;
int channel_id = out_id_w / out_img_size;
int out_img_idy = (out_id_w % out_img_size) / out_img_w;
int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
int out_img_idx = tid % out_img_w;
int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
}
}
template <typename T>
__global__ void KeNearestNeighborInterpBw(
T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
const size_t input_w, const T* out, const size_t out_img_h,
const size_t out_img_w, const size_t output_h, const size_t output_w,
const size_t num_channels, const float ratio_h, const float ratio_w) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (; tid < nthreads; tid += stride) {
int out_id_h = tid / output_w;
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
int out_img_size = output_w / num_channels;
int channel_id = out_id_w / out_img_size;
int out_img_idy = (out_id_w % out_img_size) / out_img_w;
int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
int out_img_idx = tid % out_img_w;
int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
const T out_pos = out[out_id_h * output_w + out_id_w];
platform::CudaAtomicAdd(in_pos, out_pos);
}
}
template <typename T>
__global__ void KeBilinearInterpFw(
const T* in, const size_t in_img_h, const size_t in_img_w,
const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
const size_t out_img_w, const size_t output_h, const size_t output_w,
const size_t num_channels, const T ratio_h, const T ratioW) {
const size_t num_channels, const float ratio_h, const float ratio_w) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < nthreads) {
int stride = blockDim.x * gridDim.x;
for (; tid < nthreads; tid += stride) {
int out_id_h = tid / output_w;
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
......@@ -39,9 +97,9 @@ __global__ void KeBilinearInterpFw(
T h2lambda = 1.f - h1lambda;
int out_img_idx = tid % out_img_w;
int in_img_idx = ratioW * out_img_idx;
int in_img_idx = ratio_w * out_img_idx;
int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
T w1lambda = ratioW * out_img_idx - in_img_idx;
T w1lambda = ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
......@@ -60,10 +118,11 @@ __global__ void KeBilinearInterpBw(
T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
const size_t input_w, const T* out, const size_t out_img_h,
const size_t out_img_w, const size_t output_h, const size_t output_w,
const size_t num_channels, const T ratio_h, const T ratioW) {
const size_t num_channels, const T ratio_h, const T ratio_w) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < nthreads) {
int stride = blockDim.x * gridDim.x;
for (; tid < nthreads; tid += stride) {
int out_id_h = tid / output_w;
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
......@@ -77,122 +136,146 @@ __global__ void KeBilinearInterpBw(
T h2lambda = 1.f - h1lambda;
int out_img_idx = tid % out_img_w;
int in_img_idx = ratioW * out_img_idx;
int in_img_idx = ratio_w * out_img_idx;
int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
T w1lambda = ratioW * out_img_idx - in_img_idx;
T w1lambda = ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
const T* out_pos = &out[out_id_h * output_w + out_id_w];
atomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
atomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
atomicAdd(&in_pos[h_id * in_img_w], h1lambda * w2lambda * out_pos[0]);
atomicAdd(&in_pos[h_id * in_img_w + w_id],
h1lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
h1lambda * w1lambda * out_pos[0]);
}
}
template <typename T>
class BilinearInterpOpCUDAKernel : public framework::OpKernel<T> {
class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto* input_t = ctx.Input<Tensor>("X"); // float tensor
auto* output_t = ctx.Output<Tensor>("Out"); // float tensor
auto* input = input_t->data<T>();
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
auto* input_data = input->data<T>();
auto interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_dims = output_t->dims();
auto out_size_t = ctx.Input<Tensor>("OutSize");
if (out_size_t != nullptr) {
auto out_size = ctx.Input<Tensor>("OutSize");
if (out_size != nullptr) {
Tensor sizes;
framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
auto size_data = sizes.data<int>();
out_h = size_data[0];
out_w = size_data[1];
}
auto* output = output_t->mutable_data<T>(
{out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
int batch_size = input_t->dims()[0];
int channels = input_t->dims()[1];
int in_h = input_t->dims()[2];
int in_w = input_t->dims()[3];
int n = input->dims()[0];
int c = input->dims()[1];
int in_h = input->dims()[2];
int in_w = input->dims()[3];
auto* output_data =
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
int in_hw = in_h * in_w;
int out_hw = out_h * out_w;
int in_chw = channels * in_hw;
int out_chw = channels * out_hw;
int in_chw = c * in_hw;
int out_chw = c * out_hw;
T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) {
memcpy(output, input, input_t->numel() * sizeof(T));
} else {
int threadNum = batch_size * out_chw;
int blocks = (threadNum + 1024 - 1) / 1024;
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
int pixelNum = n * out_chw;
int grid_dim = (pixelNum + 512 - 1) / 512;
grid_dim = grid_dim > 8 ? 8 : grid_dim;
if ("nearest" == interp_method) {
KeNearestNeighborInterpFw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
out_chw, c, ratio_h, ratio_w);
} else if ("bilinear" == interp_method) {
KeBilinearInterpFw<
T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
input, in_h, in_w, batch_size, in_chw, output, out_h, out_w,
batch_size, out_chw, channels, ratio_h, ratio_w);
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
out_chw, c, ratio_h, ratio_w);
}
}
};
template <typename T>
class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* d_output = d_output_t->data<T>();
auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* output_grad_data = output_grad->data<T>();
auto* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
auto& device_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
math::SetConstant<platform::CUDADeviceContext, T> zero;
zero(device_ctx, d_input_t, static_cast<T>(0.0));
zero(device_ctx, input_grad, static_cast<T>(0.0));
auto interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_size_t = ctx.Input<Tensor>("OutSize");
if (out_size_t != nullptr) {
auto out_size = ctx.Input<Tensor>("OutSize");
if (out_size != nullptr) {
Tensor sizes;
framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
auto size_data = sizes.data<int>();
out_h = size_data[0];
out_w = size_data[1];
}
int batch_size = d_input_t->dims()[0];
int channels = d_input_t->dims()[1];
int in_h = d_input_t->dims()[2];
int in_w = d_input_t->dims()[3];
int n = input_grad->dims()[0];
int c = input_grad->dims()[1];
int in_h = input_grad->dims()[2];
int in_w = input_grad->dims()[3];
int in_hw = in_h * in_w;
int out_hw = out_h * out_w;
int in_chw = channels * in_hw;
int out_chw = channels * out_hw;
int in_chw = c * in_hw;
int out_chw = c * out_hw;
T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) {
memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
} else {
int threadNum = batch_size * out_chw;
int blocks = (threadNum + 1024 - 1) / 1024;
framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
return;
}
int pixelNum = n * out_chw;
int grid_dim = (pixelNum + 512 - 1) / 512;
grid_dim = grid_dim > 8 ? 8 : grid_dim;
if ("nearest" == interp_method) {
KeNearestNeighborInterpBw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
out_w, n, out_chw, c, ratio_h, ratio_w);
} else if ("bilinear" == interp_method) {
KeBilinearInterpBw<
T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
d_input, in_h, in_w, batch_size, in_chw, d_output, out_h, out_w,
batch_size, out_chw, channels, ratio_h, ratio_w);
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
out_w, n, out_chw, c, ratio_h, ratio_w);
}
}
};
......@@ -201,7 +284,9 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(bilinear_interp,
ops::BilinearInterpOpCUDAKernel<float>);
REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad,
ops::BilinearInterpGradOpCUDAKernel<float>);
REGISTER_OP_CUDA_KERNEL(interpolate, ops::InterpolateOpCUDAKernel<float>,
ops::InterpolateOpCUDAKernel<double>,
ops::InterpolateOpCUDAKernel<int>);
REGISTER_OP_CUDA_KERNEL(interpolate_grad,
ops::InterpolateGradOpCUDAKernel<float>,
ops::InterpolateGradOpCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Tensor = framework::Tensor;
template <typename T>
static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
const float ratio_h, const float ratio_w,
const int n, const int c,
const int out_h, const int out_w) {
auto input_t = EigenTensor<T, 4>::From(input);
auto output_t = EigenTensor<T, 4>::From(*output);
for (int k = 0; k < out_h; k++) { // loop for images
int in_k = static_cast<int>(ratio_h * k + 0.5);
for (int l = 0; l < out_w; l++) {
int in_l = static_cast<int>(ratio_w * l + 0.5);
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
output_t(i, j, k, l) = input_t(i, j, in_k, in_l);
}
}
}
}
}
template <typename T>
static void BilinearInterpolation(const Tensor& input, Tensor* output,
const float ratio_h, const float ratio_w,
const int in_h, const int in_w, const int n,
const int c, const int out_h,
const int out_w) {
auto input_t = EigenTensor<T, 4>::From(input);
auto output_t = EigenTensor<T, 4>::From(*output);
for (int k = 0; k < out_h; k++) { // loop for images
int y_n = static_cast<int>(ratio_h * k);
int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
float d_n = ratio_h * k - y_n;
float d_s = 1.f - d_n;
for (int l = 0; l < out_w; l++) {
int x_w = static_cast<int>(ratio_w * l);
int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
float d_w = ratio_w * l - x_w;
float d_e = 1.f - d_w;
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
// bilinear interpolation
output_t(i, j, k, l) = input_t(i, j, y_n, x_w) * d_s * d_e +
input_t(i, j, y_s, x_w) * d_n * d_e +
input_t(i, j, y_n, x_e) * d_s * d_w +
input_t(i, j, y_s, x_e) * d_n * d_w;
}
}
}
}
}
template <typename T>
static void NearestNeighborInterpolateGrad(const Tensor& output_grad,
Tensor* input_grad,
const float ratio_h,
const float ratio_w, const int n,
const int c, const int out_h,
const int out_w) {
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int k = 0; k < out_h; k++) { // loop for images
int in_k = static_cast<int>(ratio_h * k + 0.5);
for (int l = 0; l < out_w; l++) {
int in_l = static_cast<int>(ratio_w * l + 0.5);
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l);
}
}
}
}
}
template <typename T>
static void BilinearInterpolationGrad(const Tensor& output_grad,
Tensor* input_grad, const float ratio_h,
const float ratio_w, const int in_h,
const int in_w, const int n, const int c,
const int out_h, const int out_w) {
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
for (int k = 0; k < out_h; k++) { // loop for images
int y_n = static_cast<int>(ratio_h * k);
int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
float d_n = ratio_h * k - y_n;
float d_s = 1.f - d_n;
for (int l = 0; l < out_w; l++) {
int x_w = static_cast<int>(ratio_w * l);
int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
float d_w = ratio_w * l - x_w;
float d_e = 1.f - d_w;
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
// bilinear interpolation grad
const T grad = output_grad_t(i, j, k, l);
input_grad_t(i, j, y_n, x_w) += static_cast<T>(grad * d_s * d_e);
input_grad_t(i, j, y_s, x_w) += static_cast<T>(grad * d_n * d_e);
input_grad_t(i, j, y_n, x_e) += static_cast<T>(grad * d_s * d_w);
input_grad_t(i, j, y_s, x_e) += static_cast<T>(grad * d_n * d_w);
}
}
}
}
}
template <typename T>
class InterpolateKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
std::string interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_size = ctx.Input<Tensor>("OutSize");
if (out_size != nullptr) {
auto out_size_data = out_size->data<int>();
out_h = out_size_data[0];
out_w = out_size_data[1];
}
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
auto& device_ctx =
ctx.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> zero;
zero(device_ctx, output, static_cast<T>(0.0));
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if ("bilinear" == interp_method) {
BilinearInterpolation<T>(*input, output, ratio_h, ratio_w, in_h, in_w, n,
c, out_h, out_w);
} else if ("nearest" == interp_method) {
NearestNeighborInterpolate<T>(*input, output, ratio_h, ratio_w, n, c,
out_h, out_w);
}
}
};
template <typename T>
class InterpolateGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
std::string interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
auto out_size = ctx.Input<Tensor>("OutSize");
if (out_size != nullptr) {
auto out_size_data = out_size->data<int>();
out_h = out_size_data[0];
out_w = out_size_data[1];
}
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
auto& device_ctx =
ctx.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> zero;
zero(device_ctx, input_grad, static_cast<T>(0.0));
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
return;
}
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if ("bilinear" == interp_method) {
BilinearInterpolationGrad<T>(*output_grad, input_grad, ratio_h, ratio_w,
in_h, in_w, n, c, out_h, out_w);
} else if ("nearest" == interp_method) {
NearestNeighborInterpolateGrad<T>(*output_grad, input_grad, ratio_h,
ratio_w, n, c, out_h, out_w);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -36,7 +36,7 @@ namespace operators {
void RunServer(std::shared_ptr<distributed::RPCServer> service) {
service->StartServer();
VLOG(4) << "RunServer thread end";
VLOG(40) << "RunServer thread end";
}
static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) {
......@@ -66,8 +66,8 @@ static void ParallelExecuteBlocks(
fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
int run_block = idx; // thread local
try {
VLOG(3) << "running server block: " << run_block
<< "pointer: " << prepared[run_block].get();
VLOG(30) << "running server block: " << run_block
<< "pointer: " << prepared[run_block].get();
executor->RunPreparedContext(prepared[run_block].get(), scope);
} catch (const std::exception &e) {
LOG(FATAL) << "run sub program:" << idx << " error " << e.what();
......@@ -108,7 +108,7 @@ void ListenAndServOp::RunSyncLoop(
framework::Scope *recv_scope, platform::DeviceContext *dev_ctx,
const std::vector<int> &prefetch_block_id_list,
const int checkpoint_point_block_id) const {
VLOG(2) << "RunSyncLoop";
VLOG(20) << "RunSyncLoop";
size_t num_blocks = program->Size();
auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
......@@ -167,7 +167,7 @@ void ListenAndServOp::RunSyncLoop(
}
ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
recv_scope);
VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
VLOG(20) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());
......@@ -183,11 +183,11 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
for (auto &varname : sparse_vars_) {
auto var = recv_scope->FindVar(varname);
if (var == nullptr) {
VLOG(2) << "can not find var " << varname << " in received scope";
VLOG(20) << "can not find var " << varname << " in received scope";
continue;
}
if (var->IsType<framework::SelectedRows>()) {
VLOG(3) << "reset sparse var: " << varname;
VLOG(30) << "reset sparse var: " << varname;
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
} else {
PADDLE_THROW("The type of sparse var should be SelectedRows");
......@@ -197,7 +197,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
for (auto &varname : dense_vars_) {
auto var = recv_scope->FindVar(varname);
if (var == nullptr) {
VLOG(2) << "can not find var " << varname << " in received scope";
VLOG(20) << "can not find var " << varname << " in received scope";
continue;
}
if (var->IsType<framework::LoDTensor>()) {
......@@ -216,7 +216,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
framework::ProgramDesc *program,
framework::Scope *recv_scope) const {
VLOG(2) << "RunAsyncLoop";
VLOG(20) << "RunAsyncLoop";
auto grad_to_block_id_str =
Attr<std::vector<std::string>>("grad_to_block_id");
DoubleFindMap<std::string, int32_t> grad_to_block_id;
......@@ -225,7 +225,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
const std::string &grad_and_id) {
std::vector<std::string> pieces;
split(grad_and_id, ':', &pieces);
VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
VLOG(30) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
PADDLE_ENFORCE_EQ(pieces.size(), 2);
PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0);
......@@ -270,7 +270,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
while (true) {
if (rpc_service_->IsExit()) {
VLOG(4) << "get exit!rpc_processor break!";
VLOG(40) << "get exit!rpc_processor break!";
break;
}
......@@ -332,9 +332,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
std::string endpoint = Attr<std::string>("endpoint");
int checkpoint_block_id = Attr<int>(kCheckpointBlockId);
VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
<< ", end_point:" << endpoint
<< ", checkpoint_block_id: " << checkpoint_block_id;
VLOG(40) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
<< ", end_point:" << endpoint
<< ", checkpoint_block_id: " << checkpoint_block_id;
rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
......@@ -383,8 +383,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
prefetch_var_name_to_block_id_str) {
std::vector<std::string> pieces;
split(prefetch_var_name_and_id, ':', &pieces);
VLOG(3) << "after split, prefetch_var = " << pieces[0]
<< ", id=" << pieces[1];
VLOG(30) << "after split, prefetch_var = " << pieces[0]
<< ", id=" << pieces[1];
PADDLE_ENFORCE_EQ(pieces.size(), 2);
int block_id = std::stoi(pieces[1]);
......@@ -415,7 +415,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
// start the server listening after all member initialized.
server_thread_.reset(new std::thread(RunServer, rpc_service_));
VLOG(3) << "wait server thread to become ready...";
VLOG(30) << "wait server thread to become ready...";
rpc_service_->WaitServerReady();
// register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
......
......@@ -30,9 +30,9 @@ class LoDRankTableOp : public framework::OperatorBase {
auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
VLOG(100) << "Level = " << static_cast<size_t>(Attr<int>("level"));
out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
VLOG(10) << Input("X") << "'s lod information is " << *out;
VLOG(100) << Input("X") << "'s lod information is " << *out;
}
};
......
......@@ -134,13 +134,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr);
if (is_sparse) {
VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
<< " is set to SelectedRows";
VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W")
<< " is set to SelectedRows";
block->Var(out_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
} else {
VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor";
VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor";
block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
}
block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
......
......@@ -96,8 +96,8 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
}
auto et = GetCurrentUS();
VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat
<< " us, tgt takes: " << (mt - st) / repeat;
VLOG(30) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat
<< " us, tgt takes: " << (mt - st) / repeat;
for (int i = 0; i < n; ++i) {
EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
}
......
......@@ -87,7 +87,7 @@ TEST(JitKernel, vrelu) {
vrelu_intri8(d, x_data, zref_data);
}
auto si1 = GetCurrentUS();
VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
}
#endif
auto ttgts = GetCurrentUS();
......@@ -95,8 +95,9 @@ TEST(JitKernel, vrelu) {
ker->Compute(x_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
......@@ -132,8 +133,9 @@ TEST(JitKernel, vaddbias) {
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
......@@ -183,13 +185,14 @@ TEST(JitKernel, vexp) {
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
#ifdef PADDLE_WITH_MKLML
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
#else
<< " us, "
<< " us, "
#endif
<< "tgt takes: " << (ttgte - ttgts) / repeat;
<< "tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
......@@ -254,9 +257,10 @@ TEST(JitKernel, vsigmoid) {
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
<< " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
......@@ -321,9 +325,10 @@ TEST(JitKernel, vtanh) {
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
<< " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
......@@ -441,9 +446,10 @@ TEST(JitKernel, lstm) {
ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, better(jit) takes: " << (tmkle - tmkls) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
<< " us, better(jit) takes: " << (tmkle - tmkls) / repeat
<< " us, tgt takes: " << (ttgte - ttgts) / repeat;
}
}
......@@ -525,8 +531,8 @@ TEST(JitKernel, vscal) {
vscal_inp_intri8(d, a, y_data);
}
auto si3 = GetCurrentUS();
VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
<< " us, inplace: " << (si3 - si2) / repeat;
VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
<< " us, inplace: " << (si3 - si2) / repeat;
}
#endif
......@@ -540,15 +546,17 @@ TEST(JitKernel, vscal) {
ker->Compute(&a, y_data, y_data, d);
}
auto ttgte1 = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, inplace takes: " << (trefe1 - trefs1) / repeat
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
<< " us, inplace takes: " << (trefe1 - trefs1) / repeat
#ifdef PADDLE_WITH_MKLML
<< " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, "
<< " us, mkl inplace takes: " << (tmkle - tmkls) / repeat
<< " us, "
#else
<< " us, "
<< " us, "
#endif
<< "tgt takes: " << (ttgte - ttgts) / repeat
<< "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat;
<< "tgt takes: " << (ttgte - ttgts) / repeat
<< "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
......@@ -611,7 +619,7 @@ TEST(JitKernel, vmul) {
vmul_intri8(d, x_data, y_data, zref_data);
}
auto si1 = GetCurrentUS();
VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
}
#endif
......@@ -621,13 +629,14 @@ TEST(JitKernel, vmul) {
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
#ifdef PADDLE_WITH_MKLML
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
#else
<< " us, "
<< " us, "
#endif
<< "tgt takes: " << (ttgte - ttgts) / repeat;
<< "tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
......@@ -690,7 +699,7 @@ TEST(JitKernel, vadd) {
vadd_intri8(d, x_data, y_data, zref_data);
}
auto si1 = GetCurrentUS();
VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
}
#endif
......@@ -700,13 +709,14 @@ TEST(JitKernel, vadd) {
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
#ifdef PADDLE_WITH_MKLML
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
<< " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
#else
<< " us, "
<< " us, "
#endif
<< "tgt takes: " << (ttgte - ttgts) / repeat;
<< "tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
......@@ -761,9 +771,10 @@ TEST(JitKernel, vaddrelu) {
ker->Compute(x_data, y_data, ztgt_data, d);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
<< "tgt takes: " << (ttgte - ttgts) / repeat;
VLOG(30) << "Vec size " << d
<< ": refer takes: " << (trefe - trefs) / repeat
<< " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
<< "tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
......
......@@ -270,7 +270,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
const std::vector<const framework::SelectedRows*>& inputs,
framework::SelectedRows* output) {
if (inputs.size() == 0) {
VLOG(3) << "no input! return";
VLOG(30) << "no input! return";
return;
}
const framework::SelectedRows* has_value_input = nullptr;
......@@ -281,7 +281,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
}
}
if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl;
VLOG(30) << "no input has value! just return" << std::endl;
return;
}
auto input_width = has_value_input->value().dims()[1];
......
......@@ -314,7 +314,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
const std::vector<const framework::SelectedRows*>& inputs,
framework::SelectedRows* output) {
if (inputs.size() == 0) {
VLOG(3) << "no input! return";
VLOG(30) << "no input! return";
return;
}
const framework::SelectedRows* has_value_input = nullptr;
......@@ -325,7 +325,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
}
}
if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl;
VLOG(30) << "no input has value! just return" << std::endl;
return;
}
auto input_width = has_value_input->value().dims()[1];
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mean_op.h"
#include <string>
namespace paddle {
namespace operators {
......@@ -42,6 +42,14 @@ Mean Operator calculates the mean of all elements in X.
}
};
class MeanOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
class MeanGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -50,6 +58,14 @@ class MeanGradOp : public framework::OperatorWithKernel {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", framework::GradVarName("X"));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
class MeanGradMaker : public framework::SingleGradOpDescMaker {
......@@ -71,7 +87,8 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
ops::MeanGradMaker);
REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
REGISTER_OP_CPU_KERNEL(
mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -346,7 +346,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
// sparse update maybe empty.
if (grad->rows().size() == 0) {
VLOG(3) << "Grad SelectedRows contains no data!";
VLOG(30) << "Grad SelectedRows contains no data!";
return;
}
auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
......
......@@ -38,9 +38,9 @@ class MulOp : public framework::OperatorWithKernel {
int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");
VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims
<< " x_num_col_dims=" << x_num_col_dims
<< " y_num_col_dims=" << y_num_col_dims;
VLOG(30) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims
<< " x_num_col_dims=" << x_num_col_dims
<< " y_num_col_dims=" << y_num_col_dims;
PADDLE_ENFORCE_GT(
x_dims.size(), x_num_col_dims,
......@@ -126,6 +126,14 @@ or not. But the output only shares the LoD information with input $X$.
}
};
class MulOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
class MulGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -178,7 +186,8 @@ class MulOpGradMaker : public framework::SingleGradOpDescMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGradMaker);
REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpInferVarType,
ops::MulOpGradMaker);
REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -63,16 +63,16 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
// device id
int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id);
VLOG(3) << "gpu : "
<< " invoke allreduce. send " << x->numel() << " recv "
<< out->numel();
VLOG(30) << "gpu : "
<< " invoke allreduce. send " << x->numel() << " recv "
<< out->numel();
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
ctx.cuda_device_context().stream()));
VLOG(3) << "gpu : "
<< " finished allreduce. send " << x->numel() << " recv "
<< out->numel();
VLOG(30) << "gpu : "
<< " finished allreduce. send " << x->numel() << " recv "
<< out->numel();
}
};
......@@ -109,14 +109,14 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
} else {
out->Resize(framework::make_ddim({0}));
}
VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
<< " recv " << out->numel();
VLOG(30) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
<< " recv " << out->numel();
PADDLE_ENFORCE(platform::dynload::ncclReduce(
x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
reduction_op_, root, comm->comms().at(idx),
ctx.cuda_device_context().stream()));
VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
<< " recv " << out->numel();
VLOG(30) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
<< " recv " << out->numel();
}
};
......@@ -133,21 +133,22 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
int idx = comm->GetCommId(gpu_id);
if (idx == root) {
auto* x = ctx.Input<LoDTensor>("X");
VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
PADDLE_ENFORCE(platform::dynload::ncclBcast(
reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
ctx.cuda_device_context().stream()));
VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
VLOG(30) << "gpu : " << gpu_id << " finished Bcast.";
} else {
auto* out = ctx.Output<LoDTensor>("Out");
VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
<< framework::product(out->dims());
VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
<< framework::product(out->dims());
PADDLE_ENFORCE(platform::dynload::ncclBcast(
out->mutable_data<T>(ctx.GetPlace()), out->numel(),
NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
ctx.cuda_device_context().stream()));
VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel();
VLOG(30) << "gpu : " << gpu_id << " finished Bcast. recv "
<< out->numel();
}
}
};
......
......@@ -86,9 +86,9 @@ class NCCLTester : public ::testing::Test {
(*p_scopes).resize(gpu_list_.size());
auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp.";
VLOG(10) << "invoke NCCLInitOp.";
op->Run(g_scope_, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
VLOG(10) << "NCCLInitOp finished.";
}
int GetGPUData(int gpu_id) { return gpu_id + 42; }
......@@ -109,7 +109,7 @@ class NCCLTester : public ::testing::Test {
std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
VLOG(10) << "Send Tensor filled with elements " << send_tensor->numel();
}
lk.unlock();
......@@ -119,11 +119,11 @@ class NCCLTester : public ::testing::Test {
auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
VLOG(1) << " send_tensor : " << send_tensor->numel()
<< " recv_tensor : " << recv_tensor->numel();
VLOG(10) << "Device : " << gpu_id << " invoke " << op_desc.Type();
VLOG(10) << " send_tensor : " << send_tensor->numel()
<< " recv_tensor : " << recv_tensor->numel();
op->Run(*scope, place);
VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
VLOG(10) << "Device : " << gpu_id << " finished " << op_desc.Type();
}
public:
......
......@@ -48,7 +48,7 @@ static void SplitTensorAndMoveTensorToScopes(
auto lod_tensors = tensor.SplitLoDTensor(places);
for (auto &lod : lod_tensors) {
VLOG(3) << lod.dims();
VLOG(30) << lod.dims();
}
if (num_sub_scopes == 0) {
num_sub_scopes = lod_tensors.size();
......@@ -263,7 +263,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
if (s == framework::kEmptyVarName) {
continue;
}
VLOG(3) << "Moving " << s;
VLOG(30) << "Moving " << s;
CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
}
WaitOnPlaces(places);
......@@ -277,7 +277,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
if (s == framework::kEmptyVarName) {
continue;
}
VLOG(3) << "Accumulating " << s;
VLOG(30) << "Accumulating " << s;
if (s == framework::kEmptyVarName) continue;
std::string tmp_name;
auto *tmp = sub_scopes[0]->Var(&tmp_name);
......@@ -289,7 +289,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
framework::AttributeMap{{"use_mkldnn", {false}}});
VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
VLOG(100) << sum_op->DebugStringEx(sub_scopes[0]);
sum_op->Run(*sub_scopes[0], places[0]);
WaitOnPlace(places[0]);
}
......@@ -316,7 +316,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
auto *grad = new framework::OpDesc();
grad->SetType("parallel_do_grad");
for (auto &input_param : this->InputNames()) {
VLOG(3) << input_param;
VLOG(30) << input_param;
grad->SetInput(input_param, this->Input(input_param));
if (input_param != kPlaces) {
grad->SetOutput(framework::GradVarName(input_param),
......
......@@ -40,7 +40,7 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
return output_size;
}
void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Out(Output) of Pooling should not be null.");
......@@ -81,7 +81,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
}
framework::OpKernelType PoolOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
......@@ -104,7 +104,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
layout_, library_);
}
void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Input(X@GRAD) should not be null.");
......@@ -112,7 +112,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
}
framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
......@@ -262,6 +262,14 @@ Example:
)DOC");
}
class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
void Pool3dOpMaker::Make() {
AddInput("X",
"(Tensor) The input tensor of pooling operator. "
......@@ -372,6 +380,7 @@ Example:
namespace ops = paddle::operators;
REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
ops::PoolOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad);
......@@ -383,6 +392,7 @@ REGISTER_OP_CPU_KERNEL(
ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
ops::PoolOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
......
......@@ -48,12 +48,12 @@ class PrefetchOp : public framework::OperatorBase {
std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
<< outs[i] << " back";
VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get "
<< outs[i] << " back";
rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope,
ins[i], outs[i]));
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
VLOG(30) << "don't send no-initialied variable: " << ins[i];
}
}
for (size_t i = 0; i < rets.size(); i++) {
......
......@@ -155,8 +155,8 @@ class RandomCropKernel : public framework::OpKernel<T> {
seed = *cpu_seed.data<int64_t>();
}
} else {
VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
"'startup_seed' instead.";
VLOG(50) << "WARNING: The input 'Seed' is not initialized, use attribute "
"'startup_seed' instead.";
seed = ctx.Attr<int>("startup_seed");
}
auto shape = ctx.Attr<std::vector<int>>("shape");
......
......@@ -42,7 +42,7 @@ class BlockingQueue {
std::unique_lock<std::mutex> lock(mutex_);
send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
if (closed_) {
VLOG(5)
VLOG(50)
<< "WARNING: Sending an element to a closed reader::BlokcingQueue.";
return false;
}
......@@ -56,7 +56,7 @@ class BlockingQueue {
std::unique_lock<std::mutex> lock(mutex_);
send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
if (closed_) {
VLOG(5)
VLOG(50)
<< "WARNING: Sending an element to a closed reader::BlokcingQueue.";
return false;
}
......
......@@ -26,7 +26,7 @@ class ShuffleReader : public framework::DecoratedReader {
ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
size_t seed = 0)
: DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
VLOG(10) << "Create shuffle reader of " << reader_;
VLOG(100) << "Create shuffle reader of " << reader_;
if (seed_ == 0) {
std::random_device device;
seed_ = device();
......@@ -37,7 +37,7 @@ class ShuffleReader : public framework::DecoratedReader {
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
out->clear();
if (iteration_pos_ >= buffer_.size()) {
VLOG(10) << "Resetting shuffle buffer";
VLOG(100) << "Resetting shuffle buffer";
ReloadBuffer();
if (buffer_.empty()) {
return;
......@@ -73,7 +73,7 @@ class ShuffleReader : public framework::DecoratedReader {
std::mt19937 g(seed_);
std::shuffle(buffer_.begin(), buffer_.end(), g);
seed_ = g(); // update seed_;
VLOG(10) << "random buffer size = " << buffer_.size();
VLOG(100) << "random buffer size = " << buffer_.size();
}
size_t buffer_size_;
......
......@@ -160,7 +160,7 @@ class RecurrentBase : public framework::OperatorBase {
Callback callback) {
PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
for (size_t i = 0; i < dst_vars.size(); ++i) {
VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i];
AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
}
}
......@@ -176,7 +176,7 @@ class RecurrentBase : public framework::OperatorBase {
Callback callback) {
PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
for (size_t i = 0; i < dst_vars.size(); ++i) {
VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i];
AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
}
}
......@@ -230,7 +230,7 @@ class RecurrentOp : public RecurrentBase {
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
VLOG(3) << "Static RNN input sequence length = " << seq_len;
VLOG(30) << "Static RNN input sequence length = " << seq_len;
StepScopes scopes = CreateStepScopes(scope, seq_len);
auto reverse = Attr<bool>(kReverse);
......@@ -241,7 +241,7 @@ class RecurrentOp : public RecurrentBase {
for (size_t i = 0; i < seq_len; ++i) {
size_t seq_offset = reverse ? seq_len - i - 1 : i;
VLOG(3) << "Recurrent operate at the time step " << seq_offset;
VLOG(30) << "Recurrent operate at the time step " << seq_offset;
auto &cur_scope = scopes.CurScope();
......@@ -334,7 +334,7 @@ class RecurrentGradOp : public RecurrentBase {
for (size_t step_id = 0; step_id < seq_len; ++step_id) {
size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
VLOG(30) << "Recurrent backward operate at the time step " << seq_offset;
auto &cur_scope = scopes.CurScope();
// Link outside::output_grads --> inside::output_grads
// inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
......@@ -348,11 +348,11 @@ class RecurrentGradOp : public RecurrentBase {
});
auto og_set = List2Set(Inputs(kOutputGrads));
if (VLOG_IS_ON(10)) {
if (VLOG_IS_ON(100)) {
std::ostringstream sout;
std::copy(og_set.begin(), og_set.end(),
std::ostream_iterator<std::string>(sout, ","));
VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
VLOG(100) << " RNN output gradients = [" << sout.str() << "]";
}
// Link states
......@@ -374,7 +374,7 @@ class RecurrentGradOp : public RecurrentBase {
auto &ex_tensor =
ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
VLOG(100) << " RNN link " << cur_grad << " from " << ex_grad;
auto *cur_grad_var = cur_scope.Var(cur_grad);
auto cur_grad_tensor =
cur_grad_var->GetMutable<framework::LoDTensor>();
......@@ -382,12 +382,12 @@ class RecurrentGradOp : public RecurrentBase {
}
}
VLOG(5) << "Recurrent memory linking finished ";
VLOG(50) << "Recurrent memory linking finished ";
// Run step block with cur_scope
executor.Run(*program, &cur_scope, block->ID(),
false /*create_local_scope*/);
VLOG(5) << "executor.Run finished ";
VLOG(50) << "executor.Run finished ";
auto local_var_names = LocalVarNames(cur_scope);
......@@ -436,7 +436,7 @@ class RecurrentGradOp : public RecurrentBase {
cur_scope.Rename(new_inside_name, inside_grad_name);
}
}
VLOG(5) << "Accumulate Parameter finished ";
VLOG(50) << "Accumulate Parameter finished ";
// Copy input gradient from inside to outside
// outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
......@@ -455,7 +455,7 @@ class RecurrentGradOp : public RecurrentBase {
auto dst = outside->Slice(seq_offset, seq_offset + 1);
framework::TensorCopy(inside, place, dev_ctx, &dst);
});
VLOG(5) << "Link outside gradient finished ";
VLOG(50) << "Link outside gradient finished ";
if (step_id + 1 == seq_len) { // at_end
// copy initialize states gradient from inside to outside
......@@ -468,7 +468,7 @@ class RecurrentGradOp : public RecurrentBase {
outside->mutable_data(place, inside.type());
framework::TensorCopy(inside, place, dev_ctx, outside);
});
VLOG(5) << "Link initialize state gradient finished ";
VLOG(50) << "Link initialize state gradient finished ";
}
scopes.Next();
}
......
......@@ -47,7 +47,7 @@ class RecvOp : public framework::OperatorBase {
std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
VLOG(30) << "getting " << outs[i] << " from " << epmap[i];
rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]));
}
if (sync_mode) {
......
......@@ -93,7 +93,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
in_grad_var_name);
if (out_grad_var == nullptr) {
VLOG(5) << "Using fill constant 0 as starting gradient";
VLOG(50) << "Using fill constant 0 as starting gradient";
auto in_var_name = Input("X");
auto *in_var = scope.FindVar(in_var_name);
auto &in_var_tensor = in_var->Get<framework::LoDTensor>();
......
......@@ -110,7 +110,7 @@ class SaveOp : public framework::OperatorBase {
lt_var != nullptr,
"Can not find variable kLookupTablePath for SaveSelectedRows");
std::string filename = lt_var->data();
VLOG(4) << "SaveSelectedRows get File name: " << filename;
VLOG(40) << "SaveSelectedRows get File name: " << filename;
MkDirRecursively(DirName(filename).c_str());
......
......@@ -42,12 +42,12 @@ class SendBarrierOp : public framework::OperatorBase {
distributed::RPCClient::GetInstance<RPCCLIENT_T>(
Attr<int>("trainer_id"));
VLOG(3) << "SendBarrierOp sync";
VLOG(30) << "SendBarrierOp sync";
// need to wait before sending send_barrier message
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
VLOG(30) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
......
......@@ -50,10 +50,10 @@ class SendOp : public framework::OperatorBase {
std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
VLOG(30) << "sending " << ins[i] << " to " << epmap[i];
rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]));
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
VLOG(30) << "don't send no-initialied variable: " << ins[i];
}
}
if (sync_send) {
......
......@@ -120,7 +120,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
f::Scope scope;
p::CPUPlace place;
VLOG(4) << "before init tensor";
VLOG(40) << "before init tensor";
if (is_sparse) {
InitSelectedRowsInScope(place, &scope);
} else {
......@@ -146,7 +146,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
attrs.insert({"PrefetchBlock", prefetch_block});
attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
attrs.insert({"sync_mode", true});
VLOG(4) << "before init op";
VLOG(40) << "before init op";
listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
*initialized = true;
......
......@@ -127,7 +127,7 @@ class SequenceMaskKernel : public framework::OpKernel<Tx> {
auto x_numel = x->numel();
if (maxlen < 0) {
#ifdef __NVCC__
VLOG(10)
VLOG(100)
<< "SequenceMaskOp on GPU may be slow when maxlen is not provided.";
maxlen = static_cast<int>(
thrust::reduce(thrust::device_pointer_cast(x_data),
......
......@@ -98,10 +98,10 @@ class SGDOpKernel : public framework::OpKernel<T> {
auto param_row_width = param.value().dims()[1];
auto grad_row_width = grad.value().dims()[1];
VLOG(4) << " param rows: " << param.rows().size()
<< " param memory rows: " << param.value().dims()[0]
<< " grad rows: " << grad.rows().size()
<< " grad memory rows: " << grad.value().dims()[0];
VLOG(40) << " param rows: " << param.rows().size()
<< " param memory rows: " << param.value().dims()[0]
<< " grad rows: " << grad.rows().size()
<< " grad memory rows: " << grad.value().dims()[0];
PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
"param_row should have the same size with grad_row");
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/similarity_focus_op.h"
namespace paddle {
namespace operators {
class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), a 4-D tensor with shape,"
" [BatchSize, X, Y, Z]");
AddOutput("Out",
"(Tensor, default Tensor<float>), the similarity focus mask"
" with the same shape of input X.");
AddAttr<int>("axis",
"(int32), indicating the dimension to be select. It can"
" only be 1, 2, or 3.");
AddAttr<std::vector<int>>("indexes",
"(std::vector<int32>), indicating the indexes"
" of the selected dimension.");
AddComment(R"DOC(
SimilarityFocus Operator.
Generate a similarity focus mask with the same shape of input using the following method:
1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
to the axis according to the indexes. For example, if axis=1 and indexes=[a],
it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
2. For each index, find the largest numbers in the tensor T, so that the same
row and same column has at most one number(what it means is that if the
largest number has been found in the i-th row and the j-th column, then
the numbers in the i-th row or j-th column will be skipped. And then the
next largest number will be selected from the remaining numbers. Obviously
there will be min(B, C) numbers), and mark the corresponding position of the
3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for
each index.
3. Broadcast the 3-D similarity focus mask to the same shape of input X.
Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
)DOC");
}
};
class SimilarityFocusOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 4, "Input(X)'s rank should be 4.");
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(similarity_focus, ops::SimilarityFocusOp,
ops::SimilarityFocusOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(similarity_focus, ops::SimilarityFocusKernel<float>,
ops::SimilarityFocusKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cstring>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class SimilarityFocusKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Tensor* out = context.Output<Tensor>("Out");
const Tensor* x = context.Input<Tensor>("X");
T* out_data = out->mutable_data<T>(context.GetPlace());
const T* x_data = x->data<T>();
int axis = context.Attr<int>("axis");
std::vector<int> indexes = context.Attr<std::vector<int>>("indexes");
int64_t batch_size = x->dims()[0];
int64_t dim[4];
for (int i = 1; i <= 3; ++i) {
dim[i] = x->dims()[i];
}
if (indexes.size() < 1) {
PADDLE_THROW("Indexes' size can not be 0.");
}
for (auto index : indexes) {
if (dim[axis] < index) {
PADDLE_THROW("Index exceeds tensor shape limit.");
}
}
int64_t array_size = 1;
for (int i = 1; i <= 3; ++i) {
if (i != axis) {
array_size *= dim[i];
}
}
std::vector<std::pair<T, int64_t>> array(array_size);
bool (*cmp)(std::pair<T, int64_t>, std::pair<T, int64_t>) = [](
std::pair<T, int64_t> x, std::pair<T, int64_t> y) {
return x.first > y.first;
};
int64_t (*compute_index)(int64_t*, int, int, int, int) = [](
int64_t* dim, int d1, int d2, int d3, int d4) {
return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] +
d3 * dim[3] + d4;
};
memset(out_data, 0, sizeof(T) * batch_size * dim[1] * dim[2] * dim[3]);
for (int i = 0; i < batch_size; ++i) {
for (auto index : indexes) {
if (axis == 1) {
for (int j = 0; j < dim[2]; ++j) {
for (int k = 0; k < dim[3]; ++k) {
array[j * dim[3] + k] = std::make_pair(
x_data[compute_index(dim, i, index, j, k)], j * dim[3] + k);
}
}
std::sort(array.begin(), array.end(), cmp);
int tag_num = 0;
std::vector<bool> tag2(dim[2]), tag3(dim[3]);
for (auto x : array) {
int idx2 = x.second / dim[3];
int idx3 = x.second % dim[3];
if (tag2[idx2] || tag3[idx3]) {
continue;
}
tag_num++;
tag2[idx2] = true;
tag3[idx3] = true;
for (int j = 0; j < dim[1]; ++j) {
out_data[compute_index(dim, i, j, idx2, idx3)] = 1;
}
if (tag_num == std::min(dim[2], dim[3])) {
break;
}
}
} else if (axis == 2) {
for (int j = 0; j < dim[1]; ++j) {
for (int k = 0; k < dim[3]; ++k) {
array[j * dim[3] + k] = std::make_pair(
x_data[compute_index(dim, i, j, index, k)], j * dim[3] + k);
}
}
std::sort(array.begin(), array.end(), cmp);
int tag_num = 0;
std::vector<bool> tag1(dim[1]), tag3(dim[3]);
for (auto x : array) {
int idx1 = x.second / dim[3];
int idx3 = x.second % dim[3];
if (tag1[idx1] || tag3[idx3]) {
continue;
}
tag_num++;
tag1[idx1] = true;
tag3[idx3] = true;
for (int j = 0; j < dim[2]; ++j) {
out_data[compute_index(dim, i, idx1, j, idx3)] = 1;
}
if (tag_num == std::min(dim[1], dim[3])) {
break;
}
}
} else if (axis == 3) {
for (int j = 0; j < dim[1]; ++j) {
for (int k = 0; k < dim[2]; ++k) {
array[j * dim[2] + k] = std::make_pair(
x_data[compute_index(dim, i, j, k, index)], j * dim[2] + k);
}
}
std::sort(array.begin(), array.end(), cmp);
int tag_num = 0;
std::vector<bool> tag1(dim[1]), tag2(dim[2]);
for (auto x : array) {
int idx1 = x.second / dim[2];
int idx2 = x.second % dim[2];
if (tag1[idx1] || tag2[idx2]) {
continue;
}
tag_num++;
tag1[idx1] = true;
tag2[idx2] = true;
for (int j = 0; j < dim[3]; ++j) {
out_data[compute_index(dim, i, idx1, idx2, j)] = 1;
}
if (tag_num == std::min(dim[1], dim[2])) {
break;
}
}
} else {
PADDLE_THROW("Axis must be 1 or 2 or 3");
}
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -124,6 +124,14 @@ For each row $i$ and each column $j$ in the matrix, we have:
}
};
class SoftmaxOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
class SoftmaxOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -196,7 +204,7 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
ops::SoftmaxOpGradMaker);
ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker);
REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
REGISTER_OP_CPU_KERNEL(
softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -32,7 +32,7 @@ class SplitByrefOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < outs.size(); ++i) {
// NOTE: no need to call mutable_data here to allocate memory.
auto* out = outs[i];
VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0];
VLOG(30) << "spliting by ref: " << row_offset << " " << out->dims()[0];
*out = in->Slice(row_offset, row_offset + out->dims()[0]);
row_offset += out->dims()[0];
}
......
......@@ -44,7 +44,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < ids_tensors.size(); ++i) {
batch_size += ids_tensors[i]->dims()[0];
}
VLOG(4) << "Get Total BatchSize is: " << batch_size;
VLOG(40) << "Get Total BatchSize is: " << batch_size;
std::vector<T> all_ids(batch_size);
int offset = 0;
......
......@@ -186,7 +186,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
if (in_dim.empty()) {
VLOG(3) << "WARNING: all the inputs are empty";
VLOG(30) << "WARNING: all the inputs are empty";
in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
} else {
in_dim[0] = static_cast<int64_t>(first_dim);
......
......@@ -45,7 +45,7 @@ class SumOp : public framework::OperatorWithKernel {
size_t N = x_dims.size();
PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
if (N == 1) {
VLOG(3) << "Warning: sum have only one input, may waste memory";
VLOG(30) << "Warning: sum have only one input, may waste memory";
}
framework::DDim in_dim({0});
......@@ -157,8 +157,8 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
auto& inputs = op_desc.Input("X");
auto var_type = framework::proto::VarType::SELECTED_ROWS;
for (auto& name : op_desc.Input("X")) {
VLOG(10) << name << " "
<< block->FindRecursiveOrCreateVar(name).GetType();
VLOG(100) << name << " "
<< block->FindRecursiveOrCreateVar(name).GetType();
}
bool any_input_is_lod_tensor = std::any_of(
......
......@@ -34,8 +34,8 @@ class WriteToArrayOp : public ArrayOp {
auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
if (offset >= out->size()) {
VLOG(10) << "Resize " << Output("Out") << " from " << out->size()
<< " to " << offset + 1;
VLOG(100) << "Resize " << Output("Out") << " from " << out->size()
<< " to " << offset + 1;
out->resize(offset + 1);
}
auto *out_tensor = &out->at(offset);
......@@ -47,9 +47,9 @@ class WriteToArrayOp : public ArrayOp {
TensorCopy(x_tensor, place, dev_ctx, out_tensor);
} else {
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
<< offset << "].";
VLOG(100) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
<< offset << "].";
}
}
};
......@@ -104,7 +104,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
framework::BlockDesc *block) const override {
auto x_name = op_desc.Input("X")[0];
auto out_name = op_desc.Output("Out")[0];
VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
VLOG(100) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
auto &out = block->FindRecursiveOrCreateVar(out_name);
out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
auto *x = block->FindVarRecursive(x_name);
......@@ -139,7 +139,7 @@ class ReadFromArrayOp : public ArrayOp {
framework::TensorCopy(x_array[offset], place, dev_ctx, out_tensor);
out_tensor->set_lod(x_array[offset].lod());
} else {
VLOG(10) << "offset " << offset << " >= " << x_array.size();
VLOG(100) << "offset " << offset << " >= " << x_array.size();
}
}
};
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace operators {
using framework::Tensor;
void LodTensorArray2LodTensorVector(const framework::Scope &scope,
const std::string &base_name,
const std::string &lod_tensor_array_name,
std::vector<std::string> *res_names) {
auto &inx =
scope.FindVar(lod_tensor_array_name)->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < inx.size(); i++) {
std::string var_name = base_name + std::to_string(i);
framework::Variable *g_feed_value =
const_cast<framework::Scope &>(scope).Var(var_name);
auto &feed_input =
*(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
feed_input.ShareDataWith(inx[i]);
res_names->push_back(var_name);
}
}
void LodTensorVectorResizeFromLodTensorArray(
const framework::Scope &scope, const std::string &base_name,
const std::string &lod_tensor_array_name,
std::vector<std::string> *res_names) {
auto &inx =
scope.FindVar(lod_tensor_array_name)->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < inx.size(); i++) {
std::string var_name = base_name + std::to_string(i);
framework::Variable *g_feed_value =
const_cast<framework::Scope &>(scope).Var(var_name);
auto &feed_input =
*(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
auto dims = inx[i].dims();
feed_input.Resize(dims);
res_names->push_back(var_name);
}
}
void LodTensorArrayCreateFromLodTensorArray(
const framework::Scope &scope,
const std::string &input_lod_tensor_array_name,
const std::string &output_lod_tensor_array_name) {
auto &inx = scope.FindVar(input_lod_tensor_array_name)
->Get<framework::LoDTensorArray>();
auto &grad_inx = *scope.FindVar(output_lod_tensor_array_name)
->GetMutable<framework::LoDTensorArray>();
for (size_t i = 0; i < inx.size(); i++) {
std::string var_name = output_lod_tensor_array_name + std::to_string(i);
framework::Variable *g_feed_value =
const_cast<framework::Scope &>(scope).Var(var_name);
auto &feed_input =
*(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
grad_inx.push_back(feed_input);
}
}
class LoDTensorArray2TensorOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto axis = Attr<int>("axis");
framework::AttributeMap attrs;
attrs["axis"] = axis;
auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto &out_inx =
*scope.FindVar(Output("OutIndex"))->GetMutable<framework::LoDTensor>();
const size_t n = inx.size();
PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0.");
std::string base_name = Inputs("X")[0];
std::vector<std::string> names;
// get the input tensorarray items' dim in out_inx
auto out_inx_dim = out_inx.dims();
out_inx_dim[0] = inx.size();
out_inx.Resize(out_inx_dim);
std::string var_name = "out_index";
framework::Variable *tmp_index_var =
const_cast<framework::Scope &>(scope).Var(var_name);
auto &tmp_index_tensor =
*(tmp_index_var->GetMutable<paddle::framework::LoDTensor>());
tmp_index_tensor.Resize(out_inx_dim);
int *tmp_index_data =
tmp_index_tensor.mutable_data<int>(platform::CPUPlace());
auto out_dims = inx[0].dims();
size_t out_dim_sum = 0;
for (size_t index = 0; index < inx.size(); index++) {
auto inx_dims = inx[index].dims();
out_dim_sum += inx_dims[axis];
tmp_index_data[index] = inx_dims[axis];
}
out_inx.ShareDataWith(tmp_index_tensor);
// get input array items' dims
out_dims[axis] = out_dim_sum;
out.Resize(out_dims);
LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
// Invoke Reshape Op
auto concat_op = framework::OpRegistry::CreateOp(
"concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);
concat_op->Run(scope, place);
}
};
class LoDTensorArray2TensorOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input LoDTensorArray of tensor_array_to_tensor operator.");
AddOutput("Out", "Output tensor of tensor_array_to_tensor operator.");
AddOutput("OutIndex",
"Output input LoDTensorArray items' dims of "
"tensor_array_to_tensor operator.");
AddAttr<int>("axis",
"The axis along which the input tensors will be concatenated.")
.SetDefault(0);
AddComment(R"DOC(
tensor_array_to_tensor Operator.
Concatenate the input LoDTensorArray along dimension axis to the output Tensor.
Examples:
Input = {[1,2], [3,4], [5,6]}
axis = 0
Output = [[1,2],
[3,4],
[5,6]]
OutputIndex = [1,1,1]
)DOC");
}
};
class LoDTensorArray2TensorOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {}
};
class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {}
};
class LoDTensorArray2TensorGradInferVarType
: public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) {
block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
}
}
};
class LoDTensorArray2TensorGradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto axis = Attr<int>("axis");
framework::AttributeMap attrs;
attrs["axis"] = axis;
auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
const size_t n = inx.size();
PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0.");
std::string base_name = Inputs("X")[0];
std::vector<std::string> names;
LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
// grad
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
std::vector<std::string> grad_names;
LodTensorVectorResizeFromLodTensorArray(scope, "grad_name", Input("X"),
&grad_names);
auto concat_grad_op = framework::OpRegistry::CreateOp(
"concat_grad", {{"X", names}, {"Out@GRAD", {dout_name}}},
{{"X@GRAD", grad_names}}, attrs);
concat_grad_op->Run(scope, place);
LodTensorArrayCreateFromLodTensorArray(scope, Input("X"), dx_name);
auto &grad_inx =
*scope.FindVar(dx_name)->GetMutable<framework::LoDTensorArray>();
for (size_t i = 0; i < grad_names.size(); i++) {
std::string var_name = grad_names[i];
auto &feed_input = scope.FindVar(var_name)->Get<framework::LoDTensor>();
grad_inx[i].ShareDataWith(feed_input);
}
}
};
} // namespace operators
} // namespace paddle
USE_OP(concat);
namespace ops = paddle::operators;
REGISTER_OPERATOR(tensor_array_to_tensor, ops::LoDTensorArray2TensorOp,
ops::LoDTensorArray2TensorOpMaker,
ops::LoDTensorArray2TensorOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(tensor_array_to_tensor_grad, ops::LoDTensorArray2TensorGradOp,
ops::LoDTensorArray2TensorGradInferShape,
ops::LoDTensorArray2TensorGradInferVarType);
......@@ -34,7 +34,7 @@ namespace operators {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace {
namespace { // NOLINT
TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
......@@ -60,7 +60,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
return nvinfer1::DimsCHW(shape[1], 1, 1);
}
} // namespace
} // namespace // NOLINT
using inference::Singleton;
using inference::tensorrt::TRT_EngineManager;
......@@ -127,9 +127,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Convert output tensor from engine to fluid
int output_index = 0;
VLOG(4) << "TensorRT Engine Op Outputs:";
VLOG(40) << "TensorRT Engine Op Outputs:";
for (const auto& y : context.Outputs("Ys")) {
VLOG(4) << y;
VLOG(40) << y;
// convert output and copy to fluid.
nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
auto dims = trt_t->getDimensions();
......@@ -167,7 +167,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
protected:
void Prepare(const framework::ExecutionContext& context) const {
VLOG(4) << "Prepare engine";
VLOG(40) << "Prepare engine";
// Get the ProgramDesc and pass to convert.
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
......@@ -192,12 +192,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
engine->InitNetwork();
framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
VLOG(4) << "parsed var size " << block.AllVars().size();
VLOG(40) << "parsed var size " << block.AllVars().size();
// Add inputs
VLOG(4) << "declare inputs";
VLOG(40) << "declare inputs";
for (auto& input : context.Inputs("Xs")) {
if (parameters.count(input)) continue;
VLOG(4) << "declare input " << input;
VLOG(40) << "declare input " << input;
auto* var = block.FindVar(input);
// TensorRT engine need to create parameters. The parameter's description
// should be set in
......
......@@ -129,15 +129,15 @@ class WhileGradOp : public framework::OperatorBase {
for (auto cur_scope_iter = step_scopes->rbegin();
cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
VLOG(3) << "Start backward at time_step "
<< cur_scope_iter - step_scopes->rbegin();
VLOG(30) << "Start backward at time_step "
<< cur_scope_iter - step_scopes->rbegin();
framework::Scope &cur_scope = **cur_scope_iter;
// Link OG from outside to inside
for (size_t i = 0; i < outside_og_names.size(); ++i) {
auto outside_og_name = outside_og_names[i];
auto inside_og_name = inside_og_names[i];
VLOG(8) << "Linking outside " << outside_og_name << " --> inside "
<< inside_og_name;
VLOG(80) << "Linking outside " << outside_og_name << " --> inside "
<< inside_og_name;
if (scope.FindVar(outside_og_name) == nullptr) {
continue;
}
......@@ -159,11 +159,11 @@ class WhileGradOp : public framework::OperatorBase {
auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
auto &inside_array =
detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
VLOG(8) << outside_og_name << " size = " << outside_array.size();
VLOG(80) << outside_og_name << " size = " << outside_array.size();
inside_array.resize(outside_array.size());
for (size_t j = 0; j < inside_array.size(); ++j) {
VLOG(8) << j << " " << outside_array[j].numel();
VLOG(80) << j << " " << outside_array[j].numel();
if (outside_array[j].numel() != 0) {
inside_array[j].set_lod(outside_array[j].lod());
inside_array[j].ShareDataWith(outside_array[j]);
......@@ -289,7 +289,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
for (auto &each_ig : igs) {
if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) {
VLOG(8) << "Ignore " << each_ig;
VLOG(80) << "Ignore " << each_ig;
each_ig = framework::kEmptyVarName;
}
}
......@@ -353,8 +353,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
auto *g_var = block->FindVarRecursive(pg_ig_names[i]);
if (g_var != nullptr) { // Gradient could be @EMPTY@
VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
<< " type: " << p_var.GetType();
VLOG(50) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
<< " type: " << p_var.GetType();
g_var->SetType(p_var.GetType());
g_var->SetDataType(p_var.GetDataType());
}
......
......@@ -207,7 +207,10 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
<< "." << (driver_version_ % 100) / 10
<< ", Runtime Version: " << runtime_version_ / 1000
<< "." << (runtime_version_ % 100) / 10;
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 100) / 10 << ".";
callback_manager_.reset(new StreamCallbackManager(stream_));
}
......
......@@ -203,7 +203,7 @@ class DeviceTracerImpl : public DeviceTracer {
void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
VLOG(10) << "Empty timeline annotation.";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
......@@ -216,7 +216,7 @@ class DeviceTracerImpl : public DeviceTracer {
uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for the kernel.
if (start_ns == 0 || end_ns == 0) {
VLOG(3) << name << " cannot be traced";
VLOG(30) << name << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
......@@ -228,7 +228,7 @@ class DeviceTracerImpl : public DeviceTracer {
int64_t stream_id, uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) {
VLOG(3) << correlation_id << " cannot be traced";
VLOG(30) << correlation_id << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
......@@ -347,7 +347,7 @@ class DeviceTracerImpl : public DeviceTracer {
tracer->AddAnnotation(cbInfo->correlationId, anno);
}
} else {
VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
VLOG(10) << "Unhandled API Callback for " << domain << " " << cbid;
}
}
CUpti_SubscriberHandle subscriber_;
......
......@@ -65,51 +65,54 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
* include all needed cudnn functions in HPPL
* different cudnn version has different interfaces
**/
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor); \
__macro(cudnnSetTensor4dDescriptorEx); \
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
__macro(cudnnSetFilter4dDescriptor); \
__macro(cudnnSetFilterNdDescriptor); \
__macro(cudnnGetFilterNdDescriptor); \
__macro(cudnnSetPooling2dDescriptor); \
__macro(cudnnSetPoolingNdDescriptor); \
__macro(cudnnGetPoolingNdDescriptor); \
__macro(cudnnDestroyFilterDescriptor); \
__macro(cudnnCreateConvolutionDescriptor); \
__macro(cudnnCreatePoolingDescriptor); \
__macro(cudnnDestroyPoolingDescriptor); \
__macro(cudnnSetConvolution2dDescriptor); \
__macro(cudnnDestroyConvolutionDescriptor); \
__macro(cudnnSetConvolutionNdDescriptor); \
__macro(cudnnGetConvolutionNdDescriptor); \
__macro(cudnnDeriveBNTensorDescriptor); \
__macro(cudnnCreateSpatialTransformerDescriptor); \
__macro(cudnnSetSpatialTransformerNdDescriptor); \
__macro(cudnnDestroySpatialTransformerDescriptor); \
__macro(cudnnSpatialTfGridGeneratorForward); \
__macro(cudnnSpatialTfGridGeneratorBackward); \
__macro(cudnnSpatialTfSamplerForward); \
__macro(cudnnSpatialTfSamplerBackward); \
__macro(cudnnCreate); \
__macro(cudnnDestroy); \
__macro(cudnnSetStream); \
__macro(cudnnActivationForward); \
__macro(cudnnConvolutionForward); \
__macro(cudnnConvolutionBackwardBias); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \
__macro(cudnnTransformTensor); \
__macro(cudnnPoolingForward); \
__macro(cudnnPoolingBackward); \
__macro(cudnnSoftmaxBackward); \
__macro(cudnnSoftmaxForward); \
__macro(cudnnGetVersion); \
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor); \
__macro(cudnnSetTensor4dDescriptorEx); \
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
__macro(cudnnSetFilter4dDescriptor); \
__macro(cudnnSetFilterNdDescriptor); \
__macro(cudnnGetFilterNdDescriptor); \
__macro(cudnnSetPooling2dDescriptor); \
__macro(cudnnSetPoolingNdDescriptor); \
__macro(cudnnGetPoolingNdDescriptor); \
__macro(cudnnDestroyFilterDescriptor); \
__macro(cudnnCreateConvolutionDescriptor); \
__macro(cudnnCreatePoolingDescriptor); \
__macro(cudnnDestroyPoolingDescriptor); \
__macro(cudnnSetConvolution2dDescriptor); \
__macro(cudnnDestroyConvolutionDescriptor); \
__macro(cudnnSetConvolutionNdDescriptor); \
__macro(cudnnGetConvolutionNdDescriptor); \
__macro(cudnnDeriveBNTensorDescriptor); \
__macro(cudnnCreateSpatialTransformerDescriptor); \
__macro(cudnnSetSpatialTransformerNdDescriptor); \
__macro(cudnnDestroySpatialTransformerDescriptor); \
__macro(cudnnSpatialTfGridGeneratorForward); \
__macro(cudnnSpatialTfGridGeneratorBackward); \
__macro(cudnnSpatialTfSamplerForward); \
__macro(cudnnSpatialTfSamplerBackward); \
__macro(cudnnCreate); \
__macro(cudnnDestroy); \
__macro(cudnnSetStream); \
__macro(cudnnActivationForward); \
__macro(cudnnConvolutionForward); \
__macro(cudnnConvolutionBackwardBias); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \
__macro(cudnnTransformTensor); \
__macro(cudnnPoolingForward); \
__macro(cudnnPoolingBackward); \
__macro(cudnnSoftmaxBackward); \
__macro(cudnnSoftmaxForward); \
__macro(cudnnGetVersion); \
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
......
......@@ -72,8 +72,8 @@ static inline std::string join(const std::string& part1,
static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
int dynload_flags) {
VLOG(3) << "Try to find library: " << dso_path
<< " from default system path.";
VLOG(30) << "Try to find library: " << dso_path
<< " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
// and /usr/local/lib path
void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
......
......@@ -124,8 +124,8 @@ size_t GpuMaxChunkSize() {
size_t available = 0;
GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
VLOG(100) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
size_t reserving = static_cast<size_t>(0.05 * total);
// If available less than minimum chunk size, no usable memory exists.
available =
......
......@@ -48,7 +48,7 @@ void InitGflags(std::vector<std::string> argv) {
line += ' ';
}
google::ParseCommandLineFlags(&argc, &arr, true);
VLOG(1) << "Init commandline: " << line;
VLOG(10) << "Init commandline: " << line;
});
}
......
......@@ -112,7 +112,7 @@ struct NCCLContextMap {
NCCLGroupGuard gurad;
for (auto &gpu_id : order_) {
int rank = trainer_id * order_.size() + gpu_id;
VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks;
VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks;
PADDLE_ENFORCE(cudaSetDevice(gpu_id));
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
comms.get() + gpu_id, nranks, *nccl_id, rank));
......
......@@ -61,9 +61,9 @@ struct variant_caster<V<Ts...>> {
if (std::is_same<T, std::vector<float>>::value) {
auto caster_ints = make_caster<std::vector<int64_t>>();
if (caster_ints.load(src, convert)) {
VLOG(4) << "This value are floats and int64_ts satisfy "
"simultaneously, will set it's type to "
"std::vector<int64_t>";
VLOG(40) << "This value are floats and int64_ts satisfy "
"simultaneously, will set it's type to "
"std::vector<int64_t>";
value = cast_op<std::vector<int64_t>>(caster_ints);
return true;
}
......
......@@ -40,7 +40,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::unique_ptr<paddle::framework::ProgramDesc> Load(
paddle::framework::Executor* executor, const std::string& model_filename) {
VLOG(3) << "loading model from " << model_filename;
VLOG(30) << "loading model from " << model_filename;
std::string program_desc_str;
ReadBinaryFile(model_filename, &program_desc_str);
......
......@@ -614,7 +614,24 @@ EOF
CMD='"true"'
fi
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
if [ "$1" == "cp35-cp35m" ]; then
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update && ${NCCL_DEPS}
RUN apt-get install -y wget python3 python3-pip libgtk2.0-dev dmidecode python3-tk && \
pip3 install opencv-python && pip3 install /*.whl; apt-get install -f -y && \
apt-get clean -y && \
rm -f /*.whl && \
${PADDLE_VERSION} && \
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_CUBLAS_DSO}
${DOCKERFILE_GPU_ENV}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
else
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update && ${NCCL_DEPS}
......@@ -629,6 +646,8 @@ EOF
${DOCKERFILE_GPU_ENV}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
fi
if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
ADD go/cmd/pserver/pserver /usr/bin/
......@@ -703,7 +722,7 @@ function main() {
build)
cmake_gen ${PYTHON_ABI:-""}
build
gen_dockerfile
gen_dockerfile ${PYTHON_ABI:-""}
;;
build_android)
build_android
......@@ -730,7 +749,7 @@ function main() {
gen_html
;;
dockerfile)
gen_dockerfile
gen_dockerfile ${PYTHON_ABI:-""}
;;
capi)
cmake_gen ${PYTHON_ABI:-""}
......
......@@ -118,7 +118,7 @@ void generateSequenceStartPositions(size_t batchSize,
}
buf[i] = pos;
pos += len;
VLOG(1) << " len=" << len;
VLOG(10) << " len=" << len;
}
buf[numSeqs] = batchSize;
}
......
......@@ -34,6 +34,7 @@ from . import regularizer
from . import average
from . import metrics
from . import transpiler
from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
......@@ -126,7 +127,8 @@ def __bootstrap__():
if core.is_compiled_with_cuda():
read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'conv_workspace_size_limit', 'cudnn_exhaustive_search'
]
core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
LOOKUP_TABLE_TYPE = "lookup_table"
def find_distributed_lookup_table(program):
"""
Find distribute lookup table in program.
We only support one distribute table now.
:param program:
:return: table_name or None
"""
table_name = None
for op in program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE:
if op.attr('is_distributed') is True:
if table_name is None:
table_name = op.input("W")[0]
if table_name != op.input("W")[0]:
raise RuntimeError("all distributed lookup_table_ops"
" should have only one table")
else:
if table_name is not None:
assert op.input("W")[0] != table_name
return table_name
......@@ -27,6 +27,7 @@ from .tensor import concat
from . import utils
from .. import unique_name
from functools import reduce
from .. import core
__all__ = [
'fc',
......@@ -101,6 +102,7 @@ __all__ = [
'image_resize',
'image_resize_short',
'resize_bilinear',
'resize_nearest',
'gather',
'scatter',
'sequence_scatter',
......@@ -158,6 +160,7 @@ __all__ = [
'affine_grid',
'sequence_reverse',
'affine_channel',
'similarity_focus',
'hash',
'grid_sampler',
'log_loss',
......@@ -1665,6 +1668,20 @@ def conv2d(input,
pre_bias = helper.create_variable_for_type_inference(dtype)
if use_cudnn:
helper.create_variable(
name="kCUDNNFwdAlgoCache",
persistable=True,
type=core.VarDesc.VarType.RAW)
helper.create_variable(
name="kCUDNNBwdDataAlgoCache",
persistable=True,
type=core.VarDesc.VarType.RAW)
helper.create_variable(
name="kCUDNNBwdFilterAlgoCache",
persistable=True,
type=core.VarDesc.VarType.RAW)
helper.append_op(
type=l_type,
inputs={
......@@ -1678,7 +1695,7 @@ def conv2d(input,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': False
'use_mkldnn': False,
})
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
......@@ -5640,7 +5657,8 @@ def image_resize(input,
out_shape=None,
scale=None,
name=None,
resample='BILINEAR'):
resample='BILINEAR',
actual_shape=None):
"""
**Resize a Batch of Images**
......@@ -5650,6 +5668,7 @@ def image_resize(input,
Supporting resample methods:
'BILINEAR' : Bilinear interpolation
'NEAREST' : Nearest neighbor interpolation
Args:
input (Variable): The input tensor of image resize layer,
......@@ -5664,25 +5683,51 @@ def image_resize(input,
Default: None
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
resample(str): The resample method. It can only be 'BILINEAR' currently.
resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST'
currently.
Default: 'BILINEAR'
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
according to this given shape rather than
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
actual_shape instead of :attr:`out_shape` if you
want to specify output shape dynamically. When
using actual_shape to specify output shape, one of
:attr:`out_shape` and :attr:`scale` should also be
set, otherwise errors would be occured in graph
constructing stage.
Default: None
Returns:
Variable: The output is a 4-D tensor of the shape
(num_batches, channls, out_h, out_w).
Raises:
TypeError: out_shape should be a list or tuple or Variable.
TypeError: actual_shape should either be Variable or None.
ValueError: The 'resample' of image_resize can only be 'BILINEAR'
or 'NEAREST' currently.
ValueError: One of out_shape and scale must not be None.
ValueError: out_shape length should be 2.
Examples:
.. code-block:: python
out = fluid.layers.image_resize(input, out_shape=[12, 12])
"""
resample_methods = {'BILINEAR': 'bilinear_interp'}
resample_methods = {
'BILINEAR': 'bilinear',
'NEAREST': 'nearest',
}
if resample not in resample_methods:
raise ValueError(
"The 'resample' of image_resize can only be 'BILINEAR' currently.")
"The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
)
if out_shape is None and scale is None:
raise ValueError("One of out_shape and scale must not be None")
helper = LayerHelper('bilinear_interp', **locals())
raise ValueError("One of out_shape and scale must not be None.")
helper = LayerHelper('interpolate', **locals())
dtype = helper.input_dtype()
def _is_list_or_turple_(data):
......@@ -5692,33 +5737,106 @@ def image_resize(input,
out_w = 0
inputs = {"X": input}
if out_shape is not None:
if not (_is_list_or_turple_(out_shape) and
len(out_shape) == 2) and not isinstance(out_shape, Variable):
raise ValueError('out_shape should be a list or tuple or variable')
if _is_list_or_turple_(out_shape):
out_shape = list(map(int, out_shape))
out_h = out_shape[0]
out_w = out_shape[1]
else:
if isinstance(out_shape, Variable):
warnings.warn("out_shape as Variable type is deprecated, \
it is recommended to use actual_shape instead of \
out_shape to specify output shape dynamically.")
inputs['OutSize'] = out_shape
elif not (_is_list_or_turple_(out_shape)):
raise TypeError("out_shape should be a list or tuple or Variable.")
elif len(out_shape) != 2:
raise ValueError("out_shape length should be 2.")
out_shape = list(map(int, out_shape))
out_h = out_shape[0]
out_w = out_shape[1]
else:
out_h = int(input.shape[2] * scale)
out_w = int(input.shape[3] * scale)
if isinstance(actual_shape, Variable):
inputs["OutSize"] = actual_shape
elif actual_shape is not None:
raise TypeError("actual_shape should either be Variable or None.")
out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type=resample_methods[resample],
type='interpolate',
inputs=inputs,
outputs={"Out": out},
attrs={"out_h": out_h,
"out_w": out_w})
attrs={
"out_h": out_h,
"out_w": out_w,
"interp_method": resample_methods[resample]
})
return out
@templatedoc(op_type="bilinear_interp")
def resize_bilinear(input, out_shape=None, scale=None, name=None):
@templatedoc(op_type="interpolate")
def resize_bilinear(input,
out_shape=None,
scale=None,
name=None,
actual_shape=None):
"""
${comment}
Resize input by performing bilinear interpolation based on given
output shape which specified by actual_shape, out_shape and scale
in priority order.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this op) on a rectilinear 2D grid. The key idea is
to perform linear interpolation first in one direction, and then
again in the other direction.
For details of bilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation
Args:
input(${x_type}): ${x_comment}.
out_shape(${out_size_type}): ${out_size_comment}.
scale(float|None): The multiplier for the input height or width. At
least one of out_shape or scale must be set. And out_shape has
a higher priority than scale. Default: None.
name(str|None): The output variable name.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
according to this given shape rather than
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
actual_shape instead of :attr:`out_shape` if you
want to specify output shape dynamically. When
using actual_shape to specify output shape, one of
:attr:`out_shape` and :attr:`scale` should also be
set, otherwise errors would be occured in graph
constructing stage.
Default: None
Returns:
${out_comment}.
"""
return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape)
@templatedoc(op_type="interpolate")
def resize_nearest(input,
out_shape=None,
scale=None,
name=None,
actual_shape=None):
"""
Resize input by performing nearest neighbor interpolation in both the
3rd dimention(in height direction) and the 4th dimention(in width
direction) based on given output shape which specified by actual_shape,
out_shape and scale in priority order.
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
Args:
input(${x_type}): ${x_comment}.
......@@ -5730,12 +5848,25 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
a higher priority than scale. Default: None.
name(str|None): The output variable name.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
according to this given shape rather than
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
actual_shape instead of :attr:`out_shape` if you
want to specify output shape dynamically. When
using actual_shape to specify output shape, one of
:attr:`out_shape` and :attr:`scale` should also be
set, otherwise errors would be occured in graph
constructing stage.
Default: None
Returns:
${out_comment}.
"""
return image_resize(input, out_shape, scale, name, 'BILINEAR')
return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape)
def image_resize_short(input, out_short_len, resample='BILINEAR'):
......@@ -7803,6 +7934,118 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
return out
def similarity_focus(input, axis, indexes, name=None):
"""
SimilarityFocus Operator
Generate a similarity focus mask with the same shape of input using the following method:
1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
to the axis according to the indexes. For example, if axis=1 and indexes=[a],
it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
2. For each index, find the largest numbers in the tensor T, so that the same
row and same column has at most one number(what it means is that if the
largest number has been found in the i-th row and the j-th column, then
the numbers in the i-th row or j-th column will be skipped. And then the
next largest number will be selected from the remaining numbers. Obviously
there will be min(B, C) numbers), and mark the corresponding position of the
3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for
each index.
3. Broadcast the 3-D similarity focus mask to the same shape of input X.
Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
.. code-block:: text
* Example :
Given a 4-D tensor x with the shape (BatchSize, C, A, B), where C is
the number of channels and the shape of feature map is (A, B):
x.shape = (2, 3, 2, 2)
x.data = [[[[0.8, 0.1],
[0.4, 0.5]],
[[0.9, 0.7],
[0.9, 0.9]],
[[0.8, 0.9],
[0.1, 0.2]]],
[[[0.2, 0.5],
[0.3, 0.4]],
[[0.9, 0.7],
[0.8, 0.4]],
[[0.0, 0.2],
[0.4, 0.7]]]]
Given axis: 1 (the axis of the channel)
Given indexes: [0]
then we get a 4-D tensor out with the same shape of input x:
out.shape = (2, 3, 2, 2)
out.data = [[[[1.0, 0.0],
[0.0, 1.0]],
[[1.0, 0.0],
[0.0, 1.0]],
[[1.0, 0.0],
[0.0, 1.0]]],
[[[0.0, 1.0],
[1.0, 0.0]],
[[0.0, 1.0],
[1.0, 0.0]],
[[0.0, 1.0],
[1.0, 0.0]]]]
Args:
input(Variable): The input tensor variable(default float). It should
be a 4-D tensor with shape [BatchSize, A, B, C].
axis(int): Indicating the dimension to be selected. It can only be
1, 2 or 3.
indexes(list): Indicating the indexes of the selected dimension.
Returns:
Variable: A tensor variable with the same shape and same type
as the input.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[2, 3, 2, 2], dtype='float32')
x = fluid.layers.layer_norm(input=data, axis=1, indexes=[0])
"""
helper = LayerHelper('similarity_focus', **locals())
# check attrs
if isinstance(axis, int) is False:
raise TypeError("axis must be int type.")
if isinstance(indexes, list) is False:
raise TypeError("indexes must be list type.")
if axis != 1 and axis != 2 and axis != 3:
raise ValueError("axis must be 1, 2 or 3.")
if len(indexes) == 0:
raise ValueError("indexes can not be empty.")
if name is None:
out = helper.create_variable_for_type_inference(dtype=input.dtype)
else:
out = helper.create_variable(
name=name, dtype=input.dtype, persistable=False)
helper.append_op(
type='similarity_focus',
inputs={'X': input},
outputs={'Out': out},
attrs={"axis": axis,
"indexes": indexes})
return out
def hash(input, hash_size, num_hash=1, name=None):
"""
Hash the input to an integer whose value is less than the given hash size.
......
......@@ -24,10 +24,10 @@ from .layer_function_generator import templatedoc
import numpy
__all__ = [
'create_tensor', 'create_parameter', 'create_global_var', 'cast', 'concat',
'sums', 'assign', 'fill_constant_batch_size_like', 'fill_constant',
'argmin', 'argmax', 'argsort', 'ones', 'zeros', 'reverse', 'has_inf',
'has_nan', 'isfinite'
'create_tensor', 'create_parameter', 'create_global_var', 'cast',
'tensor_array_to_tensor', 'concat', 'sums', 'assign',
'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite'
]
......@@ -193,6 +193,60 @@ def concat(input, axis=0, name=None):
return out
def tensor_array_to_tensor(input, axis=1, name=None):
"""
This function concatenates the input LodTensorArray along the axis mentioned
and returns that as the output.
A simple example as below:
.. code-block:: text
Given:
input.data = {[[0.6, 0.1, 0.3],
[0.5, 0.3, 0.2]],
[[1.3],
[1.8]],
[[2.3, 2.1],
[2.5, 2.4]]}
axis = 1
Then:
output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1],
[0.5, 0.3, 0.2, 1.8, 2.5, 2.4]]
output_index.data = [3, 1, 2]
Args:
input(list): Input LodTensorArray
axis(int): Integer axis along which the tensors will be concatenated
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: Output variable of the concatenation
Variable: The input LodTensorArray items' dims along the axis
Examples:
.. code-block:: python
output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array)
"""
helper = LayerHelper('tensor_array_concat', **locals())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
out_index = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op(
type='tensor_array_concat',
inputs={'X': input},
outputs={'Out': [out],
'OutIndex': [out_index]},
attrs={'axis': axis})
return out, out_index
def sums(input, out=None):
"""
This function performs the sum operation on the input and returns the
......
......@@ -13,21 +13,23 @@
# limitations under the License.
from __future__ import print_function
import re
import sys
from collections import defaultdict
from contextlib import contextmanager
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from . import framework
from . import layers
from . import unique_name
from .backward import append_backward
from .clip import append_gradient_clip_ops, error_clip_callback
from .framework import program_guard
from . import unique_name
from .initializer import Constant
from .layer_helper import LayerHelper
from .regularizer import append_regularization_ops
from .clip import append_gradient_clip_ops, error_clip_callback
from contextlib import contextmanager
from .layers import ops
from .regularizer import append_regularization_ops
__all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
......@@ -85,7 +87,7 @@ class Optimizer(object):
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='float32' if self._dtype == None else self._dtype,
dtype='float32' if self._dtype is None else self._dtype,
persistable=True)
def _global_learning_rate(self, program=None):
......@@ -245,6 +247,50 @@ class Optimizer(object):
end = len(global_block.ops)
return global_block._slice_ops(start, end)
def _process_distribute_lookuptable(self, param_grads, loss,
startup_program):
"""
Because distribute lookup table only support SGD optimizer for now, not support
other optimizer and regularization, so we should find the table parameter out,
and avoid to add regularization and other op for it, and add sgd optimize op
for it independently.
:param param_grads(list((Var, Var))): list of (param, grad) pair.
:param loss: the loss variable.
:param startup_program: the startup program
"""
program = loss.block.program
table_name = find_distributed_lookup_table(program)
table_param = None
table_grad = None
new_param_grads = []
for p, g in param_grads:
if p.name == table_name:
if table_param is not None:
raise RuntimeError(
"multi dist table var found, only support one now!")
table_param = p
table_grad = g
else:
new_param_grads.append((p, g))
sgd_op = None
if table_param is not None:
with program_guard(program, startup_program):
param_and_grad = [table_param, table_grad]
with table_param.block.program._optimized_guard(param_and_grad), \
framework.name_scope("optimizer"):
self._create_global_learning_rate()
# create the optimize op
sgd_op = loss.block.append_op(
type='sgd',
inputs={
"Param": table_param,
"Grad": table_grad,
"LearningRate":
self._create_param_lr(param_and_grad)
},
outputs={"ParamOut": param_and_grad[0]})
return new_param_grads, (table_param, table_grad), sgd_op
def minimize(self,
loss,
startup_program=None,
......@@ -260,6 +306,9 @@ class Optimizer(object):
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads, loss, startup_program)
params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any
......@@ -268,6 +317,9 @@ class Optimizer(object):
optimize_ops = self._create_optimization_pass(params_grads, loss,
startup_program)
if table_optimize_op is not None:
optimize_ops.append(table_optimize_op)
params_grads.append(table_param_and_grad)
return optimize_ops, params_grads
......
......@@ -38,7 +38,7 @@ depth = 8
mix_hidden_lr = 1e-3
IS_SPARSE = True
PASS_NUM = 10
PASS_NUM = 1
BATCH_SIZE = 10
embedding_name = 'emb'
......
......@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
def setUp(self):
self.op_type = "conv2d"
self.use_cudnn = False
self.exhaustive_search = False
self.use_cuda = False
self.use_mkldnn = False
self.data_format = "AnyLayout"
......@@ -98,7 +99,8 @@ class TestConv2dOp(OpTest):
'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'data_format': self.data_format
'data_format': self.data_format,
'exhaustive_search': self.exhaustive_search
}
self.outputs = {'Output': output}
......@@ -361,6 +363,12 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp):
self.op_type = "depthwise_conv2d"
class TestCUDNNExhaustiveSearch(TestConv2dOp):
def init_kernel_type(self):
self.use_cudnn = True
self.exhaustive_search = True
# Please Don't remove the following code.
# Currently, CI use cudnn V5.0 which not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation):
......
......@@ -335,6 +335,12 @@ class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
self.check_output_with_place(place, atol=2e-2)
class TestCUDNNExhaustiveSearch(TestCUDNN):
def init_kernel_type(self):
self.use_cudnn = True
self.exhaustive_search = True
# FIXME(typhoonzero): find a way to determine if
# using cudnn > 6 in python
# class TestWithDilationCUDNN(TestWithDilation):
......
......@@ -567,7 +567,6 @@ class TestDistLookupTable(TestDistLookupTableBase):
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'uniform_random',
'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
'fake_init'
......@@ -639,7 +638,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
# 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer, _ = self.get_trainer(config)
trainer, trainer_startup = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
......@@ -653,6 +652,16 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
'recv', 'concat'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
startup_ops = [
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'uniform_random',
'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
'fake_init'
]
self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
startup_ops)
class TestDistLookupTableSliceSize(TestDistLookupTableBase):
......
......@@ -20,10 +20,44 @@ from op_test import OpTest
import paddle.fluid.core as core
def bilinear_interp_np(input, out_h, out_w, out_size):
def nearest_neighbor_interp_np(X,
out_h,
out_w,
out_size=None,
actual_shape=None):
"""nearest neighbor interpolation implement in shape [N, C, H, W]"""
if out_size is not None:
out_h = out_size[0]
out_w = out_size[1]
if actual_shape is not None:
out_h = actual_shape[0]
out_w = actual_shape[1]
n, c, in_h, in_w = X.shape
ratio_h = ratio_w = 0.0
if out_h > 1:
ratio_h = (in_h - 1.0) / (out_h - 1.0)
if out_w > 1:
ratio_w = (in_w - 1.0) / (out_w - 1.0)
out = np.zeros((n, c, out_h, out_w))
for i in range(out_h):
in_i = int(ratio_h * i + 0.5)
for j in range(out_w):
in_j = int(ratio_w * j + 0.5)
out[:, :, i, j] = X[:, :, in_i, in_j]
return out.astype(X.dtype)
def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
"""bilinear interpolation implement in shape [N, C, H, W]"""
if out_size is not None:
out_h = out_size[0]
out_w = out_size[1]
if actual_shape is not None:
out_h = actual_shape[0]
out_w = actual_shape[1]
batch_size, channel, in_h, in_w = input.shape
if out_h > 1:
ratio_h = (in_h - 1.0) / (out_h - 1.0)
......@@ -53,18 +87,32 @@ def bilinear_interp_np(input, out_h, out_w, out_size):
return out.astype(input.dtype)
class TestBilinearInterpOp(OpTest):
INTERPOLATE_FUNCS = {
'bilinear': bilinear_interp_np,
'nearest': nearest_neighbor_interp_np,
}
class TestInterpolateOp(OpTest):
def setUp(self):
self.out_size = None
self.actual_shape = None
self.init_test_case()
self.op_type = "bilinear_interp"
self.op_type = "interpolate"
input_np = np.random.random(self.input_shape).astype("float32")
output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
self.out_size)
output_np = INTERPOLATE_FUNCS[self.interp_method](
input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
self.inputs = {'X': input_np}
if self.out_size is not None:
self.inputs['OutSize'] = self.out_size
self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
if self.actual_shape is not None:
self.inputs['OutSize'] = self.actual_shape
self.attrs = {
'out_h': self.out_h,
'out_w': self.out_w,
'interp_method': self.interp_method
}
self.outputs = {'Out': output_np}
def test_check_output(self):
......@@ -74,90 +122,209 @@ class TestBilinearInterpOp(OpTest):
self.check_grad(['X'], 'Out', in_place=True)
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [2, 3, 4, 4]
self.out_h = 2
self.out_w = 2
self.out_size = np.array([3, 3]).astype("int32")
class TestCase1(TestBilinearInterpOp):
class TestBilinearInterpCase1(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [4, 1, 7, 8]
self.out_h = 1
self.out_w = 1
class TestCase2(TestBilinearInterpOp):
class TestBilinearInterpCase2(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [3, 3, 9, 6]
self.out_h = 12
self.out_w = 12
class TestCase3(TestBilinearInterpOp):
class TestBilinearInterpCase3(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [1, 1, 128, 64]
self.out_h = 64
self.out_w = 128
class TestCase4(TestBilinearInterpOp):
class TestBilinearInterpCase4(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [4, 1, 7, 8]
self.out_h = 1
self.out_w = 1
self.out_size = np.array([2, 2]).astype("int32")
class TestCase5(TestBilinearInterpOp):
class TestBilinearInterpCase5(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [3, 3, 9, 6]
self.out_h = 12
self.out_w = 12
self.out_size = np.array([11, 11]).astype("int32")
class TestCase6(TestBilinearInterpOp):
class TestBilinearInterpCase6(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [1, 1, 128, 64]
self.out_h = 64
self.out_w = 128
self.out_size = np.array([65, 129]).astype("int32")
class TestBilinearInterpOpUint8(OpTest):
class TestBilinearInterpActualShape(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [3, 2, 32, 16]
self.out_h = 64
self.out_w = 32
self.out_size = np.array([66, 40]).astype("int32")
class TestBilinearInterpBigScale(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [4, 4, 64, 32]
self.out_h = 100
self.out_w = 50
self.out_size = np.array([101, 51]).astype('int32')
class TestInterpolateOpUint8(OpTest):
def setUp(self):
self.out_size = None
self.actual_shape = None
self.init_test_case()
self.op_type = "bilinear_interp"
self.op_type = "interpolate"
input_np = np.random.randint(
low=0, high=256, size=self.input_shape).astype("uint8")
output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
self.out_size)
output_np = INTERPOLATE_FUNCS[self.interp_method](
input_np, self.out_h, self.out_w, self.out_size, self.actual_shape)
self.inputs = {'X': input_np}
if self.out_size is not None:
self.inputs['OutSize'] = self.out_size
self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
self.attrs = {
'out_h': self.out_h,
'out_w': self.out_w,
'interp_method': self.interp_method
}
self.outputs = {'Out': output_np}
def test_check_output(self):
self.check_output_with_place(place=core.CPUPlace(), atol=1)
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [1, 3, 9, 6]
self.out_h = 10
self.out_w = 9
class TestCase1Uint8(TestBilinearInterpOpUint8):
class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [2, 3, 128, 64]
self.out_h = 120
self.out_w = 50
class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [4, 1, 7, 8]
self.out_h = 5
self.out_w = 13
self.out_size = np.array([6, 15]).astype("int32")
class TestNearestNeighborInterpCase1(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [4, 1, 7, 8]
self.out_h = 1
self.out_w = 1
class TestNearestNeighborInterpCase2(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [3, 3, 9, 6]
self.out_h = 12
self.out_w = 12
class TestNearestNeighborInterpCase3(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [1, 1, 128, 64]
self.out_h = 64
self.out_w = 128
class TestNearestNeighborInterpCase4(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [4, 1, 7, 8]
self.out_h = 1
self.out_w = 1
self.out_size = np.array([2, 2]).astype("int32")
class TestNearestNeighborInterpCase5(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [3, 3, 9, 6]
self.out_h = 12
self.out_w = 12
self.out_size = np.array([11, 11]).astype("int32")
class TestNearestNeighborInterpCase6(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [1, 1, 128, 64]
self.out_h = 64
self.out_w = 128
self.out_size = np.array([65, 129]).astype("int32")
class TestNearestNeighborInterpActualShape(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [3, 2, 32, 16]
self.out_h = 64
self.out_w = 32
self.out_size = np.array([66, 40]).astype("int32")
class TestNearestNeighborInterpBigScale(TestInterpolateOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [4, 4, 64, 32]
self.out_h = 100
self.out_w = 50
self.out_size = np.array([101, 51]).astype('int32')
class TestNearestNeighborInterpCase1Uint8(TestInterpolateOpUint8):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [2, 3, 128, 64]
self.out_h = 120
self.out_w = 50
class TestCase2Uint8(TestBilinearInterpOpUint8):
class TestNearestNeighborInterpCase2Uint8(TestInterpolateOpUint8):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [4, 1, 7, 8]
self.out_h = 5
self.out_w = 13
......
......@@ -496,6 +496,16 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_resize_nearest(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
output = layers.resize_nearest(x, out_shape=[12, 12])
self.assertIsNotNone(output)
output = layers.resize_nearest(x, scale=3)
self.assertIsNotNone(output)
print(str(program))
def test_polygon_box_transform(self):
program = Program()
with program_guard(program):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
class TestSimilarityFocusOp(OpTest):
def setUp(self):
self.op_type = "similarity_focus"
batch_size = 2
x_dim, y_dim, z_dim = 3, 2, 2
self.inputs = {
'X': np.array([[[[0.8, 0.1], [0.4, 0.5]], [[0.9, 0.7], [0.9, 0.9]],
[[0.8, 0.9], [0.1, 0.2]]],
[[[0.2, 0.5], [0.3, 0.4]], [[0.9, 0.7], [0.8, 0.4]],
[[0.0, 0.2], [0.4, 0.7]]]]),
}
self.attrs = {
'axis': 1,
'indexes': [0],
}
output = None
for batch in range(batch_size):
res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
for index in self.attrs['indexes']:
channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy(
)
tag1 = [0 for i in range(y_dim)]
tag2 = [0 for i in range(z_dim)]
cnt = 0
for i in range(channel.size):
index = channel.argmax()
idx1 = index // z_dim
idx2 = index % z_dim
if tag1[idx1] + tag2[idx2] == 0:
tag1[idx1] = 1
tag2[idx2] = 1
res[index] = 1
cnt += 1
if cnt == min(y_dim, z_dim):
break
channel[index] = -1
res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0)
res = res.reshape(1, x_dim, y_dim, z_dim)
if output is not None:
output = np.concatenate((output, res), axis=0)
else:
output = res
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
class TestSimilarityFocusOp_axis1(OpTest):
def setUp(self):
self.op_type = "similarity_focus"
batch_size = 3
x_dim, y_dim, z_dim = 4, 5, 6
self.inputs = {
'X': np.random.random(
(batch_size, x_dim, y_dim, z_dim)).astype("float32"),
}
self.attrs = {
'axis': 1,
'indexes': [0, 3],
}
output = None
for batch in range(batch_size):
res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
for index in self.attrs['indexes']:
channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy(
)
tag1 = [0 for i in range(y_dim)]
tag2 = [0 for i in range(z_dim)]
cnt = 0
for i in range(channel.size):
index = channel.argmax()
idx1 = index // z_dim
idx2 = index % z_dim
if tag1[idx1] + tag2[idx2] == 0:
tag1[idx1] = 1
tag2[idx2] = 1
res[index] = 1
cnt += 1
if cnt == min(y_dim, z_dim):
break
channel[index] = -1
res = res.reshape(1, y_dim, z_dim)
res = res.repeat([x_dim], axis=0)
res = res.reshape(1, x_dim, y_dim, z_dim)
if output is not None:
output = np.concatenate((output, res), axis=0)
else:
output = res
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
class TestSimilarityFocusOp_axis2(OpTest):
def setUp(self):
self.op_type = "similarity_focus"
batch_size = 6
x_dim, y_dim, z_dim = 7, 8, 9
self.inputs = {
'X': np.random.random(
(batch_size, x_dim, y_dim, z_dim)).astype("float32"),
}
self.attrs = {
'axis': 2,
'indexes': [0, 3, 5],
}
output = None
for batch in range(batch_size):
res = np.zeros((x_dim, 1, z_dim)).astype("float32").reshape(-1)
for index in self.attrs['indexes']:
channel = self.inputs['X'][batch, :, index, :].reshape(-1).copy(
)
tag1 = [0 for i in range(x_dim)]
tag2 = [0 for i in range(z_dim)]
cnt = 0
for i in range(channel.size):
index = channel.argmax()
idx1 = index // z_dim
idx2 = index % z_dim
if tag1[idx1] + tag2[idx2] == 0:
tag1[idx1] = 1
tag2[idx2] = 1
res[index] = 1
cnt += 1
if cnt == min(x_dim, z_dim):
break
channel[index] = -1
res = res.reshape(x_dim, 1, z_dim)
res = res.repeat([y_dim], axis=1)
res = res.reshape(1, x_dim, y_dim, z_dim)
if output is not None:
output = np.concatenate((output, res), axis=0)
else:
output = res
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
class TestSimilarityFocusOp_axis3(OpTest):
def setUp(self):
self.op_type = "similarity_focus"
batch_size = 64
x_dim, y_dim, z_dim = 48, 48, 13
self.inputs = {
'X': np.random.random(
(batch_size, x_dim, y_dim, z_dim)).astype("float32"),
}
self.attrs = {
'axis': 3,
'indexes': [0, 2, 7, 9],
}
output = None
for batch in range(batch_size):
res = np.zeros((x_dim, y_dim, 1)).astype("float32").reshape(-1)
for index in self.attrs['indexes']:
channel = self.inputs['X'][batch, :, :, index].reshape(-1).copy(
)
tag1 = [0 for i in range(x_dim)]
tag2 = [0 for i in range(y_dim)]
cnt = 0
for i in range(channel.size):
index = channel.argmax()
idx1 = index // y_dim
idx2 = index % y_dim
if tag1[idx1] + tag2[idx2] == 0:
tag1[idx1] = 1
tag2[idx2] = 1
res[index] = 1
cnt += 1
if cnt == min(x_dim, y_dim):
break
channel[index] = -1
res = res.reshape(x_dim, y_dim, 1)
res = res.repeat([z_dim], axis=2)
res = res.reshape(1, x_dim, y_dim, z_dim)
if output is not None:
output = np.concatenate((output, res), axis=0)
else:
output = res
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
class TestLoDTensorArrayConcat(unittest.TestCase):
def setUp(self):
self.op_type = "tensor_array_to_tensor"
self.attrs = {"axis": 0}
self.outputs = ["Out"]
def test_get_set(self):
scope = core.Scope()
program = fluid.Program()
block = program.global_block()
input_arr = block.create_var(
name="tmp_lod_tensor_array",
type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
input_arr.persistable = True
input_arr_var = scope.var('tmp_lod_tensor_array')
input_tensor_array = input_arr_var.get_lod_tensor_array()
self.assertEqual(0, len(input_tensor_array))
cpu = core.CPUPlace()
for i in range(10):
t = core.LoDTensor()
if i == 0:
t.set(numpy.array([[i], [i]], dtype='float32'), cpu)
else:
t.set(numpy.array([[i]], dtype='float32'), cpu)
input_tensor_array.append(t)
self.assertEqual(10, len(input_tensor_array))
random_grad = numpy.random.random_sample([11]).astype(numpy.float32)
y_out = block.create_var(name="Out")
y_out.persistable = True
y_out_index = block.create_var(name="OutIndex")
y_out_index.persistable = True
y_grad_arr = block.create_var(
name='Out@GRAD', dtype='float32', shape=[11])
y_grad_arr.persistable = True
y_grad = scope.var('Out@GRAD')
y_grad_tensor = y_grad.get_tensor()
y_grad_tensor.set(random_grad, cpu)
op = block.append_op(
type=self.op_type,
inputs={"X": input_arr},
outputs={"Out": y_out,
"OutIndex": y_out_index},
attrs=self.attrs)
out_grad = block.create_var(
name="tmp_lod_tensor_array@GRAD",
type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
out_grad.persistable = True
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
set(), [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
for var_name in grad_op_desc.output_arg_names():
block.desc.var(var_name.encode("ascii"))
grad_op_desc.infer_var_type(block.desc)
grad_op_desc.infer_shape(block.desc)
for arg in grad_op_desc.output_arg_names():
grad_var = block.desc.find_var(arg.encode("ascii"))
grad_var.set_dtype(core.VarDesc.VarType.FP32)
fetch_list = []
fetch_list.append(block.var('Out'))
fetch_list.append(block.var('OutIndex'))
exe = fluid.Executor(fluid.CPUPlace())
out = exe.run(program, fetch_list=fetch_list, scope=scope)
#print ("index: ", numpy.array(out[1]))
# test forward
tensor_res = numpy.array(out[0])
tensor_res_out_idx = numpy.array(out[1])
tensor_gt = numpy.array(
[0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float32')
self.assertEqual(len(tensor_res), len(tensor_gt))
self.assertEqual(len(tensor_res_out_idx), 10)
for i in range(len(tensor_res)):
self.assertEqual(tensor_res[i], tensor_gt[i])
for i in range(len(tensor_res_out_idx)):
if i == 0:
self.assertEqual(tensor_res_out_idx[i], 2)
else:
self.assertEqual(tensor_res_out_idx[i], 1)
# test backward
grad_tensor = scope.var('tmp_lod_tensor_array@GRAD')
grad_tensor_array = grad_tensor.get_lod_tensor_array()
self.assertEqual(10, len(grad_tensor_array))
for i in range(len(grad_tensor_array)):
if i == 0:
self.assertEqual(
numpy.array(grad_tensor_array[i])[0],
numpy.array(random_grad[i]))
self.assertEqual(
numpy.array(grad_tensor_array[i])[1],
numpy.array(random_grad[i + 1]))
if i == 1:
self.assertEqual(
numpy.array(grad_tensor_array[i]),
numpy.array(random_grad[i + 1]))
if __name__ == '__main__':
unittest.main()
......@@ -31,18 +31,17 @@ Steps to transpile pserver:
"""
import math
import sys
import numpy as np
import collections
import six
import logging
from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .ps_dispatcher import RoundRobin, PSDispatcher
from .. import core, framework, unique_name
from ..framework import Program, default_main_program, \
default_startup_program, Block, \
Parameter, grad_var_name
from .details import *
from ..distribute_lookup_table import find_distributed_lookup_table
from functools import reduce
LOOKUP_TABLE_TYPE = "lookup_table"
......@@ -292,7 +291,8 @@ class DistributeTranspiler(object):
self.optimize_ops, self.params_grads = self._get_optimize_pass()
ps_dispatcher = self.config.split_method(self.pserver_endpoints)
self.has_distributed_lookup_table = self._has_distributed_lookup_table()
self.table_name = find_distributed_lookup_table(self.origin_program)
self.has_distributed_lookup_table = self.table_name != None
self.param_name_to_grad_name = dict()
self.grad_name_to_param_name = dict()
for param_var, grad_var in self.params_grads:
......@@ -966,28 +966,6 @@ to transpile() call.")
# ====================== private transpiler functions =====================
def _has_distributed_lookup_table(self):
# process lookup_table_op
# 1. check all lookup_table_op is distributed
# 2. check all lookup_table_op share the same table.
distributed_lookup_table_ops = []
# support only one distributed_lookup_table now
self.table_name = None
for op in self.origin_program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE:
if op.attr('is_distributed') is True:
if self.table_name is None:
self.table_name = op.input("W")[0]
if self.table_name != op.input("W")[0]:
raise RuntimeError("all distributed lookup_table_ops"
" should have only one table")
distributed_lookup_table_ops.append(op)
else:
if self.table_name is not None:
assert op.input("W")[0] != self.table_name
return len(distributed_lookup_table_ops) > 0
def _update_dist_lookup_table_vars(self, param_list, grad_list,
params_grads):
# TODO(wuyi): put find a way to put dist lookup table stuff all together.
......@@ -1341,7 +1319,6 @@ to transpile() call.")
"""
create a new block to handle save checkpoint.
"""
import os
pserver_program.global_block().create_var(
name="kLookupTablePath",
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册