提交 38612695 编写于 作者: D dzhwinter

merge develop branch

...@@ -18,7 +18,7 @@ function(copy TARGET) ...@@ -18,7 +18,7 @@ function(copy TARGET)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DSTS DEPS) set(multiValueArgs SRCS DSTS DEPS)
cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE) set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
...@@ -185,7 +185,8 @@ copy(cmake_cache ...@@ -185,7 +185,8 @@ copy(cmake_cache
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR}) DSTS ${FLUID_INSTALL_DIR})
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) # This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# paddle fluid version # paddle fluid version
execute_process( execute_process(
......
...@@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp ...@@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
...@@ -127,6 +128,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None ...@@ -127,6 +128,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None))
paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)) paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
......
...@@ -12,6 +12,5 @@ endif(NOT WIN32) ...@@ -12,6 +12,5 @@ endif(NOT WIN32)
if(WITH_INFERENCE) if(WITH_INFERENCE)
# NOTE: please add subdirectory inference at last. # NOTE: please add subdirectory inference at last.
add_subdirectory(inference) add_subdirectory(inference)
add_subdirectory(train)
endif() endif()
add_subdirectory(train)
...@@ -64,7 +64,8 @@ class OpHandleBase { ...@@ -64,7 +64,8 @@ class OpHandleBase {
virtual bool IsMultiDeviceTransfer() { return false; } virtual bool IsMultiDeviceTransfer() { return false; }
const platform::DeviceContext *DeviceContext(platform::Place place) { const platform::DeviceContext *DeviceContext(platform::Place place) {
return dev_ctxes_[place]; auto it = dev_ctxes_.find(place);
return it != dev_ctxes_.end() ? it->second : nullptr;
} }
void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
......
...@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { ...@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext"; VLOG(5) << "destroy ExecutorPrepareContext";
} }
template <typename RefCntMap>
static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
GarbageCollector<Tensor>* gc,
RefCntMap* ref_cnts) {
std::unordered_set<Tensor*> erase_tensors;
auto handler = [&](const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
auto it = ref_cnts->find(name);
if (it == ref_cnts->end()) continue;
if ((it->second)-- == 1) {
auto* var = scope.FindVar(name);
if (var != nullptr) {
VLOG(10) << "Erase tensor \'" << name << "\'";
if (var->IsType<LoDTensor>()) {
erase_tensors.insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>()) {
erase_tensors.insert(
var->GetMutable<SelectedRows>()->mutable_value());
}
}
}
}
}
};
handler(op->Inputs());
handler(op->Outputs());
if (!erase_tensors.empty()) {
gc->Add(erase_tensors);
}
}
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() { void Executor::Close() {
...@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} }
int64_t max_memory_size = GetEagerDeletionThreshold(); int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector<Tensor>> gc; std::unique_ptr<GarbageCollector<Tensor>> gc;
if (max_memory_size >= 0) { // WhileOp would set keep_kids to false
// WhileGradOp would need the scopes created in WhileOp
// Perhaps, we should not perform eager deletion in WhileOp
// The scopes and variables created by WhileOp would be deleted
// in WhileGradOp.
if (max_memory_size >= 0 && !keep_kids) {
ctx->ResetReferenceCount(); ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
...@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
if (gc != nullptr) { if (gc != nullptr) {
std::vector<std::string> erase_vars; DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
for (auto& input : op->Inputs()) { &(ctx->cur_ref_cnts_));
for (auto& input_name : input.second) {
auto it = ctx->cur_ref_cnts_.find(input_name);
if (it == ctx->cur_ref_cnts_.end()) continue;
if (it->second == 1) { // should delete it
erase_vars.emplace_back(input_name);
ctx->cur_ref_cnts_.erase(input_name);
} else {
--(it->second);
}
}
}
for (auto& output : op->Outputs()) {
for (auto& output_name : output.second) {
auto it = ctx->cur_ref_cnts_.find(output_name);
if (it == ctx->cur_ref_cnts_.end()) continue;
if (it->second == 1) {
erase_vars.emplace_back(output_name);
ctx->cur_ref_cnts_.erase(output_name);
} else {
--(it->second);
}
}
}
if (!erase_vars.empty()) {
std::vector<framework::LoDTensor*> erase_tensors;
for (auto& name : erase_vars) {
auto* var = local_scope->FindVar(name);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
erase_tensors.push_back(tensor);
}
}
if (!erase_tensors.empty()) gc->Add(erase_tensors);
}
} }
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
......
...@@ -32,38 +32,32 @@ template <typename T> ...@@ -32,38 +32,32 @@ template <typename T>
std::unordered_map<std::string, T> GetNonPersistableReferenceCount( std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
const ProgramDesc& prog, size_t block_id) { const ProgramDesc& prog, size_t block_id) {
auto& block = prog.Block(block_id); auto& block = prog.Block(block_id);
std::unordered_set<std::string> ignored_vars;
std::unordered_map<std::string, T> ref_cnts; std::unordered_map<std::string, T> ref_cnts;
for (auto var_desc : block.AllVars()) { auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
auto type = var_desc->Proto()->type().type(); for (auto& name_pair : name_map) {
if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) { for (auto& name : name_pair.second) {
ignored_vars.insert(var_desc->Name()); // ignore persistable vars auto* var_desc = block.FindVar(name);
} if (var_desc == nullptr || var_desc->Persistable()) continue;
} auto type = var_desc->Proto()->type().type();
if (type != proto::VarType::LOD_TENSOR &&
for (auto op_desc : block.AllOps()) { type != proto::VarType::SELECTED_ROWS) {
for (auto& input : op_desc->Inputs()) { continue;
for (auto& input_name : input.second) {
if (!ignored_vars.count(input_name)) {
if (ref_cnts.count(input_name))
++ref_cnts[input_name];
else
ref_cnts[input_name] = 1;
} }
}
}
for (auto& output : op_desc->Outputs()) { auto it = ref_cnts.find(name);
for (auto output_name : output.second) { if (it != ref_cnts.end()) {
if (!ignored_vars.count(output_name)) { ++it->second;
if (ref_cnts.count(output_name)) } else {
++ref_cnts[output_name]; ref_cnts[name] = 1;
else
ref_cnts[output_name] = 1;
} }
} }
} }
};
for (auto op_desc : block.AllOps()) {
update_ref_cnts(op_desc, op_desc->Inputs());
update_ref_cnts(op_desc, op_desc->Outputs());
} }
return ref_cnts; return ref_cnts;
} }
......
...@@ -30,7 +30,6 @@ pass_library(graph_to_program_pass base) ...@@ -30,7 +30,6 @@ pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base) pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass inference) pass_library(fc_fuse_pass inference)
if (WITH_MKLDNN) if (WITH_MKLDNN)
pass_library(conv_bias_mkldnn_fuse_pass inference)
pass_library(conv_relu_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference)
endif () endif ()
pass_library(attention_lstm_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference)
...@@ -53,7 +52,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r ...@@ -53,7 +52,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if(WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_conv_bias_mkldnn_fuse_pass SRCS conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
endif() endif ()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
// Rewrites every matched `conv2d -> elementwise_add` pair in `graph` into a
// single `conv2d` op whose "Bias" input is the "Y" operand of the matched
// elementwise_add.
//
// graph: graph to transform; must not be null.
// Returns the same graph after rewriting. The number of fused pairs is
// reported through AddStatis().
std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());
  FusePassBase::Init("conv_bias_mkldnn_fuse", graph.get());

  GraphPatternDetector gpd;
  auto* conv_input = gpd.mutable_pattern()
                         ->NewNode("conv_bias_mkldnn_fuse/conv_input")
                         ->AsInput()
                         ->assert_is_op_input("conv2d", "Input");
  patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(),
                                       "conv_bias_mkldnn_fuse");
  conv_bias_pattern(conv_input);

  int found_conv_bias_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "handle ConvBias fuse";

    // Validate that the match contains conv_input BEFORE the first lookup;
    // previously this check ran only after subgraph.at(conv_input) had
    // already been used, so it could never report the problem.
    PADDLE_ENFORCE(subgraph.count(conv_input));
    auto* conv_input_node = subgraph.at(conv_input);

    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
                              conv_bias_pattern);                      // Filter
    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern);  // tmp
    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern);  // CONV op
    // bias
    GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern);
    // output
    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern);
    // elementwise_add op
    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern);

    // Describe the fused op: a conv2d with an additional "Bias" input that
    // writes directly to the former elementwise_add output.
    OpDesc desc;
    std::string conv_bias_i_in = conv_input_node->Name();
    std::string conv_bias_w_in = conv_weight->Name();
    std::string conv_bias_b_in = eltwise_bias->Name();
    std::string conv_bias_out = eltwise_out->Name();
    desc.SetInput("Input", std::vector<std::string>({conv_bias_i_in}));
    desc.SetInput("Filter", std::vector<std::string>({conv_bias_w_in}));
    desc.SetInput("Bias", std::vector<std::string>({conv_bias_b_in}));
    desc.SetOutput("Output", std::vector<std::string>({conv_bias_out}));
    desc.SetType("conv2d");
    // Carry over every attribute of the original conv2d op.
    for (auto& attr : conv->Op()->GetAttrMap()) {
      desc.SetAttr(attr.first, attr.second);
    }

    auto conv_bias_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
    // Drop the two original ops and the intermediate conv output variable.
    GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out});

    IR_NODE_LINK_TO(conv_input_node, conv_bias_node);
    IR_NODE_LINK_TO(conv_weight, conv_bias_node);
    IR_NODE_LINK_TO(eltwise_bias, conv_bias_node);
    IR_NODE_LINK_TO(conv_bias_node, eltwise_out);

    found_conv_bias_count++;
  };

  gpd(graph.get(), handler);
  AddStatis(found_conv_bias_count);
  return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
paddle::framework::ir::ConvBiasFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
// Appends an op of the given `type` to block 0 of `prog`.
//
// For "conv2d" the op gets use_mkldnn=true and inputs[0]/inputs[1] become
// "Input"/"Filter"; for "elementwise_add" inputs[0]/inputs[1] become "X"/"Y".
// Any other type gets no inputs. In every case `outputs` is stored under the
// "Out" slot.
void SetOp(ProgramDesc* prog, const std::string& type,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* new_op = prog->MutableBlock(0)->AppendOp();
  new_op->SetType(type);

  const bool is_conv = (type == "conv2d");
  const bool is_eltwise_add = (type == "elementwise_add");
  if (is_conv) {
    new_op->SetAttr("use_mkldnn", true);
    new_op->SetInput("Input", {inputs[0]});
    new_op->SetInput("Filter", {inputs[1]});
  } else if (is_eltwise_add) {
    new_op->SetInput("X", {inputs[0]});
    new_op->SetInput("Y", {inputs[1]});
  }

  new_op->SetOutput("Out", outputs);
}
// Builds the program used by the test:
//   a -> OP0 -> b
//   b -> OP1 -> c
//   (c, weights) -> conv2d -> f
//   (f, bias) -> elementwise_add -> g
// All variables are SELECTED_ROWS; "weights" and "bias" are persistable.
ProgramDesc BuildProgramDesc() {
  ProgramDesc prog;

  const std::vector<std::string> var_names = {"a", "b",    "c", "weights",
                                              "bias", "f", "g"};
  for (auto& var_name : var_names) {
    auto* var = prog.MutableBlock(0)->Var(var_name);
    var->SetType(proto::VarType::SELECTED_ROWS);
    const bool is_parameter = (var_name == "weights" || var_name == "bias");
    if (is_parameter) {
      var->SetPersistable(true);
    }
  }

  using Names = std::vector<std::string>;
  SetOp(&prog, "OP0", Names({"a"}), Names({"b"}));
  SetOp(&prog, "OP1", Names({"b"}), Names({"c"}));
  SetOp(&prog, "conv2d", Names({"c", "weights"}), Names({"f"}));
  SetOp(&prog, "elementwise_add", Names({"f", "bias"}), Names({"g"}));

  return prog;
}
// Applies conv_bias_mkldnn_fuse_pass to the program from BuildProgramDesc()
// and checks both the node-count delta and that exactly one mkldnn conv2d op
// with a "Bias" input exists afterwards.
TEST(ConvBiasFusePass, basic) {
  auto prog = BuildProgramDesc();
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");

  int original_nodes_num = graph->Nodes().size();
  graph = pass->Apply(std::move(graph));
  int current_nodes_num = graph->Nodes().size();

  // Remove 3 Nodes: conv, elementwise_add, conv_out
  // Add 1 Node: ConvBias
  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);

  // Count conv2d ops that have use_mkldnn == true and a "Bias" input slot.
  int conv_bias_count = 0;
  for (auto* node : graph->Nodes()) {
    if (!node->IsOp() || node->Op()->Type() != "conv2d") continue;
    if (!node->Op()->HasAttr("use_mkldnn")) continue;
    if (!boost::get<bool>(node->Op()->GetAttr("use_mkldnn"))) continue;

    auto input_slots = node->Op()->InputNames();
    const bool has_bias = std::find(input_slots.begin(), input_slots.end(),
                                    "Bias") != input_slots.end();
    if (has_bias) {
      conv_bias_count++;
    }
  }
  EXPECT_EQ(conv_bias_count, 1);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(conv_bias_mkldnn_fuse_pass);
...@@ -44,89 +44,6 @@ namespace ir { ...@@ -44,89 +44,6 @@ namespace ir {
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \ GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name) GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
// Returns a new tensor with the dims of `vec` whose float elements are
// f(x) for each float element x of `vec`. The result is allocated with
// platform::CPUPlace().
template <typename UnaryOperation>
LoDTensor tensor_apply(const LoDTensor& vec, UnaryOperation f) {
  LoDTensor result;
  result.Resize(vec.dims());

  const float* src = vec.data<float>();
  float* dst = result.mutable_data<float>(platform::CPUPlace());

  for (int64_t idx = 0; idx < vec.numel(); ++idx) {
    dst[idx] = f(src[idx]);
  }
  return result;
}
// Overwrites every float element x of `*vec` with f(x). The buffer is
// obtained through mutable_data<float>(platform::CPUPlace()).
void tensor_apply_inplace(LoDTensor* vec, float (*f)(float)) {
  float* elems = vec->mutable_data<float>(platform::CPUPlace());
  for (int64_t idx = 0; idx < vec->numel(); ++idx) {
    float& slot = elems[idx];
    slot = f(slot);
  }
}
// Returns a new tensor with the dims of `vec_a` whose i-th float element is
// f(vec_a[i], vec_b[i]). Enforces that both operands have equal dims.
template <typename BinaryOperation>
LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
                               BinaryOperation f) {
  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());

  LoDTensor result;
  result.Resize(vec_a.dims());

  const float* lhs = vec_a.data<float>();
  const float* rhs = vec_b.data<float>();
  float* dst = result.mutable_data<float>(platform::CPUPlace());

  for (int64_t idx = 0; idx < vec_a.numel(); ++idx) {
    dst[idx] = f(lhs[idx], rhs[idx]);
  }
  return result;
}
// Combines a 2-D tensor `vec_a` of dims {H, W} with a 2-D tensor `vec_b` of
// dims {H, 1}: element (h, w) of the result is f(vec_a(h, w), vec_b(h)).
// Enforces the rank and extent requirements above; the result has the dims
// of `vec_a`.
template <typename BinaryOperation>
LoDTensor tensor_apply_eltwise_broadcast(const LoDTensor& vec_a,
                                         const LoDTensor& vec_b,
                                         BinaryOperation f) {
  PADDLE_ENFORCE_EQ(vec_a.dims().size(), 2);
  PADDLE_ENFORCE_EQ(vec_b.dims().size(), 2);
  PADDLE_ENFORCE_EQ(vec_a.dims()[0], vec_b.dims()[0]);
  PADDLE_ENFORCE_EQ(vec_b.dims()[1], 1);

  LoDTensor result;
  result.Resize(vec_a.dims());

  const float* lhs = vec_a.data<float>();
  const float* per_row = vec_b.data<float>();
  float* dst = result.mutable_data<float>(platform::CPUPlace());

  size_t rows = vec_a.dims()[0];
  size_t cols = vec_a.dims()[1];
  // Elements are visited in row-major order, one vec_b value per row.
  for (size_t row = 0; row < rows; row++) {
    const size_t row_base = row * cols;
    for (size_t col = 0; col < cols; ++col) {
      dst[row_base + col] = f(lhs[row_base + col], per_row[row]);
    }
  }
  return result;
}
// Reshapes `*tensor_to_reshape` in place from dims {A, B, C, ...} to the two
// dimensions {A, B * C * ...}. A tensor with a single dimension {A} becomes
// {A, 1}. Enforces that the tensor has at least one dimension.
void make_tensor_2d(LoDTensor* tensor_to_reshape) {
  // Copy the dims so they stay valid while the tensor is resized.
  const auto dims = tensor_to_reshape->dims();
  const auto dims_count = dims.size();
  PADDLE_ENFORCE_GT(dims_count, 0);

  // Accumulate in 64 bits: dimension extents are 64-bit values and their
  // product can exceed the range of a 32-bit int.
  int64_t trailing_size = 1;
  for (int i = 1; i < dims_count; i++) {
    trailing_size *= dims[i];
  }

  tensor_to_reshape->Resize(make_ddim({dims[0], trailing_size}));
}
// Multiplies `*weights` by `*tmp` with tensor_apply_eltwise_broadcast and
// restores the original dims of `*weights` afterwards.
//
// Both tensors are reshaped to 2-D with make_tensor_2d first; `*tmp` is left
// in its 2-D form on return.
void recompute_conv_weights(LoDTensor* weights, LoDTensor* tmp) {
  // Saved so the {A, B, C, ...} layout can be restored at the end.
  auto original_dims = weights->dims();

  make_tensor_2d(weights);  // {A, B * C * ...}
  make_tensor_2d(tmp);      // {A, 1}

  *weights =
      tensor_apply_eltwise_broadcast(*weights, *tmp, std::multiplies<float>());

  weights->Resize(original_dims);
}
void recompute_bias_and_weights(const Scope* scope, void recompute_bias_and_weights(const Scope* scope,
ir::Node* conv_weight, // ir::Node* conv_weight, //
const ir::Node& bn_scale, // const ir::Node& bn_scale, //
...@@ -135,6 +52,13 @@ void recompute_bias_and_weights(const Scope* scope, ...@@ -135,6 +52,13 @@ void recompute_bias_and_weights(const Scope* scope,
const ir::Node& bn_variance, // const ir::Node& bn_variance, //
LoDTensor* eltwise_y_in_tensor, // LoDTensor* eltwise_y_in_tensor, //
float epsilon) { float epsilon) {
using EigenVectorArrayMap =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixArrayMap = Eigen::Map<
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Re-compute bias of conv2d from BN // Re-compute bias of conv2d from BN
PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims()); PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims());
...@@ -143,31 +67,38 @@ void recompute_bias_and_weights(const Scope* scope, ...@@ -143,31 +67,38 @@ void recompute_bias_and_weights(const Scope* scope,
scope->FindVar(bn_variance.Name())->GetMutable<LoDTensor>(); scope->FindVar(bn_variance.Name())->GetMutable<LoDTensor>();
auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable<LoDTensor>(); auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable<LoDTensor>();
auto std_tensor = LoDTensor(); ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
std_tensor.Resize(bn_bias_tensor.dims()); scale_tensor->numel(), 1);
std_tensor = EigenVectorArrayMap variance_array(
tensor_apply(*variance_tensor, [&](float x) { return x + epsilon; }); variance_tensor->mutable_data<float>(platform::CPUPlace()),
variance_tensor->numel(), 1);
ConstEigenVectorArrayMap mean_array(mean_tensor->data<float>(),
mean_tensor->numel(), 1);
ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data<float>(),
bn_bias_tensor.numel(), 1);
using EigenVectorArrayMap = // variance will not be used anymore, so make it std_array and then tmp_array
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>; variance_array += epsilon;
variance_array = variance_array.sqrt();
variance_array = scale_array / variance_array;
EigenVectorArrayMap eltwise_y_in_array(
eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 1);
EigenVectorArrayMap std_vec( eltwise_y_in_array =
std_tensor.mutable_data<float>(platform::CPUPlace()), std_tensor.numel(), ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array;
1);
std_vec = std_vec.sqrt();
auto tmp_tensor =
tensor_apply_eltwise(*scale_tensor, std_tensor, std::divides<float>());
auto tensor_minus = tensor_apply_eltwise(*eltwise_y_in_tensor, *mean_tensor,
std::minus<float>());
auto tensor_mul =
tensor_apply_eltwise(tensor_minus, tmp_tensor, std::multiplies<float>());
*eltwise_y_in_tensor =
tensor_apply_eltwise(tensor_mul, bn_bias_tensor, std::plus<float>());
// Re-compute weight of conv2d from BN // Re-compute weight of conv2d from BN
auto* current_param = auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>(); auto weights_shape = weights->dims();
recompute_conv_weights(current_param, &tmp_tensor); auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
EigenMatrixArrayMap weights_array_2d(
weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= variance_array;
} }
std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl( std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
......
...@@ -964,38 +964,6 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( ...@@ -964,38 +964,6 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
return ele_add_grad; return ele_add_grad;
} }
// Adds the `conv2d -> elementwise_add` pattern to `pattern`.
//
// conv_input: pattern node for the conv2d "Input" variable.
// Returns the pattern node for the elementwise_add output variable.
PDNode *patterns::ConvBias::operator()(
    paddle::framework::ir::PDNode *conv_input) {
  // Operator nodes.
  conv_input->assert_is_op_input("conv2d", "Input");
  auto *conv_node = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
  auto *eltwise_node =
      pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");

  // Variable nodes.
  // conv2d "Filter": a persistable input.
  auto *filter_var = pattern->NewNode(conv_weight_repr())
                         ->AsInput()
                         ->assert_is_persistable_var()
                         ->assert_is_op_input("conv2d", "Filter");
  // conv2d output consumed by elementwise_add; marked intermediate so it is
  // removed from the IR after the fuse.
  auto *conv_result_var = pattern->NewNode(conv_out_repr())
                              ->AsIntermediate()
                              ->assert_is_only_output_of_op("conv2d")
                              ->assert_is_op_input("elementwise_add");
  // The bias is the "Y" operand of elementwise_add.
  auto *bias_var = pattern->NewNode(eltwise_bias_repr())
                       ->AsInput()
                       ->assert_is_op_input("elementwise_add", "Y");
  // Output of elementwise_add.
  auto *result_var = pattern->NewNode(eltwise_out_repr())
                         ->AsOutput()
                         ->assert_is_op_output("elementwise_add");

  // Edges: (input, filter) -> conv2d -> conv_out,
  //        (conv_out, bias) -> elementwise_add -> out.
  conv_node->LinksFrom({conv_input, filter_var}).LinksTo({conv_result_var});
  eltwise_node->LinksFrom({conv_result_var, bias_var}).LinksTo({result_var});
  return result_var;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -578,27 +578,6 @@ struct ElewiseAddActInplaceGrad : public PatternBase { ...@@ -578,27 +578,6 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
PATTERN_DECL_NODE(d_ele_y); PATTERN_DECL_NODE(d_ele_y);
PATTERN_DECL_NODE(ele_y); PATTERN_DECL_NODE(ele_y);
}; };
// Pattern for a conv2d whose output feeds an elementwise_add that acts as
// the bias.
// op: conv2d + elementwise_add
// named nodes declared here:
//   operators: conv, eltwise
//   variables: conv_weight, conv_out, eltwise_bias, eltwise_out
// The conv input node is not declared here; it is supplied by the caller
// through operator().
struct ConvBias : public PatternBase {
// Registers the pattern under `name_scope` with the pattern name "conv_bias".
ConvBias(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_bias") {}
// Builds the pattern starting from `conv_input` and returns a pattern node.
PDNode* operator()(PDNode* conv_input);
// declare the operator nodes' names
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(eltwise);
// declare the variable nodes' names
PATTERN_DECL_NODE(conv_weight);
PATTERN_DECL_NODE(conv_out);
PATTERN_DECL_NODE(eltwise_bias);
PATTERN_DECL_NODE(eltwise_out);
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other. // Link two ir::Nodes from each other.
......
...@@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() { ...@@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() {
} }
} }
} }
// member_ must be destructed before gcs_ since the destructor of
// ReferenceCountOpHandle use raw pointers of gcs_ inside.
member_.reset();
} }
} // namespace framework } // namespace framework
......
...@@ -75,7 +75,7 @@ class ParallelExecutor { ...@@ -75,7 +75,7 @@ class ParallelExecutor {
private: private:
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const; void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
ParallelExecutorPrivate *member_; std::unique_ptr<ParallelExecutorPrivate> member_;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// ref_cnts_ is only initialized when ParallelExecutor constructs, and then // ref_cnts_ is only initialized when ParallelExecutor constructs, and then
......
...@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() { ...@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
Scope::~Scope() { DropKids(); } Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
kids_.push_back(new Scope(this)); kids_.push_back(new Scope(this));
return *kids_.back(); return *kids_.back();
} }
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
return VarInternal(name); return VarInternal(name);
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
...@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) { ...@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
} }
Variable* Scope::FindVar(const std::string& name) const { Variable* Scope::FindVar(const std::string& name) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
return FindVarInternal(name); return FindVarInternal(name);
} }
Variable* Scope::FindLocalVar(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_);
return FindVarLocally(name);
}
const Scope* Scope::FindScope(const Variable* var) const { const Scope* Scope::FindScope(const Variable* var) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
return FindScopeInternal(var); return FindScopeInternal(var);
} }
void Scope::DropKids() { void Scope::DropKids() {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
for (Scope* s : kids_) delete s; for (Scope* s : kids_) delete s;
kids_.clear(); kids_.clear();
} }
bool Scope::HasKid(const Scope* scope) const { bool Scope::HasKid(const Scope* scope) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end(); return it != this->kids_.end();
} }
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
std::vector<std::string> known_vars; std::vector<std::string> known_vars;
known_vars.reserve(this->vars_.size()); known_vars.reserve(this->vars_.size());
for (auto& p : vars_) { for (auto& p : vars_) {
...@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const { ...@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
} }
void Scope::DeleteScope(Scope* scope) const { void Scope::DeleteScope(Scope* scope) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
...@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const { ...@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
} }
void Scope::EraseVars(const std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
...@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) { ...@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name, void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const { const std::string& new_name) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
} }
std::string Scope::Rename(const std::string& origin_name) const { std::string Scope::Rename(const std::string& origin_name) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
return new_name; return new_name;
......
...@@ -63,6 +63,11 @@ class Scope { ...@@ -63,6 +63,11 @@ class Scope {
/// Caller doesn't own the returned Variable. /// Caller doesn't own the returned Variable.
Variable* FindVar(const std::string& name) const; Variable* FindVar(const std::string& name) const;
/// Find a variable in the current scope.
/// Return nullptr if cannot find.
/// Caller doesn't own the returned Variable.
Variable* FindLocalVar(const std::string& name) const;
const Scope* parent() const { return parent_; } const Scope* parent() const { return parent_; }
/// Find the scope or an ancestor scope that contains the given variable. /// Find the scope or an ancestor scope that contains the given variable.
......
...@@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto size = src.numel() * SizeOfType(src.type()); auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size); boost::get<platform::CPUPlace>(src_place), src_ptr, size);
} }
...@@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream(); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) { if (platform::is_same_place(src_place, dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream); stream);
} else { } else {
...@@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type()); auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size); boost::get<platform::CPUPlace>(src_place), src_ptr, size);
} }
...@@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) && } else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place); auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
......
...@@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) { ...@@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
} }
TensorCopy(dst_tensor, *cpu_place, &dst_tensor);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
Tensor slice_tensor = src_tensor.Slice(1, 2); Tensor slice_tensor = src_tensor.Slice(1, 2);
...@@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) { ...@@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
} }
// Copy the same tensor
TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
gpu_ctx.Wait();
const int* dst_ptr_tmp = dst_tensor.data<int>();
EXPECT_NE(src_ptr, dst_ptr_tmp);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
}
Tensor slice_tensor = src_tensor.Slice(1, 2); Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor // CPU Slice Tensor to GPU Tensor
......
...@@ -70,7 +70,7 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -70,7 +70,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
auto trt_teller = [&](const Node* node) { auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set( std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "dropout"}); "elementwise_add", "dropout"});
if (!node->IsFunction()) return false; if (!node->IsFunction()) return false;
......
...@@ -76,7 +76,6 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -76,7 +76,6 @@ class Analyzer : public OrderedRegistry<PassManager> {
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
"conv_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", //
#endif #endif
}}; }};
......
...@@ -25,9 +25,11 @@ ...@@ -25,9 +25,11 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile); DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);
namespace paddle { namespace paddle {
...@@ -47,6 +49,9 @@ bool AnalysisPredictor::Init( ...@@ -47,6 +49,9 @@ bool AnalysisPredictor::Init(
} }
#endif #endif
// no matter with or without MKLDNN
paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
......
...@@ -23,9 +23,11 @@ limitations under the License. */ ...@@ -23,9 +23,11 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid"); DEFINE_bool(profile, false, "Turn on profiler for fluid");
DECLARE_int32(paddle_num_threads);
namespace paddle { namespace paddle {
namespace { namespace {
...@@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init( ...@@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init(
} }
#endif #endif
// no matter with or without MKLDNN
paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
} else { } else {
......
...@@ -185,3 +185,4 @@ USE_TRT_CONVERTER(softmax); ...@@ -185,3 +185,4 @@ USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER(batch_norm); USE_TRT_CONVERTER(batch_norm);
USE_TRT_CONVERTER(concat); USE_TRT_CONVERTER(concat);
USE_TRT_CONVERTER(dropout); USE_TRT_CONVERTER(dropout);
USE_TRT_CONVERTER(pad);
...@@ -100,19 +100,17 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -100,19 +100,17 @@ for WITH_STATIC_LIB in ON OFF; do
rm -rf * rm -rf *
cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
-DWITH_MKL=$TURN_ON_MKL \ -DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=vis_demo \ -DDEMO_NAME=trt_mobilenet_demo \
-DWITH_GPU=$TEST_GPU_CPU \ -DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DUSE_TENSORRT=$USE_TENSORRT \ -DUSE_TENSORRT=$USE_TENSORRT \
-DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \
-DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR
make -j make -j
./vis_demo \ ./trt_mobilenet_demo \
--modeldir=$DATA_DIR/mobilenet/model \ --modeldir=$DATA_DIR/mobilenet/model \
--data=$DATA_DIR/mobilenet/data.txt \ --data=$DATA_DIR/mobilenet/data.txt \
--refer=$DATA_DIR/mobilenet/result.txt \ --refer=$DATA_DIR/mobilenet/result.txt
--use_gpu=true \
--use_trt=true
fi fi
done done
set +x set +x
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains demo of mobilenet for tensorrt.
*/
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include "paddle/fluid/inference/demo_ci/utils.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DEFINE_string(modeldir, "", "Directory of the inference model.");
DEFINE_string(refer, "", "path to reference result for comparison.");
DEFINE_string(
data, "",
"path of data; each line is a record, format is "
"'<space splitted floats as data>\t<space splitted ints as shape'");
namespace paddle {
namespace demo {
/*
* Use the tensorrt fluid engine to inference the demo.
*/
void Main() {
std::unique_ptr<PaddlePredictor> predictor;
paddle::contrib::MixedRTConfig config;
config.param_file = FLAGS_modeldir + "/__params__";
config.prog_file = FLAGS_modeldir + "/__model__";
config.use_gpu = true;
config.device = 0;
config.max_batch_size = 1;
config.fraction_of_gpu_memory = 0.1; // set by yourself
predictor = CreatePaddlePredictor<paddle::contrib::MixedRTConfig>(config);
VLOG(3) << "begin to process data";
// Just a single batch of data.
std::string line;
std::ifstream file(FLAGS_data);
std::getline(file, line);
auto record = ProcessALine(line);
file.close();
// Inference.
PaddleTensor input;
input.shape = record.shape;
input.data =
PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
input.dtype = PaddleDType::FLOAT32;
VLOG(3) << "run executor";
std::vector<PaddleTensor> output;
predictor->Run({input}, &output, 1);
VLOG(3) << "output.size " << output.size();
auto& tensor = output.front();
VLOG(3) << "output: " << SummaryTensor(tensor);
// compare with reference result
CheckOutput(FLAGS_refer, tensor);
}
} // namespace demo
} // namespace paddle
// Program entry: consume the gflags command line arguments, then run the
// TensorRT mobilenet demo once.
int main(int argc, char** argv) {
  // `true` asks gflags to strip the flags it recognised from argv.
  const bool remove_flags = true;
  google::ParseCommandLineFlags(&argc, &argv, remove_flags);

  paddle::demo::Main();
  return 0;
}
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <fstream>
#include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/inference/paddle_inference_api.h" #include "paddle/fluid/inference/paddle_inference_api.h"
...@@ -21,6 +23,11 @@ ...@@ -21,6 +23,11 @@
namespace paddle { namespace paddle {
namespace demo { namespace demo {
// One parsed line of a demo data/reference file, as filled in by
// ProcessALine(): the line holds two tab-separated columns.
struct Record {
// Floats parsed from the first (space-separated) column.
std::vector<float> data;
// Integers parsed from the second (space-separated) column.
std::vector<int32_t> shape;
};
static void split(const std::string& str, char sep, static void split(const std::string& str, char sep,
std::vector<std::string>* pieces) { std::vector<std::string>* pieces) {
pieces->clear(); pieces->clear();
...@@ -39,6 +46,58 @@ static void split(const std::string& str, char sep, ...@@ -39,6 +46,58 @@ static void split(const std::string& str, char sep,
} }
} }
/*
 * Parse one text line of the form "<data>\t<shape>" into a Record.
 *
 * Both columns are split on single spaces; the first is converted with
 * std::stof into Record::data, the second with std::stoi into Record::shape.
 * Aborts through CHECK_EQ when the line does not have exactly two
 * tab-separated columns.
 *
 * Declared `static`, like split() above, because this function is defined in
 * a header: without internal linkage every translation unit that includes
 * the header would emit its own external definition and the link would fail.
 */
static Record ProcessALine(const std::string& line) {
  VLOG(3) << "process a line";
  std::vector<std::string> columns;
  split(line, '\t', &columns);
  CHECK_EQ(columns.size(), 2UL)
      << "data format error, should be <data>\t<shape>";

  Record record;

  std::vector<std::string> data_strs;
  split(columns[0], ' ', &data_strs);
  record.data.reserve(data_strs.size());
  for (const auto& d : data_strs) {
    record.data.push_back(std::stof(d));
  }

  std::vector<std::string> shape_strs;
  split(columns[1], ' ', &shape_strs);
  record.shape.reserve(shape_strs.size());
  for (const auto& s : shape_strs) {
    record.shape.push_back(std::stoi(s));
  }

  VLOG(3) << "data size " << record.data.size();
  VLOG(3) << "data shape size " << record.shape.size();
  return record;
}
void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
std::string line;
std::ifstream file(referfile);
std::getline(file, line);
auto refer = ProcessALine(line);
file.close();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
VLOG(3) << "predictor output numel " << numel;
VLOG(3) << "reference output numel " << refer.data.size();
CHECK_EQ(numel, refer.data.size());
switch (output.dtype) {
case PaddleDType::INT64: {
for (size_t i = 0; i < numel; ++i) {
CHECK_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
}
break;
}
case PaddleDType::FLOAT32:
for (size_t i = 0; i < numel; ++i) {
CHECK_LT(
fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
1e-5);
}
break;
}
}
/* /*
* Get a summary of a PaddleTensor content. * Get a summary of a PaddleTensor content.
*/ */
......
...@@ -18,10 +18,6 @@ limitations under the License. */ ...@@ -18,10 +18,6 @@ limitations under the License. */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files. #include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include <fstream>
#include <iostream>
// #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/inference/demo_ci/utils.h" #include "paddle/fluid/inference/demo_ci/utils.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -34,99 +30,28 @@ DEFINE_string( ...@@ -34,99 +30,28 @@ DEFINE_string(
"path of data; each line is a record, format is " "path of data; each line is a record, format is "
"'<space splitted floats as data>\t<space splitted ints as shape'"); "'<space splitted floats as data>\t<space splitted ints as shape'");
DEFINE_bool(use_gpu, false, "Whether use gpu."); DEFINE_bool(use_gpu, false, "Whether use gpu.");
DEFINE_bool(use_trt, false, "Whether use trt.");
namespace paddle { namespace paddle {
namespace demo { namespace demo {
struct Record {
std::vector<float> data;
std::vector<int32_t> shape;
};
void split(const std::string& str, char sep, std::vector<std::string>* pieces);
Record ProcessALine(const std::string& line) {
VLOG(3) << "process a line";
std::vector<std::string> columns;
split(line, '\t', &columns);
CHECK_EQ(columns.size(), 2UL)
<< "data format error, should be <data>\t<shape>";
Record record;
std::vector<std::string> data_strs;
split(columns[0], ' ', &data_strs);
for (auto& d : data_strs) {
record.data.push_back(std::stof(d));
}
std::vector<std::string> shape_strs;
split(columns[1], ' ', &shape_strs);
for (auto& s : shape_strs) {
record.shape.push_back(std::stoi(s));
}
VLOG(3) << "data size " << record.data.size();
VLOG(3) << "data shape size " << record.shape.size();
return record;
}
void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
std::string line;
std::ifstream file(referfile);
std::getline(file, line);
auto refer = ProcessALine(line);
file.close();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
VLOG(3) << "predictor output numel " << numel;
VLOG(3) << "reference output numel " << refer.data.size();
CHECK_EQ(numel, refer.data.size());
switch (output.dtype) {
case PaddleDType::INT64: {
for (size_t i = 0; i < numel; ++i) {
CHECK_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
}
break;
}
case PaddleDType::FLOAT32:
for (size_t i = 0; i < numel; ++i) {
CHECK_LT(
fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
1e-5);
}
break;
}
}
/* /*
* Use the native fluid engine to inference the demo. * Use the native fluid engine to inference the demo.
*/ */
void Main(bool use_gpu, bool use_trt) { void Main(bool use_gpu) {
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
if (!use_trt) { NativeConfig config;
NativeConfig config; config.param_file = FLAGS_modeldir + "/__params__";
config.param_file = FLAGS_modeldir + "/__params__"; config.prog_file = FLAGS_modeldir + "/__model__";
config.prog_file = FLAGS_modeldir + "/__model__"; config.use_gpu = use_gpu;
config.use_gpu = use_gpu; config.device = 0;
config.device = 0; if (FLAGS_use_gpu) {
if (FLAGS_use_gpu) {
config.fraction_of_gpu_memory = 0.1; // set by yourself
}
VLOG(3) << "init predictor";
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
} else {
paddle::contrib::MixedRTConfig config;
config.param_file = FLAGS_modeldir + "/__params__";
config.prog_file = FLAGS_modeldir + "/__model__";
config.use_gpu = true;
config.device = 0;
config.max_batch_size = 1;
config.fraction_of_gpu_memory = 0.1; // set by yourself config.fraction_of_gpu_memory = 0.1; // set by yourself
predictor = CreatePaddlePredictor<paddle::contrib::MixedRTConfig>(config);
} }
VLOG(3) << "init predictor";
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
VLOG(3) << "begin to process data"; VLOG(3) << "begin to process data";
// Just a single batch of data. // Just a single batch of data.
std::string line; std::string line;
...@@ -159,12 +84,10 @@ void Main(bool use_gpu, bool use_trt) { ...@@ -159,12 +84,10 @@ void Main(bool use_gpu, bool use_trt) {
int main(int argc, char** argv) { int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true); google::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_use_gpu && FLAGS_use_trt) { if (FLAGS_use_gpu) {
paddle::demo::Main(true /*use_gpu*/, true); paddle::demo::Main(true /*use_gpu*/);
} else if (FLAGS_use_gpu) {
paddle::demo::Main(true /*use_gpu*/, false);
} else { } else {
paddle::demo::Main(false /*use_gpu*/, false /*use_tensorrt*/); paddle::demo::Main(false /*use_gpu*/);
} }
return 0; return 0;
} }
# Add TRT tests # Add TRT tests
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry) DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
...@@ -26,6 +26,8 @@ nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc ...@@ -26,6 +26,8 @@ nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
 * PadOp.
 *
 * Converts a fluid `pad` op into a TensorRT padding layer. The TensorRT
 * padding layer only zero-pads the two innermost dimensions, so the
 * converter rejects a non-zero `pad_value` and non-zero paddings on any
 * other dimension instead of silently dropping them.
 */
class PadOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4) << "convert a fluid pad op to tensorrt padding layer";

    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);

    const std::vector<int> paddings =
        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
    const float pad_value = boost::get<float>(op_desc.GetAttr("pad_value"));

    nvinfer1::Dims input_shape = input->getDimensions();
    int nbDims = input_shape.nbDims;
    int pad_size = static_cast<int>(paddings.size());
    PADDLE_ENFORCE_GE(nbDims, 2);
    // `paddings` carries one (before, after) pair per dimension and one more
    // pair than the TensorRT tensor has dimensions -- presumably the batch
    // dimension, which the engine tensor does not include; confirm.
    PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size);
    PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only support zero.");

    // Only the last two pairs are forwarded below, so every earlier entry
    // must be zero or the result would differ from the fluid op.
    for (int i = 0; i < pad_size - 4; ++i) {
      PADDLE_ENFORCE(paddings[i] == 0,
                     "The pad layer of TRT only support padding the last two "
                     "dimensions.");
    }

    // Last four entries: (before, after) of the second-to-last dimension
    // followed by (before, after) of the last dimension.
    nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]);
    nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]);

    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Padding,
                                       *const_cast<nvinfer1::ITensor*>(input),
                                       pre_pad, post_pad);

    PADDLE_ENFORCE(layer != nullptr);
    auto output_name = op_desc.Output("Out")[0];
    engine_->SetITensor(output_name, layer->getOutput(0));
    layer->setName(("pad (Output: " + output_name + ")").c_str());
    layer->getOutput(0)->setName(output_name.c_str());
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.
      engine_->DeclareOutput(output_name);
    }
  }
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(pad, PadOpConverter);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Runs the `pad` converter through TRTConvertValidation: a 3x2x2 input with
// paddings (0, 1) on the second-to-last and (1, 2) on the last dimension is
// expected to produce a 3x3x5 output.
TEST(PadConverter, main) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1000);

  // Input and output variables of the op under test.
  validator.DeclInputVar("pad-X", nvinfer1::Dims3(3, 2, 2));
  validator.DeclOutputVar("pad-Out", nvinfer1::Dims3(3, 3, 5));

  // Attributes: one (before, after) pair per dimension, zero fill value.
  std::vector<int> paddings{0, 0, 0, 0, 0, 1, 1, 2};
  float pad_value = 0.0f;

  // Prepare Op description
  framework::OpDesc desc;
  desc.SetType("pad");
  desc.SetInput("X", {"pad-X"});
  desc.SetOutput("Out", {"pad-Out"});
  desc.SetAttr("paddings", paddings);
  desc.SetAttr("pad_value", pad_value);

  LOG(INFO) << "set OP";
  validator.SetOp(*desc.Proto());

  LOG(INFO) << "execute";
  validator.Execute(2);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(pad);
...@@ -230,7 +230,7 @@ if(WITH_DISTRIBUTE) ...@@ -230,7 +230,7 @@ if(WITH_DISTRIBUTE)
op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endforeach() endforeach()
#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
# listen_and_serv_op sum_op executor SERIAL) # listen_and_serv_op sum_op executor SERIAL)
...@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND) ...@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
else() else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
endif() endif()
op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows)
op_library(sum_op DEPS selected_rows_functor) op_library(sum_op DEPS selected_rows_functor)
op_library(sgd_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor)
op_library(print_op DEPS lod_tensor) op_library(print_op DEPS lod_tensor)
......
...@@ -18,6 +18,7 @@ namespace paddle { ...@@ -18,6 +18,7 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
class AdadeltaOp : public framework::OperatorWithKernel { class AdadeltaOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel { ...@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel {
"Input(AvgSquaredGrad) of AdadeltaOp should not be null."); "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"), PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
"Input(AvgSquaredUpdate) of AdadeltaOp should not be null."); "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(
ctx->GetInputsVarType("Grad").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of AdadeltaOp should not be null."); "Output(ParamOut) of AdadeltaOp should not be null.");
...@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel { ...@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("AvgSquaredGradOut", param_dim); ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
auto input_data_type = auto input_data_type =
......
...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T> ...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class AdadeltaOpKernel : public framework::OpKernel<T> { class AdadeltaOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto avg_squared_grad_out_tensor = auto avg_squared_grad_out_tensor =
ctx.Output<framework::Tensor>("AvgSquaredGradOut"); ctx.Output<framework::Tensor>("AvgSquaredGradOut");
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -21,25 +22,31 @@ namespace operators { ...@@ -21,25 +22,31 @@ namespace operators {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct SparseAdagradFunctor { struct SparseAdagradFunctor {
void operator()(const DeviceContext& context, void operator()(const DeviceContext &context,
const framework::SelectedRows& grad, const framework::SelectedRows &grad,
const framework::Tensor& learning_rate, T epsilon, const framework::Tensor &learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param); framework::Tensor *moment, framework::Tensor *param);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class AdagradOpKernel : public framework::OpKernel<T> { class AdagradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto* param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); const auto *param_var = ctx.InputVar("Param");
auto* moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut"); PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
auto *param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto *moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
param_out_tensor->mutable_data<T>(ctx.GetPlace()); param_out_tensor->mutable_data<T>(ctx.GetPlace());
moment_out_tensor->mutable_data<T>(ctx.GetPlace()); moment_out_tensor->mutable_data<T>(ctx.GetPlace());
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon")); T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto* grad_var = ctx.InputVar("Grad"); auto *grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<framework::LoDTensor>()) { if (grad_var->IsType<framework::LoDTensor>()) {
auto param = framework::EigenVector<T>::Flatten( auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param")); *ctx.Input<framework::Tensor>("Param"));
...@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> { ...@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> {
*ctx.Input<framework::Tensor>("Grad")); *ctx.Input<framework::Tensor>("Grad"));
auto moment = framework::EigenVector<T>::Flatten( auto moment = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Moment")); *ctx.Input<framework::Tensor>("Moment"));
auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate"); auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor); auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor); auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
auto* place = ctx.template device_context<DeviceContext>().eigen_device(); auto *place = ctx.template device_context<DeviceContext>().eigen_device();
moment_out.device(*place) = moment + grad * grad; moment_out.device(*place) = moment + grad * grad;
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel()); Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
if (platform::is_cpu_place(ctx.GetPlace())) { if (platform::is_cpu_place(ctx.GetPlace())) {
auto* lr = learning_rate->data<T>(); auto *lr = learning_rate->data<T>();
param_out.device(*place) = param_out.device(*place) =
param - lr[0] * grad / (moment_out.sqrt() + epsilon); param - lr[0] * grad / (moment_out.sqrt() + epsilon);
} else { } else {
...@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> { ...@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> {
lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
} }
} else if (grad_var->IsType<framework::SelectedRows>()) { } else if (grad_var->IsType<framework::SelectedRows>()) {
auto* param_tensor = ctx.Input<framework::Tensor>("Param"); auto *param_tensor = ctx.Input<framework::Tensor>("Param");
PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
auto* moment_tensor = ctx.Input<framework::Tensor>("Moment"); auto *moment_tensor = ctx.Input<framework::Tensor>("Moment");
PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
SparseAdagradFunctor<DeviceContext, T> functor; SparseAdagradFunctor<DeviceContext, T> functor;
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
...@@ -199,23 +200,9 @@ struct SparseAdamFunctor { ...@@ -199,23 +200,9 @@ struct SparseAdamFunctor {
row_numel_(row_numel), row_numel_(row_numel),
row_count_(row_count) {} row_count_(row_count) {}
inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
int64_t beg = 0, end = row_count_ - 1;
while (beg <= end) {
auto mid = ((beg + end) >> 1);
if (rows_[mid] == row)
return mid;
else if (rows_[mid] < row)
beg = mid + 1;
else
end = mid - 1;
}
return -1;
}
inline HOSTDEVICE void operator()(size_t i) const { inline HOSTDEVICE void operator()(size_t i) const {
int64_t row = i / row_numel_; auto row_idx =
auto row_idx = BinarySearchInRows(row); math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
// The following code is the same as dense // The following code is the same as dense
...@@ -244,6 +231,12 @@ template <typename DeviceContext, typename T> ...@@ -244,6 +231,12 @@ template <typename DeviceContext, typename T>
class AdamOpKernel : public framework::OpKernel<T> { class AdamOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
using paddle::framework::LoDTensor; using paddle::framework::LoDTensor;
using paddle::operators::detail::Ref; using paddle::operators::detail::Ref;
......
...@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel { ...@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel {
"Input(LearningRate) of AdamaxOp should not be null."); "Input(LearningRate) of AdamaxOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
"Input(Beta1Pow) of AdamaxOp should not be null."); "Input(Beta1Pow) of AdamaxOp should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(
ctx->GetInputsVarType("Grad").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of AdamaxOp should not be null."); "Output(ParamOut) of AdamaxOp should not be null.");
......
...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T> ...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class AdamaxOpKernel : public framework::OpKernel<T> { class AdamaxOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut"); auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut"); auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
......
...@@ -16,12 +16,15 @@ limitations under the License. */ ...@@ -16,12 +16,15 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using SelectedRows = framework::SelectedRows;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>; using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
...@@ -31,9 +34,40 @@ class ClipByNormKernel : public framework::OpKernel<T> { ...@@ -31,9 +34,40 @@ class ClipByNormKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto max_norm = context.Attr<T>("max_norm"); auto max_norm = context.Attr<T>("max_norm");
auto* input = context.Input<Tensor>("X"); auto in_var = context.InputVar("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace()); Tensor* output = nullptr;
const Tensor* input = nullptr;
if (in_var->IsType<framework::LoDTensor>()) {
input = context.Input<Tensor>("X");
output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
} else if (in_var->IsType<SelectedRows>()) {
auto* x = context.Input<SelectedRows>("X");
// merge ids in selected rows first
math::scatter::MergeAdd<DeviceContext, T> merge_func;
SelectedRows* merged_input =
const_cast<framework::Scope&>(context.scope())
.Var()
->GetMutable<SelectedRows>();
merge_func(context.template device_context<DeviceContext>(), *x,
merged_input);
input = &(merged_input->value());
SelectedRows* output_selected_rows = context.Output<SelectedRows>("Out");
output_selected_rows->set_rows(merged_input->rows());
output_selected_rows->set_height(merged_input->height());
output = output_selected_rows->mutable_value();
output->Resize(merged_input->value().dims());
output->mutable_data<T>(context.GetPlace());
} else {
PADDLE_THROW("Unexpected branch, input variable type is %s",
in_var->Type().name());
}
PADDLE_ENFORCE_NOT_NULL(input);
auto x = EigenVector<T>::Flatten(*input); auto x = EigenVector<T>::Flatten(*input);
auto out = EigenVector<T>::Flatten(*output); auto out = EigenVector<T>::Flatten(*output);
......
...@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { ...@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("LearningRate"), ctx->HasInput("LearningRate"),
"Input(LearningRate) of DecayedAdagradOp should not be null."); "Input(LearningRate) of DecayedAdagradOp should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(
ctx->GetInputsVarType("Grad").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of DecayedAdagradOp should not be null."); "Output(ParamOut) of DecayedAdagradOp should not be null.");
......
...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T> ...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class DecayedAdagradOpKernel : public framework::OpKernel<T> { class DecayedAdagradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut"); auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
......
...@@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase { ...@@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase {
} }
}; };
// Variable-type inference functor for the fill_constant operator.
// The call operator has an empty body, so invoking it leaves both the op
// description and the block unmodified.
// NOTE(review): presumably this keeps the output variable's declared type
// from being changed by a default inference step - confirm against
// framework::VarTypeInference and the operator registry.
class FillConstantOpVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc & /*op_desc*/,
                  framework::BlockDesc * /*block*/) const override {
    // No-op.
  }
};
class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -102,4 +108,5 @@ Fill up a variable with specified constant value. ...@@ -102,4 +108,5 @@ Fill up a variable with specified constant value.
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker, ops::FillConstantInferShape, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker,
ops::FillConstantOpVarTypeInference);
...@@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel { ...@@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel {
"Input(Grad) of FTRL should not be null."); "Input(Grad) of FTRL should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"), PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of FTRL should not be null."); "Input(LearningRate) of FTRL should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(
ctx->GetInputsVarType("Grad").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of FTRL should not be null."); "Output(ParamOut) of FTRL should not be null.");
......
...@@ -28,6 +28,17 @@ template <typename DeviceContext, typename T> ...@@ -28,6 +28,17 @@ template <typename DeviceContext, typename T>
class FTRLOpKernel : public framework::OpKernel<T> { class FTRLOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto* param_out = ctx.Output<Tensor>("ParamOut"); auto* param_out = ctx.Output<Tensor>("ParamOut");
auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut"); auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut"); auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut");
......
...@@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor) 1-dim tensor, contains a bool scalar. The output " "(Tensor) 1-dim tensor, contains a bool scalar. The output "
"tensor of overflow operator."); "tensor of overflow operator.");
AddComment(string::Sprintf(R"DOC( AddComment(string::Sprintf(R"DOC(
Overflow operator. Overflow %s operator.
$$Out = any(X)$$ $$Out = any(X)$$
...@@ -69,6 +69,8 @@ Out = Inf if any X contains Inf, ...@@ -69,6 +69,8 @@ Out = Inf if any X contains Inf,
Out = Nan if any X contains Nan, Out = Nan if any X contains Nan,
Out = 0 if no Inf/Nan detected. Out = 0 if no Inf/Nan detected.
If X contains both Inf/Nan, it will return the first indicator it meeted. If X contains both Inf/Nan, it will return the first indicator it meeted.
%s
)DOC", )DOC",
GetName(), GetComments())); GetName(), GetComments()));
} }
......
...@@ -3,8 +3,8 @@ add_subdirectory(detail) ...@@ -3,8 +3,8 @@ add_subdirectory(detail)
endif(NOT WIN32) endif(NOT WIN32)
function(math_library TARGET) function(math_library TARGET)
# math_library is a function to create math library. # math_library is a function to create math library.
# The interface is the same as cc_library. # The interface is the same as cc_library.
# But it handle split GPU/CPU code and link some common library. # But it handle split GPU/CPU code and link some common library.
set(cc_srcs) set(cc_srcs)
set(cu_srcs) set(cu_srcs)
...@@ -53,7 +53,7 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) ...@@ -53,7 +53,7 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
math_library(math_function DEPS blas) math_library(math_function DEPS blas)
math_library(maxouting) math_library(maxouting)
math_library(pooling) math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows math_function) math_library(selected_rows_functor DEPS selected_rows math_function blas)
math_library(sequence2batch) math_library(sequence2batch)
math_library(sequence_padding) math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function) math_library(sequence_pooling DEPS math_function)
......
...@@ -11,24 +11,34 @@ ...@@ -11,24 +11,34 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include <algorithm>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include <cstdint> // for int64_t
#include "paddle/fluid/framework/ir/pass.h" #include <numeric>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle { namespace paddle {
namespace framework { namespace operators {
namespace ir { namespace math {
/*
* Fuse the Conv and Elementwise_add to a ConvBiasOp. template <typename T>
*/ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
class ConvBiasFusePass : public FusePassBase { int64_t beg = 0, end = num - 1;
public: while (beg <= end) {
virtual ~ConvBiasFusePass() {} auto mid = ((beg + end) >> 1);
if (x[mid] == val)
return mid;
else if (x[mid] < val)
beg = mid + 1;
else
end = mid - 1;
}
return -1;
}
protected: } // namespace math
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; } // namespace operators
};
} // namespace ir
} // namespace framework
} // namespace paddle } // namespace paddle
...@@ -46,17 +46,20 @@ __forceinline__ __device__ unsigned warp_id() { ...@@ -46,17 +46,20 @@ __forceinline__ __device__ unsigned warp_id() {
return ret; return ret;
} }
// Parameter list shared by the forward depthwise-convolution kernels that are
// declared with it (KernelDepthwiseConv, KernelDepthwiseConvCFilter and
// KernelDepthwiseConvSp). It expands to a comma-separated list of function
// parameters, so a type named `T` must be in scope at the point of expansion.
// Order: input and filter pointers, batch size, output channels/height/width,
// input channels/height/width, filter multiplier, filter height/width,
// stride height/width, padding height/width, dilation height/width, and
// finally the output pointer.
#define ARG_DEFINE_KernelDepthwiseConv \
const T *const input_data, const T *const filter_data, const int batch_size, \
const int output_channels, const int output_height, \
const int output_width, const int input_channels, \
const int input_height, const int input_width, \
const int filter_multiplier, const int filter_height, \
const int filter_width, const int stride_height, const int stride_width, \
const int padding_height, const int padding_width, \
const int dilate_height, const int dilate_width, T *const output_data
// A Cuda kernel to compute the depthwise convolution forward pass // A Cuda kernel to compute the depthwise convolution forward pass
// in NCHW format. // in NCHW format.
template <typename T> template <typename T>
__device__ __inline__ void KernelDepthwiseConv( __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) {
const T* const input_data, const T* const filter_data, const int batch_size,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const output_data) {
for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
const int batch = blockIdx.y; const int batch = blockIdx.y;
...@@ -97,42 +100,105 @@ __device__ __inline__ void KernelDepthwiseConv( ...@@ -97,42 +100,105 @@ __device__ __inline__ void KernelDepthwiseConv(
} }
} }
template <typename T, int c_filter_multiplier, int c_stride> template <typename T, int c_filter>
__global__ void KernelDepthwiseConvSp( __device__ __inline__ void KernelDepthwiseConvCFilter(
const T* const input_data, const T* const filter_data, const int batch_size, ARG_DEFINE_KernelDepthwiseConv) {
const int output_channels, const int output_height, const int output_width, const int kWeghtSize = c_filter * c_filter;
const int input_channels, const int input_height, const int input_width, T r_weight[kWeghtSize];
const int filter_multiplier, const int filter_height, const int batch = blockIdx.y;
const int filter_width, const int stride_height, const int stride_width, const int c_out = blockIdx.x;
const int padding_height, const int padding_width, const int dilate_height, const T* weight = filter_data + c_out * c_filter * c_filter;
const int dilate_width, T* const output_data) { for (int i = 0; i < c_filter * c_filter; i++) r_weight[i] = weight[i];
if (c_filter_multiplier == 0)
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels,
input_height, input_width, filter_multiplier,
filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width,
dilate_height, dilate_width, output_data);
else for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels, for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
output_height, output_width, input_channels, const int batch = blockIdx.y;
input_height, input_width, c_filter_multiplier, const int c_out = blockIdx.x;
filter_height, filter_height, c_stride, c_stride,
padding_height, padding_width, dilate_height, const int c_in = c_out / filter_multiplier;
dilate_width, output_data); T value = 0;
const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end = h_in_start + c_filter * dilate_height;
const int w_in_end = w_in_start + c_filter * dilate_width;
const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_in_start, h_f = 0; h_f < c_filter;
h_in += dilate_height, h_f++) {
for (int w_in = w_in_start, w_f = 0; w_f < c_filter;
w_in += dilate_width, w_f++) {
if (h_in >= 0 && h_in < input_height && w_in >= 0 &&
w_in < input_width) {
const int offset = in_offset + h_in * input_width + w_in;
value += r_weight[h_f * c_filter + w_f] * input_data[offset];
}
}
}
int index =
((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
w_out;
output_data[index] = value;
}
}
}
// Forward depthwise-convolution kernel entry point that dispatches on
// compile-time template parameters.
//
// Template parameters:
//   c_filter_multiplier  0 forwards the runtime filter_multiplier and the
//                        runtime strides; any other value is forwarded in
//                        place of filter_multiplier, and c_stride is used for
//                        both stride_height and stride_width.
//   c_stride             stride used in both directions when
//                        c_filter_multiplier != 0.
//   c_filter             -1 selects the generic KernelDepthwiseConv;
//                        any other value selects
//                        KernelDepthwiseConvCFilter<T, c_filter>.
//
// The runtime filter_height and filter_width are always forwarded unchanged.
// Previously the c_filter_multiplier != 0 branches passed filter_height in the
// filter_width position as well, which handed the generic kernel the wrong
// width for non-square filters.
// NOTE(review): for square filters this is identical to the old behaviour;
// confirm that no launcher relied on the width being overridden.
template <typename T, int c_filter_multiplier, int c_stride, int c_filter>
__global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
  if (c_filter_multiplier == 0) {
    // Fully runtime-configured multiplier and strides.
    if (c_filter == -1) {
      KernelDepthwiseConv<T>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          filter_multiplier, filter_height, filter_width, stride_height,
          stride_width, padding_height, padding_width, dilate_height,
          dilate_width, output_data);
    } else {
      KernelDepthwiseConvCFilter<T, c_filter>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          filter_multiplier, filter_height, filter_width, stride_height,
          stride_width, padding_height, padding_width, dilate_height,
          dilate_width, output_data);
    }
  } else {
    // Compile-time multiplier and stride replace their runtime counterparts.
    if (c_filter == -1) {
      KernelDepthwiseConv<T>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
          padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    } else {
      KernelDepthwiseConvCFilter<T, c_filter>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
          padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    }
  }
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input. // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
// Parameter list shared by the input-gradient depthwise-convolution kernels
// that are declared with it (KernelDepthwiseConvInputGrad,
// KernelDepthwiseConvInputGradCFilter and KernelDepthwiseConvInputGradSp).
// It expands to a comma-separated list of function parameters, so a type
// named `T` must be in scope at the point of expansion.
// Order: output-gradient and filter pointers, batch size, output
// channels/height/width, input channels/height/width, filter multiplier,
// filter height/width, stride height/width, padding height/width, dilation
// height/width, and finally the input-gradient pointer that is written to.
#define ARG_DEFINE_KernelDepthwiseConvInputGrad \
const T *const output_grad_data, const T *const filter_data, \
const int batch_size, const int output_channels, \
const int output_height, const int output_width, \
const int input_channels, const int input_height, const int input_width, \
const int filter_multiplier, const int filter_height, \
const int filter_width, const int stride_height, const int stride_width, \
const int padding_height, const int padding_width, \
const int dilate_height, const int dilate_width, \
T *const input_grad_data
template <typename T> template <typename T>
__device__ __inline__ void KernelDepthwiseConvInputGrad( __device__ __inline__ void KernelDepthwiseConvInputGrad(
const T* const output_grad_data, const T* const filter_data, ARG_DEFINE_KernelDepthwiseConvInputGrad) {
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const input_grad_data) {
for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
const int batch = blockIdx.y; const int batch = blockIdx.y;
...@@ -184,15 +250,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( ...@@ -184,15 +250,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad(
} }
} }
template <typename T, int c_filter_multiplier, int c_stride> template <typename T, int c_filter, int c_filter_multiplier>
// Input-gradient depthwise convolution for a square filter whose side length
// is the template constant c_filter.
//
// blockIdx.y selects the batch entry and blockIdx.x the input channel; the
// threads of the block stride over (w_in, h_in) using threadIdx/blockDim.
// The filter taps of every output channel derived from this input channel
// are first copied, in reversed order, into a thread-local array, then each
// input-gradient element is accumulated from the output gradients.
//
// NOTE(review): the local array is sized from c_filter_multiplier but filled
// using the runtime filter_multiplier, so this appears to assume
// filter_multiplier <= c_filter_multiplier - confirm at the call site.
__device__ __inline__ void KernelDepthwiseConvInputGradCFilter(
    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
  const int kFilterArea = c_filter * c_filter;
  // The trailing +1 keeps the array length non-zero when
  // c_filter_multiplier is 0.
  const int kCacheSize = kFilterArea * c_filter_multiplier + 1;
  T flipped_weight[kCacheSize];

  const int batch = blockIdx.y;
  const int c_in = blockIdx.x;

  // Cache the taps reversed, so the accumulation below can index them with
  // the forward (h_f, w_f) position.
  for (int c_i = 0; c_i < filter_multiplier; ++c_i) {
    const int c_out = c_in * filter_multiplier + c_i;
    const T* src = filter_data + c_out * kFilterArea;
    T* dst = flipped_weight + c_i * kFilterArea;
    for (int i = 0; i < kFilterArea; ++i) {
      dst[i] = src[kFilterArea - 1 - i];
    }
  }

  for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
    for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
      const int h_out_start =
          h_in - (c_filter - 1) * dilate_height + padding_height;
      const int w_out_start =
          w_in - (c_filter - 1) * dilate_width + padding_width;

      T value = 0;
      for (int c_i = 0; c_i < filter_multiplier; ++c_i) {
        const int c_out = c_in * filter_multiplier + c_i;
        const T* cached = flipped_weight + c_i * kFilterArea;

        int h_out = h_out_start;
        for (int h_f = 0; h_f < c_filter; ++h_f, h_out += dilate_height) {
          int w_out = w_out_start;
          for (int w_f = 0; w_f < c_filter; ++w_f, w_out += dilate_width) {
            const int s_h_out = h_out / stride_height;
            const int s_w_out = w_out / stride_width;

            // Only positions that land exactly on the stride grid and inside
            // the output extent contribute.
            const bool on_stride =
                (h_out % stride_height == 0) && (w_out % stride_width == 0);
            const bool in_range = s_h_out >= 0 && s_h_out < output_height &&
                                  s_w_out >= 0 && s_w_out < output_width;
            if (!(on_stride && in_range)) continue;

            const int output_grad_offset =
                ((batch * output_channels + c_out) * output_height + s_h_out) *
                    output_width +
                s_w_out;
            value += output_grad_data[output_grad_offset] *
                     cached[h_f * c_filter + w_f];
          }
        }
      }

      // gridDim.x is used as the channel count of the input-gradient tensor.
      const int index =
          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
          w_in;
      input_grad_data[index] = value;
    }
  }
}
template <typename T, int c_filter_multiplier, int c_stride, int c_filter>
__global__ void KernelDepthwiseConvInputGradSp( __global__ void KernelDepthwiseConvInputGradSp(
const T* const output_grad_data, const T* const filter_data, ARG_DEFINE_KernelDepthwiseConvInputGrad) {
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const input_grad_data) {
if (c_filter_multiplier == 0) if (c_filter_multiplier == 0)
KernelDepthwiseConvInputGrad<T>( KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels, output_grad_data, filter_data, batch_size, output_channels,
...@@ -200,13 +318,20 @@ __global__ void KernelDepthwiseConvInputGradSp( ...@@ -200,13 +318,20 @@ __global__ void KernelDepthwiseConvInputGradSp(
filter_multiplier, filter_height, filter_width, stride_height, filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height, stride_width, padding_height, padding_width, dilate_height,
dilate_width, input_grad_data); dilate_width, input_grad_data);
else else if (c_filter == -1)
KernelDepthwiseConvInputGrad<T>( KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels, output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width, output_height, output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
padding_height, padding_width, dilate_height, dilate_width, padding_height, padding_width, dilate_height, dilate_width,
input_grad_data); input_grad_data);
else
KernelDepthwiseConvInputGradCFilter<T, c_filter, c_filter_multiplier>(
output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
padding_height, padding_width, dilate_height, dilate_width,
input_grad_data);
} }
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
...@@ -325,12 +450,14 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> { ...@@ -325,12 +450,14 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
dim3 threads(std::min(output_width, thread), blocks, 1); dim3 threads(std::min(output_width, thread), blocks, 1);
dim3 grid(output_channels, batch_size, 1); dim3 grid(output_channels, batch_size, 1);
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
#define check_case(c_filter_multiplier, c_stride) \ #define check_case(c_filter_multiplier, c_stride, c_filter) \
if (c_filter_multiplier == 0 || \ if (c_filter_multiplier == 0 || \
filter_multiplier == c_filter_multiplier && \ filter_multiplier == c_filter_multiplier && \
stride_height == stride_width && stride_height == c_stride) { \ stride_height == stride_width && stride_height == c_stride && \
KernelDepthwiseConvSp<T, c_filter_multiplier, \ (ksize_height == ksize_width && ksize_height == c_filter || \
c_stride><<<grid, threads, 0, context.stream()>>>( \ c_filter == -1)) { \
KernelDepthwiseConvSp<T, c_filter_multiplier, c_stride, \
c_filter><<<grid, threads, 0, context.stream()>>>( \
input_data, filter_data, batch_size, output_channels, output_height, \ input_data, filter_data, batch_size, output_channels, output_height, \
output_width, input_channels, input_height, input_width, \ output_width, input_channels, input_height, input_width, \
filter_multiplier, ksize_height, ksize_width, stride_height, \ filter_multiplier, ksize_height, ksize_width, stride_height, \
...@@ -338,11 +465,17 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> { ...@@ -338,11 +465,17 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
dilate_width, output_data); \ dilate_width, output_data); \
return; \ return; \
} }
check_case(1, 1); check_case(1, 1, 3);
check_case(1, 2); check_case(1, 1, 5);
// NOTE(liangdun): 0,0 for other case check_case(1, 1, -1);
// add other case if needed, e.g. check_case(2^n,1) check_case(1, 2, 3);
check_case(0, 0); check_case(1, 2, 5);
check_case(1, 2, -1);
check_case(0, 0, 3);
check_case(0, 0, 5);
check_case(0, 0, -1);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
#undef check_case #undef check_case
} }
}; };
...@@ -384,13 +517,15 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> { ...@@ -384,13 +517,15 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
dim3 grid(input_channels, batch_size, 1); dim3 grid(input_channels, batch_size, 1);
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
#define check_case(c_filter_multiplier, c_stride) \ #define check_case(c_filter_multiplier, c_stride, c_filter) \
if (c_filter_multiplier == 0 || \ if (c_filter_multiplier == 0 || \
filter_multiplier == c_filter_multiplier && \ filter_multiplier == c_filter_multiplier && \
stride_height == stride_width && stride_height == c_stride) { \ stride_height == stride_width && stride_height == c_stride && \
(ksize_height == ksize_width && ksize_height == c_filter || \
c_filter == -1)) { \
KernelDepthwiseConvInputGradSp< \ KernelDepthwiseConvInputGradSp< \
T, c_filter_multiplier, \ T, c_filter_multiplier, c_stride, \
c_stride><<<grid, threads, 0, context.stream()>>>( \ c_filter><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, filter_data, batch_size, output_channels, \ output_grad_data, filter_data, batch_size, output_channels, \
output_height, output_width, input_channels, input_height, \ output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \ input_width, filter_multiplier, ksize_height, ksize_width, \
...@@ -398,11 +533,21 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> { ...@@ -398,11 +533,21 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
dilate_height, dilate_width, input_grad_data); \ dilate_height, dilate_width, input_grad_data); \
return; \ return; \
} }
check_case(1, 1); check_case(1, 1, 3);
check_case(1, 2); check_case(1, 1, 5);
// NOTE(liangdun): 0,0 for other case check_case(1, 1, -1);
// add other case if needed, e.g. check_case(2^n,1) check_case(1, 2, 3);
check_case(0, 0); check_case(1, 2, 5);
check_case(1, 2, -1);
check_case(2, 1, 3);
check_case(2, 1, 5);
check_case(2, 1, -1);
check_case(2, 2, 3);
check_case(2, 2, 5);
check_case(2, 2, -1);
check_case(0, 0, -1);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
#undef check_case #undef check_case
} }
}; };
......
...@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <map>
#include <set> #include <set>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle { namespace paddle {
...@@ -150,6 +151,45 @@ template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>; ...@@ -150,6 +151,45 @@ template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>;
template struct SelectedRowsAddTo<platform::CPUDeviceContext, int>; template struct SelectedRowsAddTo<platform::CPUDeviceContext, int>;
template struct SelectedRowsAddTo<platform::CPUDeviceContext, int64_t>; template struct SelectedRowsAddTo<platform::CPUDeviceContext, int64_t>;
// CPU specialization of SelectedRowsSumTo.
//
// Concatenates every SelectedRows in `input1` into `input2`:
//   * input2's row list is replaced by the rows of all inputs, in order;
//   * each input's value buffer is copied into input2's value buffer.
//
// `input2_offsets[i]` is ADDED to a running element offset before input i is
// copied, i.e. the offsets are cumulative deltas, not absolute positions.
// input2's value tensor is read through data<T>() and is therefore expected
// to be allocated by the caller before this functor runs.
//
// Enforces that every input has the same height as input2 and that there is
// at least one offset per input.
template <typename T>
struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const std::vector<framework::SelectedRows*>& input1,
                  const std::vector<int64_t>& input2_offsets,
                  framework::SelectedRows* input2) {
    // One offset is consumed per input in the copy loop below; reject a short
    // offset list instead of reading past its end.
    PADDLE_ENFORCE_GE(input2_offsets.size(), input1.size());

    // Ensure all selected rows have the same height, and count the total
    // number of rows so the concatenated list can be reserved up front.
    size_t size = 0u;
    for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
      auto& in_rows = (*iter)->rows();
      size += in_rows.end() - in_rows.begin();
      auto in1_height = (*iter)->height();
      PADDLE_ENFORCE_EQ(in1_height, input2->height());
    }

    // Concatenate the row ids of all inputs.
    std::vector<int64_t> in2_rows;
    in2_rows.reserve(in2_rows.size() + size);
    for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
      const framework::Vector<int64_t>& in_rows = (*iter)->rows();
      in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end());
    }
    input2->set_rows(in2_rows);

    auto* in2_value = input2->mutable_value();
    auto* in2_data = in2_value->data<T>();
    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);

    // Copy each input's values to its slot in input2.
    size_t offset = 0u;
    for (size_t i = 0u; i != input1.size(); ++i) {
      auto& in_value = input1[i]->value();
      const auto* in_data = in_value.data<T>();
      offset += input2_offsets[i];
      blas.VCOPY(in_value.numel(), in_data, in2_data + offset);
    }
  }
};
template struct SelectedRowsSumTo<platform::CPUDeviceContext, float>;
template struct SelectedRowsSumTo<platform::CPUDeviceContext, double>;
template <typename T> template <typename T>
struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> { struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
...@@ -207,35 +247,45 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -207,35 +247,45 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
const framework::SelectedRows& input, const framework::SelectedRows& input,
framework::SelectedRows* output) { framework::SelectedRows* output) {
framework::SelectedRows& out = *output; framework::SelectedRows& out = *output;
auto input_rows = input.rows(); std::vector<int64_t> input_rows(input.rows());
std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
auto input_width = input.value().dims()[1]; std::map<int64_t, std::vector<int64_t>> merge_row_map;
out.set_rows(merge_rows); for (size_t i = 0; i < input_rows.size(); ++i) {
merge_row_map[input_rows[i]].push_back(i);
}
std::vector<int64_t> merge_rows(merge_row_map.size());
size_t idx = 0;
int64_t input_width = input.value().dims()[1];
out.set_height(input.height()); out.set_height(input.height());
out.mutable_value()->mutable_data<T>(
T* out_data = out.mutable_value()->mutable_data<T>(
framework::make_ddim( framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}), {static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace()); context.GetPlace());
const T* in_data = input.value().data<T>();
math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), 0.0); for (auto& row_pair : merge_row_map) {
auto* out_ptr = out_data + idx * input_width;
auto* out_data = out.mutable_value()->data<T>(); auto& rows = row_pair.second;
auto* input_data = input.value().data<T>(); merge_rows[idx] = row_pair.first;
++idx;
for (size_t i = 0; i < input_rows.size(); i++) { // rows.size() is always larger than 0
size_t out_i = FindPos(merge_rows, input_rows[i]); std::memcpy(out_ptr, in_data + rows[0] * input_width,
for (int64_t j = 0; j < input_width; j++) { sizeof(T) * input_width);
out_data[out_i * input_width + j] += input_data[i * input_width + j];
for (size_t i = 1; i < rows.size(); ++i) {
auto* in_ptr = in_data + rows[i] * input_width;
for (int64_t j = 0; j < input_width; ++j) {
out_ptr[j] += in_ptr[j];
}
} }
} }
out.set_rows(merge_rows);
} }
}; };
template struct MergeAdd<platform::CPUDeviceContext, float>;
template struct MergeAdd<platform::CPUDeviceContext, double>;
template struct MergeAdd<platform::CPUDeviceContext, int>; template struct MergeAdd<platform::CPUDeviceContext, int>;
template struct MergeAdd<platform::CPUDeviceContext, int64_t>; template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
......
...@@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#define INLINE_FOR2(sizei, sizej) \ #define INLINE_FOR2(sizei, sizej) \
...@@ -49,6 +55,15 @@ struct SelectedRowsAddTo { ...@@ -49,6 +55,15 @@ struct SelectedRowsAddTo {
const int64_t input2_offset, framework::SelectedRows* input2); const int64_t input2_offset, framework::SelectedRows* input2);
}; };
// input2 = [all input in input1] + input2
//
// Functor that gathers every SelectedRows in `input1` into `input2`;
// `input2_offsets` supplies one value offset per entry of `input1`.
// Only the call operator is declared here: the body is provided by
// per-device specializations of this template.
// NOTE(review): how the offsets are interpreted (absolute vs. cumulative) is
// decided by each specialization - confirm against the implementation used.
template <typename DeviceContext, typename T>
struct SelectedRowsSumTo {
void operator()(const DeviceContext& context,
const std::vector<framework::SelectedRows*>& input1,
const std::vector<int64_t>& input2_offsets,
framework::SelectedRows* input2);
};
// input2 = input1 + input2 // input2 = input1 + input2
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct SelectedRowsAddToTensor { struct SelectedRowsAddToTensor {
...@@ -70,6 +85,104 @@ struct MergeAdd { ...@@ -70,6 +85,104 @@ struct MergeAdd {
framework::SelectedRows* output); framework::SelectedRows* output);
}; };
// MergeAdd specialization for float on CPU.
//
// Collapses duplicate row ids of a SelectedRows: the output holds each
// distinct row id once (in ascending order, as produced by iterating the
// std::map) and its value row is the sum of all input value rows that carried
// that id. Copying and accumulation go through the BLAS wrapper
// (VCOPY / AXPY).
template <>
struct MergeAdd<platform::CPUDeviceContext, float> {
  // Convenience overload: merges into a fresh SelectedRows and returns it.
  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
                                     const framework::SelectedRows& input) {
    framework::SelectedRows merged;
    this->operator()(context, input, &merged);
    return merged;
  }

  // Merges `input` into `*output`, setting its height, rows and value.
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::SelectedRows& input,
                  framework::SelectedRows* output) {
    const std::vector<int64_t> src_rows(input.rows());

    // row id -> positions inside `input` that carry this row id.
    std::map<int64_t, std::vector<int64_t>> positions_by_row;
    for (size_t pos = 0; pos < src_rows.size(); ++pos) {
      positions_by_row[src_rows[pos]].push_back(pos);
    }

    std::vector<int64_t> unique_rows(positions_by_row.size());
    const int64_t width = input.value().dims()[1];

    output->set_height(input.height());
    float* dst_base = output->mutable_value()->mutable_data<float>(
        framework::make_ddim(
            {static_cast<int64_t>(unique_rows.size()), width}),
        context.GetPlace());
    const float* src_base = input.value().data<float>();

    auto blas = GetBlas<platform::CPUDeviceContext, float>(context);

    size_t slot = 0;
    for (const auto& entry : positions_by_row) {
      float* dst = dst_base + slot * width;
      const std::vector<int64_t>& positions = entry.second;
      unique_rows[slot] = entry.first;
      ++slot;

      // Every map entry has at least one position: seed the destination row
      // with the first occurrence, then add the remaining ones onto it.
      auto pos_it = positions.begin();
      blas.VCOPY(width, src_base + (*pos_it) * width, dst);
      for (++pos_it; pos_it != positions.end(); ++pos_it) {
        blas.AXPY(width, 1., src_base + (*pos_it) * width, dst);
      }
    }

    output->set_rows(unique_rows);
  }
};
// MergeAdd specialization for double on CPU.
//
// Collapses duplicate row ids of a SelectedRows: the output holds each
// distinct row id once (in ascending order, as produced by iterating the
// std::map) and its value row is the sum of all input value rows that carried
// that id. Copying and accumulation go through the BLAS wrapper
// (VCOPY / AXPY).
template <>
struct MergeAdd<platform::CPUDeviceContext, double> {
  // Convenience overload: merges into a fresh SelectedRows and returns it.
  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
                                     const framework::SelectedRows& input) {
    framework::SelectedRows merged;
    this->operator()(context, input, &merged);
    return merged;
  }

  // Merges `input` into `*output`, setting its height, rows and value.
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::SelectedRows& input,
                  framework::SelectedRows* output) {
    const std::vector<int64_t> src_rows(input.rows());

    // row id -> positions inside `input` that carry this row id.
    std::map<int64_t, std::vector<int64_t>> positions_by_row;
    for (size_t pos = 0; pos < src_rows.size(); ++pos) {
      positions_by_row[src_rows[pos]].push_back(pos);
    }

    std::vector<int64_t> unique_rows(positions_by_row.size());
    const int64_t width = input.value().dims()[1];

    output->set_height(input.height());
    double* dst_base = output->mutable_value()->mutable_data<double>(
        framework::make_ddim(
            {static_cast<int64_t>(unique_rows.size()), width}),
        context.GetPlace());
    const double* src_base = input.value().data<double>();

    auto blas = GetBlas<platform::CPUDeviceContext, double>(context);

    size_t slot = 0;
    for (const auto& entry : positions_by_row) {
      double* dst = dst_base + slot * width;
      const std::vector<int64_t>& positions = entry.second;
      unique_rows[slot] = entry.first;
      ++slot;

      // Every map entry has at least one position: seed the destination row
      // with the first occurrence, then add the remaining ones onto it.
      auto pos_it = positions.begin();
      blas.VCOPY(width, src_base + (*pos_it) * width, dst);
      for (++pos_it; pos_it != positions.end(); ++pos_it) {
        blas.AXPY(width, 1., src_base + (*pos_it) * width, dst);
      }
    }

    output->set_rows(unique_rows);
  }
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct Add { struct Add {
framework::SelectedRows operator()(const DeviceContext& context, framework::SelectedRows operator()(const DeviceContext& context,
......
...@@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) { ...@@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) {
// row9: 2.0 + 3.0 // row9: 2.0 + 3.0
EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0);
} }
// MergeAdd<CPU, float>: input rows {0, 4, 4, 7}, every value 1.0.
// Expects output rows {0, 4, 7} with the duplicated row 4 summed to 2.0.
TEST(selected_rows_functor, cpu_merge_add_float) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CPUDeviceContext ctx(cpu_place);
  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
                                       float>
      functor;
  int64_t height = 10;
  int64_t row_numel = 10;

  std::vector<int64_t> rows{0, 4, 4, 7};
  std::unique_ptr<paddle::framework::SelectedRows> selected_rows{
      new paddle::framework::SelectedRows(rows, height)};
  auto* in_value = selected_rows->mutable_value();
  in_value->mutable_data<float>(
      paddle::framework::make_ddim(
          {static_cast<int64_t>(rows.size()), row_numel}),
      cpu_place);
  functor(ctx, in_value, 1.0);

  std::unique_ptr<paddle::framework::SelectedRows> output{
      new paddle::framework::SelectedRows()};

  paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
                                             float>
      merge_add_functor;
  merge_add_functor(ctx, *selected_rows, output.get());

  auto out_height = output->height();
  EXPECT_EQ(out_height, height);

  auto& out_rows = output->rows();
  // Stop here if the merged row count is wrong: the indexed reads below
  // would otherwise run past the end of the rows / value buffers.
  ASSERT_EQ(out_rows.size(), 3u);
  EXPECT_EQ(out_rows[0], 0);
  EXPECT_EQ(out_rows[1], 4);
  EXPECT_EQ(out_rows[2], 7);

  auto* out_data = output->value().data<float>();

  EXPECT_EQ(out_data[0 * row_numel], 1.0);
  EXPECT_EQ(out_data[1 * row_numel], 2.0);
  EXPECT_EQ(out_data[2 * row_numel], 1.0);
}
// MergeAdd<CPU, int>: input rows {0, 4, 4, 7}, every value 1.
// Expects output rows {0, 4, 7} with the duplicated row 4 summed to 2.
TEST(selected_rows_functor, cpu_merge_add_int) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CPUDeviceContext ctx(cpu_place);
  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, int>
      functor;
  int64_t height = 10;
  int64_t row_numel = 10;

  std::vector<int64_t> rows{0, 4, 4, 7};
  std::unique_ptr<paddle::framework::SelectedRows> selected_rows{
      new paddle::framework::SelectedRows(rows, height)};
  auto* in_value = selected_rows->mutable_value();
  in_value->mutable_data<int>(
      paddle::framework::make_ddim(
          {static_cast<int64_t>(rows.size()), row_numel}),
      cpu_place);
  functor(ctx, in_value, 1);

  std::unique_ptr<paddle::framework::SelectedRows> output{
      new paddle::framework::SelectedRows()};

  paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
                                             int>
      merge_add_functor;
  merge_add_functor(ctx, *selected_rows, output.get());

  auto out_height = output->height();
  EXPECT_EQ(out_height, height);

  auto& out_rows = output->rows();
  // Stop here if the merged row count is wrong: the indexed reads below
  // would otherwise run past the end of the rows / value buffers.
  ASSERT_EQ(out_rows.size(), 3u);
  EXPECT_EQ(out_rows[0], 0);
  EXPECT_EQ(out_rows[1], 4);
  EXPECT_EQ(out_rows[2], 7);

  auto* out_data = output->value().data<int>();

  EXPECT_EQ(out_data[0 * row_numel], 1);
  EXPECT_EQ(out_data[1 * row_numel], 2);
  EXPECT_EQ(out_data[2 * row_numel], 1);
}
// SelectedRowsSumTo<CPU, float>: concatenates two SelectedRows (values 1.0
// and 2.0) into one output, then adds that output onto a dense tensor filled
// with 3.0 to check the per-row sums.
TEST(selected_rows_functor, cpu_sum_to) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CPUDeviceContext ctx(cpu_place);
  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
                                       float>
      functor;
  int64_t height = 10;
  int64_t row_numel = 10;

  std::vector<int64_t> rows1{0, 4, 7};
  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
      new paddle::framework::SelectedRows(rows1, height)};
  auto* in1_value = selected_rows1->mutable_value();
  in1_value->mutable_data<float>(
      paddle::framework::make_ddim(
          {static_cast<int64_t>(rows1.size()), row_numel}),
      cpu_place);
  functor(ctx, in1_value, 1.0);

  std::vector<int64_t> rows2{0, 5, 7, 9};
  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
      new paddle::framework::SelectedRows(rows2, height)};
  auto* in2_value = selected_rows2->mutable_value();
  in2_value->mutable_data<float>(
      paddle::framework::make_ddim(
          {static_cast<int64_t>(rows2.size()), row_numel}),
      cpu_place);
  functor(ctx, in2_value, 2.0);

  std::unique_ptr<paddle::framework::SelectedRows> output{
      new paddle::framework::SelectedRows()};
  output->set_height(height);
  auto* out_value = output->mutable_value();

  // Simply concatenate the two SelectedRows: the output needs one value row
  // per input row, so derive its shape from the inputs instead of
  // hard-coding it.
  const size_t total_rows = rows1.size() + rows2.size();
  out_value->mutable_data<float>(
      paddle::framework::make_ddim(
          {static_cast<int64_t>(total_rows), row_numel}),
      cpu_place);

  paddle::operators::math::SelectedRowsSumTo<paddle::platform::CPUDeviceContext,
                                             float>
      sum_to_functor;
  // Offsets are element offsets into the output value buffer: the first
  // input starts at 0, the second right after the first input's elements.
  sum_to_functor(ctx, std::vector<paddle::framework::SelectedRows*>(
                          {selected_rows1.get(), selected_rows2.get()}),
                 std::vector<int64_t>({0, in1_value->numel()}), output.get());

  auto out_height = output->height();
  EXPECT_EQ(out_height, height);

  auto& out_rows = output->rows();
  // Stop here if the concatenated row count is wrong: the indexed reads
  // below would otherwise run past the end of the rows buffer.
  ASSERT_EQ(out_rows.size(), total_rows);

  // input1 rows
  EXPECT_EQ(out_rows[0], 0);
  EXPECT_EQ(out_rows[1], 4);
  EXPECT_EQ(out_rows[2], 7);
  // input2 rows
  EXPECT_EQ(out_rows[3], 0);
  EXPECT_EQ(out_rows[4], 5);
  EXPECT_EQ(out_rows[5], 7);
  EXPECT_EQ(out_rows[6], 9);

  auto* out_data = output->value().data<float>();
  // input1 value
  EXPECT_EQ(out_data[0 * row_numel + 0], 1.0);
  EXPECT_EQ(out_data[0 * row_numel + 8], 1.0);
  EXPECT_EQ(out_data[1 * row_numel + 1], 1.0);
  EXPECT_EQ(out_data[2 * row_numel + 6], 1.0);
  // input2 value
  EXPECT_EQ(out_data[3 * row_numel + 3], 2.0);
  EXPECT_EQ(out_data[3 * row_numel + 8], 2.0);
  EXPECT_EQ(out_data[4 * row_numel + 4], 2.0);
  EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
  EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);

  std::unique_ptr<paddle::framework::Tensor> tensor1{
      new paddle::framework::Tensor()};
  tensor1->mutable_data<float>(
      paddle::framework::make_ddim({height, row_numel}), cpu_place);
  functor(ctx, tensor1.get(), 3.0);

  paddle::operators::math::SelectedRowsAddToTensor<
      paddle::platform::CPUDeviceContext, float>
      add_to_tensor_functor;
  add_to_tensor_functor(ctx, *output, tensor1.get());

  auto* tensor1_data = tensor1->data<float>();
  // row0: 1.0 + 2.0 + 3.0
  EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0);
  // row1: 3.0
  EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0);
  // row4: 1.0 + 3.0
  EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0);
  // row5: 2.0 + 3.0
  EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0);
  // row6: 3.0
  EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0);
  // row7: 1.0 + 2.0 + 3.0
  EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0);
  // row9: 2.0 + 3.0
  EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0);
}
...@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/sequence_pooling.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -180,6 +182,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> { ...@@ -180,6 +182,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
} }
auto lod = input.lod()[0]; auto lod = input.lod()[0];
auto& place = *context.eigen_device(); auto& place = *context.eigen_device();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
Tensor in_t = Tensor in_t =
input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1])); input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
...@@ -191,7 +194,14 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> { ...@@ -191,7 +194,14 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
if (pooltype == "AVERAGE") { if (pooltype == "AVERAGE") {
out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}})); out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "SUM") { } else if (pooltype == "SUM") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})); if (h > 0) {
const T* in_data = in_t.data<T>();
T* out_data = out_t.mutable_data<T>(context.GetPlace());
blas.VCOPY(w, in_data, out_data);
for (int64_t r = 1; r != h; ++r) {
blas.AXPY(w, 1., in_data + r * w, out_data);
}
}
} else if (pooltype == "SQRT") { } else if (pooltype == "SQRT") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) / out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h)); std::sqrt(static_cast<T>(h));
...@@ -223,6 +233,7 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> { ...@@ -223,6 +233,7 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
} }
auto lod = in_grad->lod()[0]; auto lod = in_grad->lod()[0];
auto& place = *context.eigen_device(); auto& place = *context.eigen_device();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]), auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
static_cast<int>(lod[i + 1])); static_cast<int>(lod[i + 1]));
...@@ -237,7 +248,11 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> { ...@@ -237,7 +248,11 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
if (pooltype == "AVERAGE") { if (pooltype == "AVERAGE") {
in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast); in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
} else if (pooltype == "SUM") { } else if (pooltype == "SUM") {
in_g_e.device(place) = (out_g_e).broadcast(bcast); const T* out_g_data = out_g_t.data<T>();
T* in_g_data = in_g_t.mutable_data<T>(context.GetPlace());
for (int r = 0; r != h; ++r) {
blas.VCOPY(w, out_g_data, in_g_data + r * w);
}
} else if (pooltype == "SQRT") { } else if (pooltype == "SQRT") {
in_g_e.device(place) = in_g_e.device(place) =
(out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast); (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
......
...@@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel { ...@@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel {
"Input(velocity) of Momentum should not be null."); "Input(velocity) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"), PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of Momentum should not be null."); "Input(LearningRate) of Momentum should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of Momentum should not be null."); "Output(ParamOut) of Momentum should not be null.");
......
...@@ -31,8 +31,8 @@ class BlockingQueue { ...@@ -31,8 +31,8 @@ class BlockingQueue {
// is a workaround and a simplified version of framework::Channel as it // is a workaround and a simplified version of framework::Channel as it
// doesn't support GPU and it implements on buffered blocking queue. // doesn't support GPU and it implements on buffered blocking queue.
public: public:
explicit BlockingQueue(size_t capacity) explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
: capacity_(capacity), closed_(false) { : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
capacity_, 0, capacity_, 0,
"The capacity of a reader::BlockingQueue must be greater than 0."); "The capacity of a reader::BlockingQueue must be greater than 0.");
...@@ -72,7 +72,9 @@ class BlockingQueue { ...@@ -72,7 +72,9 @@ class BlockingQueue {
if (!queue_.empty()) { if (!queue_.empty()) {
PADDLE_ENFORCE_NOT_NULL(elem); PADDLE_ENFORCE_NOT_NULL(elem);
*elem = queue_.front(); *elem = queue_.front();
queue_.pop_front(); if (LIKELY(!speed_test_mode_)) {
queue_.pop_front();
}
send_cv_.notify_one(); send_cv_.notify_one();
return true; return true;
} else { } else {
...@@ -114,6 +116,7 @@ class BlockingQueue { ...@@ -114,6 +116,7 @@ class BlockingQueue {
private: private:
size_t capacity_; size_t capacity_;
bool speed_test_mode_;
bool closed_; bool closed_;
std::deque<T> queue_; std::deque<T> queue_;
......
...@@ -33,8 +33,9 @@ class LoDTensorBlockingQueue { ...@@ -33,8 +33,9 @@ class LoDTensorBlockingQueue {
private: private:
LoDTensorBlockingQueue(size_t capacity, LoDTensorBlockingQueue(size_t capacity,
const std::vector<framework::DDim>& dims) const std::vector<framework::DDim>& dims,
: queue_(capacity), dims_(dims) {} bool speed_test_mode = false)
: queue_(capacity, speed_test_mode), dims_(dims) {}
public: public:
bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) { bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
...@@ -69,11 +70,12 @@ class LoDTensorBlockingQueue { ...@@ -69,11 +70,12 @@ class LoDTensorBlockingQueue {
class LoDTensorBlockingQueueHolder { class LoDTensorBlockingQueueHolder {
public: public:
void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) { void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims,
bool speed_test_mode = false) {
PADDLE_ENFORCE( PADDLE_ENFORCE(
queue_ == nullptr, queue_ == nullptr,
"LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode));
} }
inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
......
...@@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) { ...@@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) {
q.Receive(&b); q.Receive(&b);
EXPECT_EQ(a.val_, b.val_); EXPECT_EQ(a.val_, b.val_);
} }
// Exercises both dequeue policies of BlockingQueue:
//  * speed_test_mode == false: Receive() removes the front element, so the
//    queue drains in FIFO order and ends up empty.
//  * speed_test_mode == true: Receive() copies the front element but leaves
//    it in place, so every Receive() yields the first element that was sent
//    and the queue keeps its size.
TEST(BlockingQueue, speed_test_mode) {
  const size_t queue_size = 10;
  // Typed zero so that EXPECT_EQ never compares size_t against a signed int.
  const size_t zero = 0;

  BlockingQueue<size_t> q1(queue_size, false);
  for (size_t i = 0; i < queue_size; ++i) {
    q1.Send(i);
  }
  // Initialised so a failed Receive() cannot make EXPECT_EQ read garbage.
  size_t b = 0;
  for (size_t i = 0; i < queue_size; ++i) {
    q1.Receive(&b);
    EXPECT_EQ(b, i);
  }
  EXPECT_EQ(q1.Size(), zero);

  BlockingQueue<size_t> q2(queue_size, true);
  for (size_t i = 0; i < queue_size; ++i) {
    q2.Send(i);
  }
  for (size_t i = 0; i < queue_size; ++i) {
    q2.Receive(&b);
    // The front element (0) is never popped in speed-test mode.
    EXPECT_EQ(b, zero);
  }
  EXPECT_EQ(q2.Size(), queue_size);
}
...@@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of ...@@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of
[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
Attr(shape) still should be set correctly to gurantee shape inference in Attr(shape) still should be set correctly to gurantee shape inference in
compile-time. compile-time.
)DOC"); )DOC");
...@@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp { ...@@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp {
: ReshapeOp(type, inputs, outputs, attrs) {} : ReshapeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
ReshapeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"), PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of ReshapeOp should not be null."); "Output(XShape) of ReshapeOp should not be null.");
const auto &x_dims = ctx->GetInputDim("X"); const auto &x_dims = ctx->GetInputDim("X");
...@@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp { ...@@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp {
} }
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape"); ctx->ShareLoD("X", /*->*/ "XShape");
ReshapeOp::InferShape(ctx);
} }
}; };
......
...@@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel { ...@@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel {
"Input(Grad) of RmspropOp should not be null."); "Input(Grad) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Moment"), PADDLE_ENFORCE(ctx->HasInput("Moment"),
"Input(Moment) of RmspropOp should not be null."); "Input(Moment) of RmspropOp should not be null.");
// The Param input must be a LoDTensor. Two values are handed to the message
// (the variable name and the type that was actually received), so the format
// string carries one placeholder for each.
PADDLE_ENFORCE(
    ctx->GetInputsVarType("Param").front() ==
        framework::proto::VarType::LOD_TENSOR,
    "The input var(%s)'s type should be LoDTensor, but the received is %s",
    ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(param_out) of RmspropOp should not be null."); "Output(param_out) of RmspropOp should not be null.");
......
...@@ -13,66 +13,254 @@ See the License for the specific language governing permissions and ...@@ -13,66 +13,254 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>; using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
// Gradient accessor for a dense gradient buffer: the gradient of flat element
// `idx` is simply the idx-th entry of the array given at construction.
template <typename T>
struct DenseRmspropGradFunctor {
  inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}

  // Returns the gradient value stored at flat position `idx`.
  HOSTDEVICE inline T operator()(int64_t idx) const {
    const T *entry = grad_ + idx;
    return *entry;
  }

  // Start of the gradient array; this struct only reads through it.
  const T *grad_;
};
// Gradient accessor for a row-sparse gradient.
//
// `grad` holds `row_count` rows of `row_numel` values each, and `rows` holds
// the parameter-row index that each stored row belongs to. For a flat
// parameter index `idx` the functor looks up row `idx / row_numel` in `rows`
// and returns the matching stored value, or zero when that row is not
// present in the sparse gradient.
template <typename T>
struct SparseRmspropGradFunctor {
  inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows,
                                  int64_t row_numel, int64_t row_count)
      : grad_(grad),
        rows_(rows),
        row_numel_(row_numel),
        row_count_(row_count) {}

  HOSTDEVICE inline T operator()(int64_t idx) const {
    // NOTE(review): relies on math::BinarySearch returning a negative value
    // when the row is absent, and presumably on `rows_` being sorted --
    // confirm against math/algorithm.h.
    auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_);
    // Both branches are of type T, so the fallback does not depend on an
    // implicit int -> T conversion inside the conditional operator.
    return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_]
                        : static_cast<T>(0);
  }

  const T *grad_;        // stored gradient rows, row_count_ * row_numel_ values
  const int64_t *rows_;  // parameter-row index of each stored row
  int64_t row_numel_;    // number of values per row
  int64_t row_count_;    // number of stored rows
};
// Element-wise RMSProp step without gradient centering.
//
// For element `idx`, with g = grad_functor(idx):
//   ms  <- rho * ms + (1 - rho) * g^2
//   mom <- momentum * mom + lr[0] * g / sqrt(ms + epsilon)
//   param <- param - mom
// All three arrays are updated in place through the pointers passed in.
template <typename T, typename GradFunctor>
struct UncenteredRmspropFunctor {
  UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho,
                           T epsilon, T momentum,
                           const GradFunctor &grad_functor)
      : param_(param),
        ms_(ms),
        mom_(mom),
        lr_(lr),
        rho_(rho),
        epsilon_(epsilon),
        momentum_(momentum),
        grad_functor_(grad_functor) {}

  // Applies one update step to element `idx`.
  HOSTDEVICE inline void operator()(int64_t idx) const {
    const T grad_val = grad_functor_(idx);
    const T new_ms = rho_ * ms_[idx] + (1 - rho_) * grad_val * grad_val;
    const T new_mom =
        momentum_ * mom_[idx] + lr_[0] * grad_val / sqrt(new_ms + epsilon_);
    param_[idx] -= new_mom;
    ms_[idx] = new_ms;
    mom_[idx] = new_mom;
  }

  T *param_;                  // parameter values, updated in place
  T *ms_;                     // running mean of squared gradients
  T *mom_;                    // momentum accumulator
  const T *lr_;               // learning rate; only lr_[0] is read
  T rho_;                     // decay factor of the running mean
  T epsilon_;                 // added under the square root
  T momentum_;                // momentum coefficient
  GradFunctor grad_functor_;  // maps idx -> gradient value
};
// Element-wise RMSProp step with gradient centering.
//
// For element `idx`, with g = grad_functor(idx):
//   ms  <- rho * ms + (1 - rho) * g^2
//   mg  <- rho * mg + (1 - rho) * g
//   mom <- momentum * mom + lr[0] * g / sqrt(ms - mg^2 + epsilon)
//   param <- param - mom
// All four arrays are updated in place through the pointers passed in.
template <typename T, typename GradFunctor>
struct CenteredRmspropFunctor {
  CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr,
                         T rho, T epsilon, T momentum,
                         const GradFunctor &grad_functor)
      : param_(param),
        ms_(ms),
        mom_(mom),
        mean_grad_(mean_grad),
        lr_(lr),
        rho_(rho),
        epsilon_(epsilon),
        momentum_(momentum),
        grad_functor_(grad_functor) {}

  // Applies one update step to element `idx`.
  HOSTDEVICE inline void operator()(int64_t idx) const {
    const T grad_val = grad_functor_(idx);
    const T new_ms = rho_ * ms_[idx] + (1 - rho_) * grad_val * grad_val;
    const T new_mg = rho_ * mean_grad_[idx] + (1 - rho_) * grad_val;
    const T new_mom =
        momentum_ * mom_[idx] +
        lr_[0] * grad_val / sqrt(new_ms - new_mg * new_mg + epsilon_);
    param_[idx] -= new_mom;
    ms_[idx] = new_ms;
    mom_[idx] = new_mom;
    mean_grad_[idx] = new_mg;
  }

  T *param_;                  // parameter values, updated in place
  T *ms_;                     // running mean of squared gradients
  T *mom_;                    // momentum accumulator
  T *mean_grad_;              // running mean of gradients
  const T *lr_;               // learning rate; only lr_[0] is read
  T rho_;                     // decay factor of both running means
  T epsilon_;                 // added under the square root
  T momentum_;                // momentum coefficient
  GradFunctor grad_functor_;  // maps idx -> gradient value
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class RmspropOpKernel : public framework::OpKernel<T> { class RmspropOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto* param_out = ctx.Output<Tensor>("ParamOut"); using LoDTensor = framework::LoDTensor;
auto* moment_out = ctx.Output<Tensor>("MomentOut"); auto *grad_var = ctx.InputVar("Grad");
auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut"); auto *param_out = ctx.Output<LoDTensor>("ParamOut");
auto *moment_out = ctx.Output<LoDTensor>("MomentOut");
auto *mean_square_out = ctx.Output<LoDTensor>("MeanSquareOut");
auto grad = ctx.Input<Tensor>("Grad"); auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto rho = static_cast<T>(ctx.Attr<float>("decay"));
auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
bool centered = ctx.Attr<bool>("centered");
param_out->mutable_data<T>(ctx.GetPlace()); auto &p_tensor = *ctx.Input<LoDTensor>("Param");
moment_out->mutable_data<T>(ctx.GetPlace()); auto &ms_tensor = *ctx.Input<LoDTensor>("MeanSquare");
mean_square_out->mutable_data<T>(ctx.GetPlace()); auto &lr_tensor = *ctx.Input<LoDTensor>("LearningRate");
auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
float epsilon = ctx.Attr<float>("epsilon"); PADDLE_ENFORCE_EQ(&p_tensor, param_out,
float rho = ctx.Attr<float>("decay"); "Param and ParamOut must be the same Tensor");
float momentum = ctx.Attr<float>("momentum"); PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
bool centered = ctx.Attr<bool>("centered"); "Moment and MomentOut must be the same Tensor");
PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
"MeanSquare and MeanSquareOut must be the same Tensor");
auto &dev_ctx = ctx.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(ms_tensor.numel());
if (grad_var->IsType<LoDTensor>()) {
// Dense gradient: Grad is a LoDTensor.
auto &grad_tensor = grad_var->Get<LoDTensor>();
if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value) {
  // CPU device context: express the update as whole-tensor Eigen
  // expressions evaluated on the context's Eigen device.
  auto &place =
      *ctx.template device_context<DeviceContext>().eigen_device();
  auto lr_value = lr_tensor.data<T>()[0];

  auto p = EigenVector<T>::Flatten(p_tensor);
  auto ms = EigenVector<T>::Flatten(ms_tensor);
  auto g = EigenVector<T>::Flatten(grad_tensor);
  auto mom = EigenVector<T>::Flatten(mom_tensor);

  auto p_out = EigenVector<T>::Flatten(*param_out);
  auto mom_out = EigenVector<T>::Flatten(*moment_out);
  auto ms_out = EigenVector<T>::Flatten(*mean_square_out);

  ms_out.device(place) = rho * ms + (1 - rho) * g * g;
  if (centered) {
    auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
    auto mg = EigenVector<T>::Flatten(mg_tensor);
    auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
    // Must be an equality check: plain PADDLE_ENFORCE would only test that
    // &mg_tensor is non-null, which is always true.
    PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
                      "MeanGrad and MeanGradOut must be the same Tensor");
    auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);

    mg_out.device(place) = rho * mg + (1 - rho) * g;
    mom_out.device(place) =
        momentum * mom +
        lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
  } else {
    mom_out.device(place) =
        momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
  }
  p_out.device(place) = p - mom_out;
} else {
  // Any other device context: run the element-wise functors over
  // [0, limit) with ForRange. The functors update their buffers in place.
  DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
  platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
  if (centered) {
    auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
    auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
    // The functor reads and writes the mean gradient through one pointer,
    // so input and output really have to be the same tensor.
    PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
                      "MeanGrad and MeanGradOut must be the same Tensor");
    for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
        param_out->mutable_data<T>(ctx.GetPlace()),
        mean_square_out->mutable_data<T>(ctx.GetPlace()),
        moment_out->mutable_data<T>(ctx.GetPlace()),
        mean_grad_out->mutable_data<T>(ctx.GetPlace()),
        lr_tensor.data<T>(), rho, epsilon, momentum, grad_func));
  } else {
    for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
        param_out->mutable_data<T>(ctx.GetPlace()),
        mean_square_out->mutable_data<T>(ctx.GetPlace()),
        moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
        rho, epsilon, momentum, grad_func));
  }
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto &grad = grad_var->Get<framework::SelectedRows>();
auto *merged_grad = const_cast<framework::Scope &>(ctx.scope())
.Var()
->GetMutable<framework::SelectedRows>();
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(dev_ctx, grad, merged_grad);
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
const int64_t *rows;
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = merged_grad->rows().CUDAData(ctx.GetPlace());
} else {
#endif
rows = merged_grad->rows().data();
#ifdef PADDLE_WITH_CUDA
}
#endif
auto &merged_tensor = merged_grad->value();
int64_t row_count = merged_grad->rows().size();
int64_t row_numel = merged_tensor.numel() / row_count;
SparseRmspropGradFunctor<T> grad_func(merged_tensor.data<T>(), rows,
row_numel, row_count);
auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param")); if (centered) {
auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare")); auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate")); auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
auto g = EigenVector<T>::Flatten(*grad); PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment")); "MeanGrad and MeanGradOut must be the same Tensor");
for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
auto p_out = EigenVector<T>::Flatten(*param_out); param_out->mutable_data<T>(ctx.GetPlace()),
auto mom_out = EigenVector<T>::Flatten(*moment_out); mean_square_out->mutable_data<T>(ctx.GetPlace()),
auto ms_out = EigenVector<T>::Flatten(*mean_square_out); moment_out->mutable_data<T>(ctx.GetPlace()),
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); mean_grad_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel())); } else {
for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
ms_out.device(place) = rho * ms + (1 - rho) * g * g; param_out->mutable_data<T>(ctx.GetPlace()),
if (centered) { mean_square_out->mutable_data<T>(ctx.GetPlace()),
auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad")); moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut"); rho, epsilon, momentum, grad_func));
mean_grad_out->mutable_data<T>(ctx.GetPlace()); }
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) = momentum * mom +
lr.broadcast(grad_dsize) * g /
(ms_out - mg_out.square() + epsilon).sqrt();
} else { } else {
mom_out.device(place) = PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
momentum * mom +
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
} }
p_out.device(place) = p - mom_out;
} }
}; };
......
...@@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, ...@@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel,
paddle::framework::DefaultGradOpDescMaker<false>); paddle::framework::DefaultGradOpDescMaker<false>);
template <typename T> template <typename T>
using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>; using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>); REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
Kernel<int64_t>);
REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel,
op::SeqConcatGradShapeInferer); op::SeqConcatGradShapeInferer);
template <typename T> template <typename T>
using GradKernel = using GradKernel =
op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>; op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>;
REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>, REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>,
GradKernel<double>); GradKernel<double>, GradKernel<int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_unpad_op.h"
namespace paddle {
namespace operators {
class SequenceUnpadOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequenceUnpadOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Length"),
"Input(Length) of SequenceUnpadOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequenceUnpadOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(x_dims.size(), 2,
"The rank of Input(X) can't be less than 2.");
auto len_dims = ctx->GetInputDim("Length");
PADDLE_ENFORCE(len_dims.size() == 2 && len_dims[1] == 1,
"The shape of Input(Length) should be [batch_size, 1].");
PADDLE_ENFORCE(
len_dims[0] == x_dims[0],
"Input(X) and Input(Length) should have the same first dimension.");
int64_t out_dim_0 = -1;
if (ctx->IsRuntime()) {
out_dim_0 = x_dims[0] * x_dims[1];
}
std::vector<int64_t> out_dims_vec{out_dim_0};
if (x_dims.size() == 2) {
out_dims_vec.push_back(1);
} else {
for (size_t i = 2; i < x_dims.size(); ++i) {
out_dims_vec.push_back(x_dims[i]);
}
}
ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
// Declares the inputs, output and user-facing documentation of the
// sequence_unpad operator.
class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(LoDTensor, default LoDTensor<float>) Input tensor which "
             "contains the padded sequences with equal length.");
    AddInput("Length",
             "(LoDTensor) The input tensor which specifies the actual length of "
             "sequences after unpadding.");
    AddOutput(
        "Out",
        "(LoDTensor) The output tensor which contains unpadded sequences.");
    AddComment(R"DOC(
Sequence Unpad Operator
This operator removes the padding data in the input sequences and converts
them into sequences with actual length as output, identified by lod
information.
Example:
Given input tensor Input(X):
X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0],
[ 6.0, 7.0, 8.0, 9.0, 10.0],
[11.0, 12.0, 13.0, 14.0, 15.0]],
in which there are 3 sequences padded to length 5, and the actual length
specified by Input(Length):
Length.data = [[2], [3], [4]],
after unpadding, Output(Out) will be:
Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
Out.lod = [[0, 2, 5, 9]]
)DOC");
  }
};
// Gradient operator definition for sequence_unpad.
class SequenceUnpadGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires X and Out@GRAD. When X@GRAD is an output of the op it receives
  // the dimensions and the LoD of X; otherwise nothing is set.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SequenceUnpadGradOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput(framework::GradVarName("Out")),
        "Input(Out@GRAD) of SequenceUnpadGradOp should not be null.");

    const auto x_grad_name = framework::GradVarName("X");
    if (!ctx->HasOutput(x_grad_name)) {
      return;
    }
    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
    ctx->ShareLoD("X", /*->*/ x_grad_name);
  }

 protected:
  // The kernel is selected by the data type of X and the current device.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.device_context());
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the forward operator together with its proto maker and the
// default gradient-op description maker, then the gradient operator itself.
REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp,
ops::SequenceUnpadOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp);
// CPU forward kernels for float, double, int and int64_t.
REGISTER_OP_CPU_KERNEL(
sequence_unpad,
ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
// CPU gradient kernels for the same four element types.
REGISTER_OP_CPU_KERNEL(
sequence_unpad_grad,
ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext,
int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_unpad_op.h"
namespace ops = paddle::operators;
// CUDA forward kernels for float, double, int and int64_t.
REGISTER_OP_CUDA_KERNEL(
sequence_unpad,
ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
// CUDA gradient kernels for the same four element types.
REGISTER_OP_CUDA_KERNEL(
sequence_unpad_grad,
ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext,
int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using LoD = framework::LoD;
// Forward kernel of sequence_unpad.
//
// Reads the per-sequence lengths from Length, builds a level-0 LoD for Out
// from their running sum, resizes Out to [total_length, ...] and lets
// math::UnpaddingLoDTensorFunctor copy the data out of the padded input X.
template <typename DeviceContext, typename T>
class SequenceUnpadOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x_t = ctx.Input<LoDTensor>("X");
    auto* len_t = ctx.Input<LoDTensor>("Length");
    auto* out_t = ctx.Output<LoDTensor>("Out");
    out_t->mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<DeviceContext>();

    // The lengths are read on the host below. On a GPU place they are first
    // copied into seq_len_cpu, which is declared at function scope so that
    // the buffer seq_len_ptr points into stays alive while it is being read.
    LoDTensor seq_len_cpu;
    const int64_t* seq_len_ptr = nullptr;
    if (platform::is_gpu_place(ctx.GetPlace())) {
      seq_len_cpu.Resize(len_t->dims());
      seq_len_ptr = seq_len_cpu.mutable_data<int64_t>(platform::CPUPlace());
      framework::TensorCopy(*len_t, platform::CPUPlace(), dev_ctx,
                            &seq_len_cpu);
      // NOTE(review): TensorCopy is handed the device context and may only
      // enqueue the copy; wait so the host never reads a half-filled buffer.
      dev_ctx.Wait();
    } else {
      seq_len_ptr = len_t->data<int64_t>();
    }

    // out_lod0[i] is the offset of sequence i in Out (prefix sum of lengths).
    size_t batch_size = x_t->dims()[0];
    std::vector<size_t> out_lod0(batch_size + 1, 0);
    for (size_t i = 0; i < batch_size; ++i) {
      out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i];
    }

    framework::LoD out_lod;
    out_lod.push_back(out_lod0);
    out_t->set_lod(out_lod);

    // Out is [total_length, x_dims[2], ...], or [total_length, 1] when X has
    // rank 2. Signed rank/index avoid comparing size_t with dims().size().
    std::vector<int64_t> out_dims_vec{static_cast<int64_t>(out_lod0.back())};
    const int x_rank = static_cast<int>(x_t->dims().size());
    if (x_rank == 2) {
      out_dims_vec.push_back(1);
    } else {
      for (int i = 2; i < x_rank; ++i) {
        out_dims_vec.push_back(x_t->dims()[i]);
      }
    }
    out_t->Resize(framework::make_ddim(out_dims_vec));

    int64_t padded_length = x_t->dims()[1];
    math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
        dev_ctx, *x_t, out_t, padded_length, 0, false,
        math::kBatchLengthWidth);
  }
};
// Backward kernel of sequence_unpad: pads Out@GRAD back to the layout of X
// via math::PaddingLoDTensorFunctor, filling the padded positions with zero,
// and stores the result in X@GRAD. Does nothing when X@GRAD is not requested.
template <typename DeviceContext, typename T>
class SequenceUnpadGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
    if (!d_x) {
      return;
    }

    const auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
    const auto* x_t = ctx.Input<LoDTensor>("X");
    d_x->mutable_data<T>(ctx.GetPlace());

    // The padded sequence length is the second dimension of X.
    int padded_length = x_t->dims()[1];

    // A 1x1 tensor holding zero, used as the padding value.
    LoDTensor zero_pads;
    zero_pads.Resize({1, 1});
    zero_pads.mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> fill_zero;
    fill_zero(dev_ctx, &zero_pads, static_cast<T>(0));

    math::PaddingLoDTensorFunctor<DeviceContext, T> pad_functor;
    pad_functor(dev_ctx, *d_out, d_x, zero_pads, padded_length, 0, false,
                math::kBatchLengthWidth);
  }
};
} // namespace operators
} // namespace paddle
...@@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"), PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of SGDOp should not be null."); "Input(Param) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"), PADDLE_ENFORCE(ctx->HasInput("Grad"),
...@@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext &ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
return framework::OpKernelType(data_type, ctx.device_context()); return framework::OpKernelType(data_type, ctx.device_context());
} }
...@@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel {
class SGDOpInferVarType : public framework::VarTypeInference { class SGDOpInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc* block) const override { framework::BlockDesc *block) const override {
auto input_var = op_desc.Input("Param")[0]; auto input_var_n = op_desc.Input("Param")[0];
for (auto& out_var : op_desc.Output("ParamOut")) { auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType();
if (block->FindRecursiveOrCreateVar(input_var).GetType() == PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
framework::proto::VarType::SELECTED_ROWS) { in_var_type == framework::proto::VarType::LOD_TENSOR,
block->FindRecursiveOrCreateVar(out_var).SetType( "The input Var's type should be LoDtensor or SelectedRows,"
framework::proto::VarType::SELECTED_ROWS); " but the received var(%s)'s type is %s",
} else { input_var_n, in_var_type);
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::LOD_TENSOR); for (auto &out_var_n : op_desc.Output("ParamOut")) {
auto &out_var = block->FindRecursiveOrCreateVar(out_var_n);
if (out_var.GetType() != in_var_type) {
out_var.SetType(in_var_type);
} }
} }
} }
......
...@@ -56,6 +56,12 @@ template <typename T> ...@@ -56,6 +56,12 @@ template <typename T>
class SGDOpCUDAKernel : public framework::OpKernel<T> { class SGDOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
auto* param = ctx.Input<framework::Tensor>("Param"); auto* param = ctx.Input<framework::Tensor>("Param");
auto* param_out = ctx.Output<framework::Tensor>("ParamOut"); auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate"); auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
......
...@@ -23,14 +23,14 @@ namespace operators { ...@@ -23,14 +23,14 @@ namespace operators {
template <typename T> template <typename T>
class CPUUniformRandomKernel : public framework::OpKernel<T> { class CPUUniformRandomKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
framework::Tensor* tensor = nullptr; framework::Tensor *tensor = nullptr;
auto out_var = ctx.OutputVar("Out"); auto out_var = ctx.OutputVar("Out");
if (out_var->IsType<framework::LoDTensor>()) { if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>(); tensor = out_var->GetMutable<framework::LoDTensor>();
} else if (out_var->IsType<framework::SelectedRows>()) { } else if (out_var->IsType<framework::SelectedRows>()) {
auto shape = ctx.Attr<std::vector<int>>("shape"); auto shape = ctx.Attr<std::vector<int>>("shape");
auto* selected_rows = out_var->GetMutable<framework::SelectedRows>(); auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
tensor = selected_rows->mutable_value(); tensor = selected_rows->mutable_value();
tensor->Resize(framework::make_ddim(shape)); tensor->Resize(framework::make_ddim(shape));
selected_rows->mutable_rows()->reserve(shape[0]); selected_rows->mutable_rows()->reserve(shape[0]);
...@@ -39,7 +39,7 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> { ...@@ -39,7 +39,7 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
"uniform_random_op's output only" "uniform_random_op's output only"
"supports SelectedRows and LoDTensor"); "supports SelectedRows and LoDTensor");
} }
T* data = tensor->mutable_data<T>(ctx.GetPlace()); T *data = tensor->mutable_data<T>(ctx.GetPlace());
unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed")); unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
std::minstd_rand engine; std::minstd_rand engine;
if (seed == 0) { if (seed == 0) {
...@@ -60,14 +60,14 @@ class UniformRandomOp : public framework::OperatorWithKernel { ...@@ -60,14 +60,14 @@ class UniformRandomOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of UniformRandomOp should not be null."); "Output(Out) of UniformRandomOp should not be null.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"), ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
"uniform_random's min must less then max"); "uniform_random's min must less then max");
auto& shape = ctx->Attrs().Get<std::vector<int>>("shape"); auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
std::vector<int64_t> temp; std::vector<int64_t> temp;
temp.reserve(shape.size()); temp.reserve(shape.size());
for (auto dim : shape) { for (auto dim : shape) {
...@@ -78,7 +78,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { ...@@ -78,7 +78,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")), static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.GetPlace()); ctx.GetPlace());
...@@ -112,17 +112,17 @@ uniform distribution. The random result is in set [min, max]. ...@@ -112,17 +112,17 @@ uniform distribution. The random result is in set [min, max].
class UniformRandomOpVarTypeInference : public framework::VarTypeInference { class UniformRandomOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc* block) const override { framework::BlockDesc *block) const override {
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = op_desc.Output("Out").front();
if (block->FindRecursiveOrCreateVar(out_var_name).GetType() == auto var_data_type = static_cast<framework::proto::VarType::Type>(
framework::proto::VarType::SELECTED_ROWS) { boost::get<int>(op_desc.GetAttr("dtype")));
block->FindRecursiveOrCreateVar(out_var_name)
.SetType(framework::proto::VarType::SELECTED_ROWS); auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
} else { if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) {
block->FindRecursiveOrCreateVar(out_var_name) out_var.SetType(framework::proto::VarType::LOD_TENSOR);
.SetType(framework::proto::VarType::LOD_TENSOR);
} }
out_var.SetDataType(var_data_type);
} }
}; };
......
...@@ -198,9 +198,9 @@ class CudnnHolder { ...@@ -198,9 +198,9 @@ class CudnnHolder {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: place_(place), cudnn_holder_(nullptr) { : place_(place), cudnn_holder_(nullptr) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
compute_capability = GetCUDAComputeCapability(place_.device); compute_capability_ = GetCUDAComputeCapability(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device); multi_process_ = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_)); PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place); eigen_stream_->Reinitialize(&stream_, place);
...@@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) ...@@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
cudnn_holder_.reset(new CudnnHolder(&stream_, place)); cudnn_holder_.reset(new CudnnHolder(&stream_, place));
} }
driver_version_ = GetCUDADriverVersion(place_.device);
runtime_version_ = GetCUDARuntimeVersion(place_.device);
LOG(INFO) << "device: " << place_.device
<< ", CUDA Capability: " << compute_capability_
<< ", Driver Version: " << driver_version_ / 1000 << "."
<< (driver_version_ % 100) / 10
<< ", Runtime Version: " << runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
callback_manager_.reset(new StreamCallbackManager(stream_)); callback_manager_.reset(new StreamCallbackManager(stream_));
} }
...@@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const { ...@@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const {
} }
int CUDADeviceContext::GetComputeCapability() const { int CUDADeviceContext::GetComputeCapability() const {
return compute_capability; return compute_capability_;
} }
int CUDADeviceContext::GetMaxPhysicalThreadCount() const { int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp; return multi_process_ * max_threads_per_mp_;
} }
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
......
...@@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext { ...@@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream_; cudaStream_t stream_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
int compute_capability; int compute_capability_;
int multi_process; int runtime_version_;
int max_threads_per_mp; int driver_version_;
int multi_process_;
int max_threads_per_mp_;
mutable std::mutex mtx_; mutable std::mutex mtx_;
......
...@@ -130,6 +130,13 @@ struct EOFException : public std::exception { ...@@ -130,6 +130,13 @@ struct EOFException : public std::exception {
#define UNLIKELY(condition) (condition == 0) #define UNLIKELY(condition) (condition == 0)
#endif #endif
// LIKELY(condition) evaluates to a truthy value exactly when `condition` is
// non-zero. On non-Windows compilers it also tells the optimizer that the
// condition is expected to be true.
#if !defined(_WIN32)
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else
// there is no equivalent intrinsics in msvc.
// The argument is parenthesized so that expressions built from operators
// that bind looser than `!=` (for example `a & b` or `a ? b : c`) are
// compared against zero as a whole.
#define LIKELY(condition) ((condition) != 0)
#endif
template <typename... Args> template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
bool stat, const Args&... args) { bool stat, const Args&... args) {
......
...@@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) { ...@@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) {
return device_prop.major * 10 + device_prop.minor; return device_prop.major * 10 + device_prop.minor;
} }
//! Returns the CUDA runtime version as reported by cudaRuntimeGetVersion().
//! `id` is only validated against the number of visible GPUs; the version
//! query itself takes no device argument.
int GetCUDARuntimeVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  int runtime_version = 0;
  // The message names this wrapper so a failure can be traced to it.
  PADDLE_ENFORCE(cudaRuntimeGetVersion(&runtime_version),
                 "cudaRuntimeGetVersion failed in "
                 "paddle::platform::GetCUDARuntimeVersion");
  return runtime_version;
}
//! Returns the CUDA driver version as reported by cudaDriverGetVersion().
//! `id` is checked against the number of visible GPUs before the query.
int GetCUDADriverVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");

  int version = 0;  // stays 0 unless the query fills it in
  PADDLE_ENFORCE(
      cudaDriverGetVersion(&version),
      "cudaDriverGetVersion failed in paddle::platform::GetCUDADriverVersion");
  return version;
}
int GetCUDAMultiProcessors(int id) { int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count; int count;
......
...@@ -29,6 +29,12 @@ int GetCUDADeviceCount(); ...@@ -29,6 +29,12 @@ int GetCUDADeviceCount();
//! Get the compute capability of the ith GPU (format: major * 10 + minor) //! Get the compute capability of the ith GPU (format: major * 10 + minor)
int GetCUDAComputeCapability(int i); int GetCUDAComputeCapability(int i);
//! Get the runtime version of the ith GPU
int GetCUDARuntimeVersion(int id);
//! Get the driver version of the ith GPU
int GetCUDADriverVersion(int id);
//! Get the MultiProcessors of the ith GPU. //! Get the MultiProcessors of the ith GPU.
int GetCUDAMultiProcessors(int i); int GetCUDAMultiProcessors(int i);
......
...@@ -276,7 +276,7 @@ struct EventItem { ...@@ -276,7 +276,7 @@ struct EventItem {
// Print results // Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const std::string& sorted_domain, const size_t name_width, const std::string& sorted_domain, const size_t name_width,
const size_t data_width, double total) { const size_t data_width, bool merge_thread) {
// Output header information // Output header information
std::cout << "\n------------------------->" std::cout << "\n------------------------->"
<< " Profiling Report " << " Profiling Report "
...@@ -292,6 +292,10 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -292,6 +292,10 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
PADDLE_THROW("Invalid profiler state", g_state); PADDLE_THROW("Invalid profiler state", g_state);
} }
if (merge_thread) {
std::cout << "Note! This Report merge all thread info into one."
<< std::endl;
}
std::cout << "Place: " << place << std::endl; std::cout << "Place: " << place << std::endl;
std::cout << "Time unit: ms" << std::endl; std::cout << "Time unit: ms" << std::endl;
std::cout << "Sorted by " << sorted_domain std::cout << "Sorted by " << sorted_domain
...@@ -312,8 +316,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -312,8 +316,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<< std::setw(data_width) << event_item.min_time << std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time << std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.total_time / total << std::setw(data_width) << event_item.ratio << std::endl;
<< std::endl;
} }
} }
std::cout << std::endl; std::cout << std::endl;
...@@ -321,8 +324,10 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -321,8 +324,10 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
// Parse the event list and output the profiling report // Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events, void ParseEvents(const std::vector<std::vector<Event>>& events,
bool merge_thread,
EventSortingKey sorted_by = EventSortingKey::kDefault) { EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
if (merge_thread && events.size() < 2) return;
std::string sorted_domain; std::string sorted_domain;
std::function<bool(const EventItem&, const EventItem&)> sorted_func; std::function<bool(const EventItem&, const EventItem&)> sorted_func;
...@@ -361,34 +366,55 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -361,34 +366,55 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
sorted_domain = "event first end time"; sorted_domain = "event first end time";
} }
const std::vector<std::vector<Event>>* analyze_events;
std::vector<std::vector<Event>> merged_events_list;
if (merge_thread) {
std::vector<Event> merged_events;
for (int i = 0; i < events.size(); ++i) {
for (int j = 0; j < events[i].size(); ++j) {
merged_events.push_back(events[i][j]);
}
}
merged_events_list.push_back(merged_events);
analyze_events = &merged_events_list;
} else {
analyze_events = &events;
}
std::vector<std::vector<EventItem>> events_table; std::vector<std::vector<EventItem>> events_table;
size_t max_name_width = 0; size_t max_name_width = 0;
double total = 0.; // the total time for (size_t i = 0; i < (*analyze_events).size(); i++) {
for (size_t i = 0; i < events.size(); i++) { double total = 0.; // the total time in one thread
std::list<Event> pushed_events; std::list<Event> pushed_events;
std::vector<EventItem> event_items; std::vector<EventItem> event_items;
std::unordered_map<std::string, int> event_idx; std::unordered_map<std::string, int> event_idx;
for (size_t j = 0; j < events[i].size(); j++) { for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
if (events[i][j].type() == EventType::kPushRange) { if ((*analyze_events)[i][j].type() == EventType::kPushRange) {
pushed_events.push_back(events[i][j]); pushed_events.push_back((*analyze_events)[i][j]);
} else if (events[i][j].type() == EventType::kPopRange) { } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) {
std::list<Event>::reverse_iterator rit = pushed_events.rbegin(); std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend() && while (rit != pushed_events.rend() &&
rit->name() != events[i][j].name()) { rit->name() != (*analyze_events)[i][j].name()) {
++rit; ++rit;
} }
if (rit != pushed_events.rend()) { if (rit != pushed_events.rend()) {
double event_time = (g_state == ProfilerState::kCUDA || double event_time = (g_state == ProfilerState::kCUDA ||
g_state == ProfilerState::kAll) g_state == ProfilerState::kAll)
? rit->CudaElapsedMs(events[i][j]) ? rit->CudaElapsedMs((*analyze_events)[i][j])
: rit->CpuElapsedMs(events[i][j]); : rit->CpuElapsedMs((*analyze_events)[i][j]);
total += event_time; total += event_time;
std::string event_name = std::string event_name;
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); if (merge_thread) {
max_name_width = std::max(max_name_width, event_name.size()); event_name = rit->name();
max_name_width = std::max(max_name_width, event_name.size());
} else {
event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
rit->name();
max_name_width = std::max(max_name_width, event_name.size());
}
if (event_idx.find(event_name) == event_idx.end()) { if (event_idx.find(event_name) == event_idx.end()) {
event_idx[event_name] = event_items.size(); event_idx[event_name] = event_items.size();
...@@ -413,7 +439,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -413,7 +439,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
pushed_events.erase((++rit).base()); pushed_events.erase((++rit).base());
} else { } else {
LOG(WARNING) << "Cannot find the push marker of event \'" LOG(WARNING) << "Cannot find the push marker of event \'"
<< events[i][j].name() << (*analyze_events)[i][j].name()
<< "\', which will be ignored in profiling report."; << "\', which will be ignored in profiling report.";
} }
} }
...@@ -421,6 +447,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -421,6 +447,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
// average time // average time
for (auto& item : event_items) { for (auto& item : event_items) {
item.ave_time = item.total_time / item.calls; item.ave_time = item.total_time / item.calls;
item.ratio = item.total_time / total;
} }
// sort // sort
if (sorted_by != EventSortingKey::kDefault) { if (sorted_by != EventSortingKey::kDefault) {
...@@ -438,7 +465,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -438,7 +465,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
} }
// Print report // Print report
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, total); PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12,
merge_thread);
} }
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
...@@ -449,7 +477,8 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -449,7 +477,8 @@ void DisableProfiler(EventSortingKey sorted_key,
Mark("_stop_profiler_", nullptr); Mark("_stop_profiler_", nullptr);
std::vector<std::vector<Event>> all_events = GetAllEvents(); std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key); ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key);
ResetProfiler(); ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer->IsEnabled()) { if (tracer->IsEnabled()) {
......
...@@ -57,6 +57,10 @@ limitations under the License. */ ...@@ -57,6 +57,10 @@ limitations under the License. */
#include "pybind11/stl.h" #include "pybind11/stl.h"
DEFINE_bool(reader_queue_speed_test_mode, false,
"If set true, the queue.pop will only get data from queue but not "
"remove the data from queue for speed testing");
// disable auto conversion to list in Python // disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
...@@ -157,7 +161,50 @@ PYBIND11_PLUGIN(core) { ...@@ -157,7 +161,50 @@ PYBIND11_PLUGIN(core) {
.def("_get_double_element", TensorGetElement<double>) .def("_get_double_element", TensorGetElement<double>)
.def("_dtype", [](Tensor &self) { return ToDataType(self.type()); }); .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); });
py::class_<LoDTensor, Tensor>(m, "LoDTensor") py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
LoDTensor is a Tensor with optional LoD information.
np.array(lod_tensor) can convert LoDTensor to numpy array.
lod_tensor.lod() can retrieve the LoD information.
LoD is short for Level of Details and is usually used for varied sequence
length. You can skip the following comment if you don't need optional LoD.
For example:
A LoDTensor X can look like the example below. It contains 2 sequences.
The first has length 2 and the second has length 3, as described by x.lod.
The first tensor dimension 5=2+3 is calculated from LoD if it's available.
It means the total number of sequence element. In X, each element has 2
columns, hence [5, 2].
x.lod = [[2, 3]]
x.data = [[1, 2], [3, 4],
[5, 6], [7, 8], [9, 10]]
x.shape = [5, 2]
LoD can have multiple levels (for example, a paragraph can have multiple
sentences and a sentence can have multiple words). In the following
LoDTensor Y, the lod_level is 2. It means there are 2 sequences, the
first sequence length is 2 (has 2 sub-sequences), the second one's
length is 1. The first sequence's 2 sub-sequences have length 2 and 2,
respectively. And the second sequence's 1 sub-sequence has length 3.
y.lod = [[2 1], [2 2 3]]
y.shape = [2+2+3, ...]
Note:
In above description, LoD is length-based. In Paddle internal
implementation, lod is offset-based. Hence, internally,
y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (length-based
equivalent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]).
Sometimes LoD is called recursive_sequence_length to be more
self-explanatory. In this case, it must be length-based. For historical
reasons, when LoD is called lod in the public API, it might be offset-based.
Users should be careful about it.
)DOC")
.def_buffer( .def_buffer(
[](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
.def("__init__", .def("__init__",
...@@ -337,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -337,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle.
return make_ddim(shape); return make_ddim(shape);
}); });
auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>(); auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
holder->InitOnce(capacity, dims); holder->InitOnce(capacity, dims,
FLAGS_reader_queue_speed_test_mode);
return holder->GetQueue(); return holder->GetQueue();
}, },
py::return_value_policy::copy); py::return_value_policy::copy);
...@@ -624,16 +672,17 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -624,16 +672,17 @@ All parameter, weight, gradient are variables in Paddle.
ExecutionStrategy allows the user to more preciously control how to run ExecutionStrategy allows the user to more preciously control how to run
the program in ParallelExecutor by setting the property. the program in ParallelExecutor by setting the property.
The available properties include: Examples:
use_cuda (bool): Whether to use CUDA or not. Default True. .. code-block:: python
num_threads (int): The number of threads that used to run the
operators in ParallelExecutor. If it is not set, it will be exec_strategy = fluid.ExecutionStrategy()
set in ParallelExecutor according to the device count. exec_strategy.num_threads = 4
Default 0.
allow_op_delay (bool): Whether to delay the communication operators train_exe = fluid.ParallelExecutor(use_cuda=True,
to run. Default False. loss_name=loss.name,
num_iteration_per_drop_scope (int): how many iterations between exec_strategy=exec_strategy)
the two dropping local scopes. Default 100.
train_loss, = train_exe.run([loss.name], feed=feed_dict)
)DOC"); )DOC");
...@@ -643,19 +692,34 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -643,19 +692,34 @@ All parameter, weight, gradient are variables in Paddle.
[](const ExecutionStrategy &self) { return self.num_threads_; }, [](const ExecutionStrategy &self) { return self.num_threads_; },
[](ExecutionStrategy &self, size_t num_threads) { [](ExecutionStrategy &self, size_t num_threads) {
self.num_threads_ = num_threads; self.num_threads_ = num_threads;
}) },
R"DOC(The type is INT, num_threads represents the size of thread pool that
used to run the operators of the current program in ParallelExecutor.
If :math:`num\_threads=1`, all the operators will execute one by one,
but the order may differ between iterations.
If it is not set, it will be set in ParallelExecutor according to the
device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
:math:`num\_threads=CPU\_NUM*4`, the explanation of :math:`CPU\_NUM` is in ParallelExecutor.
if it is not set, ParallelExecutor will get the cpu count by calling
`multiprocessing.cpu_count()`. Default 0.)DOC")
.def_property( .def_property(
"use_cuda", "use_cuda",
[](const ExecutionStrategy &self) { return self.use_cuda_; }, [](const ExecutionStrategy &self) { return self.use_cuda_; },
[](ExecutionStrategy &self, bool use_cuda) { [](ExecutionStrategy &self, bool use_cuda) {
self.use_cuda_ = use_cuda; self.use_cuda_ = use_cuda;
}) }) // FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may
// make user confuse, because ParallelExecutor has a parameter named
// 'use_cuda' too, in current implementation, ParallelExecutor's
// 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'.
.def_property( .def_property(
"allow_op_delay", "allow_op_delay",
[](const ExecutionStrategy &self) { return self.allow_op_delay_; }, [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
[](ExecutionStrategy &self, bool allow_op_delay) { [](ExecutionStrategy &self, bool allow_op_delay) {
self.allow_op_delay_ = allow_op_delay; self.allow_op_delay_ = allow_op_delay;
}) },
R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
communication operators to run, it may make the execution faster.
Note that in some models, allow_op_delay may cause program hang. Default False.)DOC")
.def_property( .def_property(
"num_iteration_per_drop_scope", "num_iteration_per_drop_scope",
[](const ExecutionStrategy &self) { [](const ExecutionStrategy &self) {
...@@ -663,7 +727,19 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -663,7 +727,19 @@ All parameter, weight, gradient are variables in Paddle.
}, },
[](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
}); },
R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
many iterations to clean up the temp variables which
is generated during execution. It may make the execution faster,
because the temp variable's shape maybe the same between two iterations. Default 100.
NOTES:
1. If you fetch data when calling the 'run', the ParallelExecutor
will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`.
)DOC");
exec_strategy.def_property( exec_strategy.def_property(
"use_experimental_executor", "use_experimental_executor",
[](const ExecutionStrategy &self) { [](const ExecutionStrategy &self) {
...@@ -678,20 +754,17 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -678,20 +754,17 @@ All parameter, weight, gradient are variables in Paddle.
BuildStrategy allows the user to more preciously control how to BuildStrategy allows the user to more preciously control how to
build the SSA Graph in ParallelExecutor by setting the property. build the SSA Graph in ParallelExecutor by setting the property.
The available properties include: Examples:
reduce_strategy (str): There are two reduce strategies, 'AllReduce' .. code-block:: python
and 'Reduce'. If you want that all parameters will be optimized
on all devices, you can choose 'AllReduce'; if you choose build_strategy = fluid.BuildStrategy()
'Reduce', all parameters will be evenly allocated to different build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
devices for optimization, and then broadcast the optimized
parameter to other devices. Default 'AllReduce'. train_exe = fluid.ParallelExecutor(use_cuda=True,
gradient_scale_strategy (str): There are two ways of defining loss@grad, loss_name=loss.name,
'CoeffNumDevice' and 'Customized'. By default, ParallelExecutor build_strategy=build_strategy)
sets the loss@grad according to the number of devices. If you want
to customize loss@grad, you can choose 'Customized'. train_loss, = train_exe.run([loss.name], feed=feed_dict)
Default 'CoeffNumDevice'.
debug_graphviz_path (str): Whether to write the SSA Graph to file in the
form of graphviz. It is useful for debugging. Default "".
)DOC"); )DOC");
py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy") py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
...@@ -710,31 +783,51 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -710,31 +783,51 @@ All parameter, weight, gradient are variables in Paddle.
[](const BuildStrategy &self) { return self.reduce_; }, [](const BuildStrategy &self) { return self.reduce_; },
[](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
self.reduce_ = strategy; self.reduce_ = strategy;
}) },
R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
'AllReduce' and 'Reduce'. If you want that all the parameters'
optimization are done on all devices independently, you should choose 'AllReduce';
if you choose 'Reduce', all the parameters' optimization will be evenly distributed
to different devices, and then broadcast the optimized parameter to other devices.
In some models, `Reduce` is faster. Default 'AllReduce'. )DOC")
.def_property( .def_property(
"gradient_scale_strategy", "gradient_scale_strategy",
[](const BuildStrategy &self) { return self.gradient_scale_; }, [](const BuildStrategy &self) { return self.gradient_scale_; },
[](BuildStrategy &self, [](BuildStrategy &self,
BuildStrategy::GradientScaleStrategy strategy) { BuildStrategy::GradientScaleStrategy strategy) {
self.gradient_scale_ = strategy; self.gradient_scale_ = strategy;
}) },
R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default,
ParallelExecutor sets the :math:`loss@grad` according to the number of devices.
If you want to customize :math:`loss@grad`, you can choose 'Customized'.
Default 'CoeffNumDevice'.)DOC")
.def_property( .def_property(
"debug_graphviz_path", "debug_graphviz_path",
[](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
[](BuildStrategy &self, const std::string &path) { [](BuildStrategy &self, const std::string &path) {
self.debug_graphviz_path_ = path; self.debug_graphviz_path_ = path;
}) },
R"DOC(The type is STR, debug_graphviz_path indicates the path to which
the SSA Graph is written in the form of graphviz.
It is useful for debugging. Default "")DOC")
.def_property( .def_property(
"enable_data_balance", "enable_data_balance",
[](const BuildStrategy &self) { return self.enable_data_balance_; }, [](const BuildStrategy &self) { return self.enable_data_balance_; },
[](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; }) [](BuildStrategy &self, bool b) {
.def_property("fuse_elewise_add_act_ops", self.enable_data_balance_ = b;
[](const BuildStrategy &self) { }) // FIXME(chengudo): enable_data_balance seems not important
return self.fuse_elewise_add_act_ops_; .def_property(
}, "fuse_elewise_add_act_ops",
[](BuildStrategy &self, bool b) { [](const BuildStrategy &self) {
self.fuse_elewise_add_act_ops_ = b; return self.fuse_elewise_add_act_ops_;
}) },
[](BuildStrategy &self, bool b) {
self.fuse_elewise_add_act_ops_ = b;
},
R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default False)DOC")
.def("_create_passes_from_strategy", .def("_create_passes_from_strategy",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> { [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy(); return self.CreatePassesFromStrategy();
......
...@@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \ ...@@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
-DWITH_MKL=OFF \ -DWITH_MKL=OFF \
-DWITH_MKLDNN=OFF -DWITH_MKLDNN=OFF
make -j8 make -j8
make -j8 inference_lib_dist make -j8 fluid_lib_dist
``` ```
### step 2. generate program desc ### step 2. generate program desc
......
...@@ -648,25 +648,25 @@ function gen_capi_package() { ...@@ -648,25 +648,25 @@ function gen_capi_package() {
fi fi
} }
function gen_fluid_inference_lib() { function gen_fluid_lib() {
mkdir -p ${PADDLE_ROOT}/build mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Generating fluid inference library ... Generating fluid library for train and inference ...
======================================== ========================================
EOF EOF
cmake .. -DWITH_DISTRIBUTE=OFF cmake .. -DWITH_DISTRIBUTE=OFF
make -j `nproc` inference_lib_dist make -j `nproc` fluid_lib_dist
fi fi
} }
function tar_fluid_inference_lib() { function tar_fluid_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Taring fluid inference library ... Taring fluid library for train and inference ...
======================================== ========================================
EOF EOF
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
...@@ -675,11 +675,11 @@ EOF ...@@ -675,11 +675,11 @@ EOF
fi fi
} }
function test_fluid_inference_lib() { function test_fluid_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Testing fluid inference library ... Testing fluid library for inference ...
======================================== ========================================
EOF EOF
cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
...@@ -731,9 +731,9 @@ function main() { ...@@ -731,9 +731,9 @@ function main() {
;; ;;
fluid_inference_lib) fluid_inference_lib)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
gen_fluid_inference_lib gen_fluid_lib
tar_fluid_inference_lib tar_fluid_lib
test_fluid_inference_lib test_fluid_lib
;; ;;
check_style) check_style)
check_style check_style
...@@ -744,8 +744,8 @@ function main() { ...@@ -744,8 +744,8 @@ function main() {
assert_api_not_changed ${PYTHON_ABI:-""} assert_api_not_changed ${PYTHON_ABI:-""}
run_test run_test
gen_capi_package gen_capi_package
gen_fluid_inference_lib gen_fluid_lib
test_fluid_inference_lib test_fluid_lib
assert_api_spec_approvals assert_api_spec_approvals
;; ;;
maccheck) maccheck)
......
...@@ -113,7 +113,8 @@ def __bootstrap__(): ...@@ -113,7 +113,8 @@ def __bootstrap__():
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
"dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb' 'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
'reader_queue_speed_test_mode'
] ]
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
......
...@@ -55,7 +55,11 @@ def data(name, ...@@ -55,7 +55,11 @@ def data(name,
Args: Args:
name(str): The name/alias of the function name(str): The name/alias of the function
shape(list): Tuple declaring the shape. shape(list): Tuple declaring the shape.
append_batch_size(bool): Whether or not to append the data as a batch. append_batch_size(bool):
1. If true, it prepends -1 to the shape.
For example if shape=[1], the resulting shape is [-1, 1].
2. If shape contains -1, such as shape=[1, -1],
append_batch_size will be enforced to be False (ineffective).
dtype(int|float): The type of data : float32, float_16, int etc dtype(int|float): The type of data : float32, float_16, int etc
type(VarType): The output type. By default it is LOD_TENSOR. type(VarType): The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence. lod_level(int): The LoD Level. 0 means the input data is not a sequence.
......
...@@ -56,6 +56,7 @@ __all__ = [ ...@@ -56,6 +56,7 @@ __all__ = [
'sequence_expand', 'sequence_expand',
'sequence_expand_as', 'sequence_expand_as',
'sequence_pad', 'sequence_pad',
'sequence_unpad',
'lstm_unit', 'lstm_unit',
'reduce_sum', 'reduce_sum',
'reduce_mean', 'reduce_mean',
...@@ -107,6 +108,7 @@ __all__ = [ ...@@ -107,6 +108,7 @@ __all__ = [
'log', 'log',
'crop', 'crop',
'rank_loss', 'rank_loss',
'margin_rank_loss',
'elu', 'elu',
'relu6', 'relu6',
'pow', 'pow',
...@@ -2792,7 +2794,7 @@ def sequence_expand_as(x, y, name=None): ...@@ -2792,7 +2794,7 @@ def sequence_expand_as(x, y, name=None):
@templatedoc() @templatedoc()
def sequence_pad(x, pad_value, maxlen=None): def sequence_pad(x, pad_value, maxlen=None, name=None):
""" """
${comment} ${comment}
...@@ -2806,7 +2808,9 @@ def sequence_pad(x, pad_value, maxlen=None): ...@@ -2806,7 +2808,9 @@ def sequence_pad(x, pad_value, maxlen=None):
None or any positive int. When it is None, all sequences will be None or any positive int. When it is None, all sequences will be
padded up to the length of the longest one among them; when it a padded up to the length of the longest one among them; when it a
certain positive value, it must be greater than the length of the certain positive value, it must be greater than the length of the
longest original sequence." longest original sequence.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns: Returns:
Variable: The padded sequence batch and the original lengths before Variable: The padded sequence batch and the original lengths before
...@@ -2843,6 +2847,66 @@ def sequence_pad(x, pad_value, maxlen=None): ...@@ -2843,6 +2847,66 @@ def sequence_pad(x, pad_value, maxlen=None):
return out, length return out, length
def sequence_unpad(x, length, name=None):
    """
    **Sequence Unpad Layer**

    Strips the padding from a batch of equally padded sequences and emits
    the sequences at their actual lengths, identified by LoD information.

    .. code-block:: text

        Example:

        Given input Variable **x**:
            x.data = [[ 1.0,  2.0,  3.0,  4.0,  5.0],
                      [ 6.0,  7.0,  8.0,  9.0, 10.0],
                      [11.0, 12.0, 13.0, 14.0, 15.0]],

        in which there are 3 sequences padded to length 5, and the actual
        length specified by input Variable **length**:

            length.data = [[2], [3], [4]],

        after unpadding, the output Variable will be:

            out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
            out.lod = [[2, 3, 4]]

    Args:
        x(Variable): Input Variable which contains the padded sequences with
            equal length.
        length(Variable): The Variable that specifies the actual length of
            sequences after unpadding.
        name(str|None): A name for this layer(optional). If set None, the layer
            will be named automatically.

    Returns:
        Variable: The Variable contains the unpadded sequences.

    Examples:
        .. code-block:: python

            x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32')
            seq_len = fluid.layers.data(name='length', shape=[1], dtype='int64')
            out = fluid.layers.sequence_unpad(x=x, length=seq_len)
    """
    # Keep this as the first statement: locals() is forwarded to the helper
    # and must contain only the call arguments at this point.
    helper = LayerHelper('sequence_unpad', input=x, **locals())

    # Flag the length input so that no gradient is requested for it.
    length.stop_gradient = True

    unpadded = helper.create_tmp_variable(helper.input_dtype())
    op_inputs = {'X': x, 'Length': length}
    op_outputs = {'Out': unpadded}
    helper.append_op(
        type='sequence_unpad', inputs=op_inputs, outputs=op_outputs)
    return unpadded
def beam_search(pre_ids, def beam_search(pre_ids,
pre_scores, pre_scores,
ids, ids,
...@@ -5827,6 +5891,54 @@ def rank_loss(label, left, right, name=None): ...@@ -5827,6 +5891,54 @@ def rank_loss(label, left, right, name=None):
return out return out
def margin_rank_loss(label, left, right, margin=0.1, name=None):
    r"""
    Margin Ranking Loss Layer for ranking problem,
    which compares left score and right score passed in.
    The ranking loss can be defined as following equation:

    .. math::

        rank\_loss = max(0, -label * (left - right) + margin)

    Args:
       label (Variable): Indicates whether the left is ranked higher than the right or not.
       left (Variable): Ranking score for left.
       right (Variable): Ranking score for right.
       margin (float): Indicates the given margin.
       name (str|None): A name for this layer (optional). If set None, the layer
                       will be named automatically.

    Returns:
       Variable: The ranking loss.

    Raises:
       ValueError: Any of label, left, and right is not a Variable.

    Examples:
        .. code-block:: python

           label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
           left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
           right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
           out = fluid.layers.margin_rank_loss(label, left, right)
    """
    # The docstring is a raw string because the math block contains `\_`,
    # which is an invalid escape sequence in a regular string literal.

    # Keep this as the first statement: locals() is forwarded to the helper
    # and must contain only the call arguments at this point.
    helper = LayerHelper('margin_rank_loss', **locals())

    # All three inputs must already be graph Variables.
    if not isinstance(label, Variable):
        raise ValueError("The label should be a Variable.")
    if not isinstance(left, Variable):
        raise ValueError("The left should be a Variable.")
    if not isinstance(right, Variable):
        raise ValueError("The right should be a Variable.")

    out = helper.create_tmp_variable(left.dtype)
    # Second output slot of the op; it is created here but not returned.
    act = helper.create_tmp_variable(left.dtype)
    helper.append_op(
        type='margin_rank_loss',
        inputs={"Label": label,
                "X1": left,
                "X2": right},
        outputs={'Out': out,
                 'Activated': act},
        attrs={'margin': margin})
    return out
def pad2d(input, def pad2d(input,
paddings=[0, 0, 0, 0], paddings=[0, 0, 0, 0],
mode='constant', mode='constant',
...@@ -6290,6 +6402,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): ...@@ -6290,6 +6402,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
outputs={'Out': out}, outputs={'Out': out},
attrs={'win_size': win_size, attrs={'win_size': win_size,
'pad_value': pad_value}) 'pad_value': pad_value})
return out
def sequence_mask(x, maxlen=None, dtype='int64', name=None): def sequence_mask(x, maxlen=None, dtype='int64', name=None):
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
from __future__ import print_function from __future__ import print_function
from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
from .. import core
from ..framework import convert_np_dtype_to_dtype_
__activations_noattr__ = [ __activations_noattr__ = [
'sigmoid', 'sigmoid',
...@@ -58,8 +60,11 @@ _uniform_random_ = generate_layer_fn('uniform_random') ...@@ -58,8 +60,11 @@ _uniform_random_ = generate_layer_fn('uniform_random')
def uniform_random(shape, dtype=None, min=None, max=None, seed=None): def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
locals_var = locals().keys()
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
kwargs = dict() kwargs = dict()
for name in locals(): for name in locals_var:
val = locals()[name] val = locals()[name]
if val is not None: if val is not None:
kwargs[name] = val kwargs[name] = val
...@@ -78,8 +83,9 @@ _hard_shrink_ = generate_layer_fn('hard_shrink') ...@@ -78,8 +83,9 @@ _hard_shrink_ = generate_layer_fn('hard_shrink')
def hard_shrink(x, threshold=None): def hard_shrink(x, threshold=None):
locals_var = locals().keys()
kwargs = dict() kwargs = dict()
for name in locals(): for name in locals_var:
val = locals()[name] val = locals()[name]
if val is not None: if val is not None:
kwargs[name] = val kwargs[name] = val
...@@ -99,12 +105,12 @@ _cum_sum_ = generate_layer_fn('cumsum') ...@@ -99,12 +105,12 @@ _cum_sum_ = generate_layer_fn('cumsum')
def cumsum(x, axis=None, exclusive=None, reverse=None):
    # Forward every argument that was explicitly supplied (i.e. is not None)
    # to the generated `cumsum` layer function as keyword arguments.
    #
    # Take a *copy* of the argument mapping before any other local exists.
    # Iterating over `locals().keys()` while calling `locals()` again inside
    # the loop mutates the dict being iterated on Python 3 and raises
    # "RuntimeError: dictionary changed size during iteration".
    call_args = locals().copy()
    kwargs = {
        arg_name: arg_val
        for arg_name, arg_val in call_args.items() if arg_val is not None
    }
    return _cum_sum_(**kwargs)
...@@ -121,8 +127,9 @@ _thresholded_relu_ = generate_layer_fn('thresholded_relu') ...@@ -121,8 +127,9 @@ _thresholded_relu_ = generate_layer_fn('thresholded_relu')
def thresholded_relu(x, threshold=None): def thresholded_relu(x, threshold=None):
locals_var = locals().keys()
kwargs = dict() kwargs = dict()
for name in locals(): for name in locals_var:
val = locals()[name] val = locals()[name]
if val is not None: if val is not None:
kwargs[name] = val kwargs[name] = val
......
...@@ -100,7 +100,7 @@ def create_global_var(shape, ...@@ -100,7 +100,7 @@ def create_global_var(shape,
force_cpu=False, force_cpu=False,
name=None): name=None):
""" """
Create a new variable in the global block(block 0). Create a new tensor variable with value in the global block(block 0).
Args: Args:
shape(list[int]): shape of the variable shape(list[int]): shape of the variable
......
...@@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer): ...@@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer):
optimizer = fluid.optimizer.Adamax(learning_rate=0.2) optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
optimizer.minimize(cost) optimizer.minimize(cost)
Notes:
Currently, AdamaxOptimizer doesn't support sparse parameter optimization.
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
_inf_norm_acc_str = "inf_norm" _inf_norm_acc_str = "inf_norm"
...@@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer): ...@@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer):
optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2) optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
optimizer.minimize(cost) optimizer.minimize(cost)
Notes:
Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
...@@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer): ...@@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer):
optimizer = fluid.optimizer.Adadelta( optimizer = fluid.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
_, params_grads = optimizer.minimize(cost) _, params_grads = optimizer.minimize(cost)
Notes:
Currently, AdadeltaOptimizer doesn't support sparse parameter optimization.
""" """
_avg_squared_grad_acc_str = "_avg_squared_grad" _avg_squared_grad_acc_str = "_avg_squared_grad"
...@@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer): ...@@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer):
optimizer = fluid.optimizer.Ftrl(0.0001) optimizer = fluid.optimizer.Ftrl(0.0001)
_, params_grads = optimizer.minimize(cost) _, params_grads = optimizer.minimize(cost)
Notes:
Currently, FtrlOptimizer doesn't support sparse parameter optimization.
""" """
_squared_acc_str = "squared" _squared_acc_str = "squared"
......
...@@ -31,15 +31,32 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy ...@@ -31,15 +31,32 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy
class ParallelExecutor(object): class ParallelExecutor(object):
""" """
ParallelExecutor can run program in parallel. ParallelExecutor is designed for data parallelism, which focuses on distributing
the data across different nodes and every node operates on the data in parallel.
If you use ParallelExecutor to run the current program on GPU, the node means GPU
device, and ParallelExecutor will get the available GPU device automatically on
the current machine. If you use ParallelExecutor to run the current program on CPU,
the node means the CPU device, and you can specify the CPU device number by adding
'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
of CPUs in the system.
Args: Args:
use_cuda (bool): Whether to use CUDA or not. use_cuda (bool): Whether to use CUDA or not.
loss_name (str): The loss name must set in training. Default None. loss_name (str): The loss name must set in training. Default None.
main_program (Program): The program that need to run, if not provided, main_program (Program): The program that need to run, if not provided,
then default_main_program will be used. Default None. then default_main_program will be used. Default None.
share_vars_from(ParallelExecutor): If provied, it will share variables share_vars_from(ParallelExecutor): If provide, it will share variables
from the specified ParallelExecutor. Default None. from the specified ParallelExecutor. Default None.
exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
the program in ParallelExecutor, for example how many threads are used to
execute the program, and after how many iterations to clean up the temp variables
that are generated during execution. For more information, please refer
to fluid.ExecutionStrategy. Default None.
build_strategy(BuildStrategy): build_strategy is used to control how to
build the SSA Graph in ParallelExecutor by setting the property,
for example reduce_strategy, gradient_scale_strategy. For more information,
please refer to fluid.BuildStrategy. Default None.
num_trainers(int): If greater than 1, NCCL will be initialized with num_trainers(int): If greater than 1, NCCL will be initialized with
multiple rank of nodes, each node should have same number of GPUs. multiple rank of nodes, each node should have same number of GPUs.
Distributed training will be enabled then. Default 1. Distributed training will be enabled then. Default 1.
......
...@@ -81,7 +81,10 @@ def get_optimizer(): ...@@ -81,7 +81,10 @@ def get_optimizer():
return optimizer return optimizer
def train_network(batch_size, is_distributed=False, is_sparse=False): def train_network(batch_size,
is_distributed=False,
is_sparse=False,
is_self_contained_lr=False):
# query # query
q = fluid.layers.data( q = fluid.layers.data(
name="query_ids", shape=[1], dtype="int64", lod_level=1) name="query_ids", shape=[1], dtype="int64", lod_level=1)
...@@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): ...@@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01), initializer=fluid.initializer.Constant(value=0.01),
name="__emb__", name="__emb__",
learning_rate=emb_lr), learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__"),
is_sparse=is_sparse) is_sparse=is_sparse)
## vsum ## vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
...@@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): ...@@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01), initializer=fluid.initializer.Constant(value=0.01),
name="__emb__", name="__emb__",
learning_rate=emb_lr), learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__"),
is_sparse=is_sparse) is_sparse=is_sparse)
## vsum ## vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
...@@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): ...@@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01), initializer=fluid.initializer.Constant(value=0.01),
name="__emb__", name="__emb__",
learning_rate=emb_lr), learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__"),
is_sparse=is_sparse) is_sparse=is_sparse)
## vsum ## vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
...@@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): ...@@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase):
def get_model(self, batch_size=2): def get_model(self, batch_size=2):
# Train program # Train program
avg_cost, acc, predict = \ avg_cost, acc, predict = \
train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"]))) train_network(batch_size,
bool(int(os.environ["IS_DISTRIBUTED"])),
bool(int(os.environ["IS_SPARSE"])),
bool(int(os.environ["IS_SELF_CONTAINED_LR"])))
inference_program = fluid.default_main_program().clone() inference_program = fluid.default_main_program().clone()
......
...@@ -18,6 +18,9 @@ import unittest ...@@ -18,6 +18,9 @@ import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.fluid as fluid
import paddle.fluid.core as core
class TestClipByNormOp(OpTest): class TestClipByNormOp(OpTest):
def setUp(self): def setUp(self):
...@@ -62,5 +65,59 @@ class TestCase3(TestClipByNormOp): ...@@ -62,5 +65,59 @@ class TestCase3(TestClipByNormOp):
self.max_norm = 1.0 self.max_norm = 1.0
class TestClipByNormOpWithSelectedRows(OpTest):
    """Runs the `clip_by_norm` operator on a SelectedRows input.

    The input uses row ids [0, 0, 1, 2]; the expected output is built on the
    assumption that the two entries for row 0 are summed into one row before
    the norm is clipped.
    """

    def config_test_case(self):
        # Fixed configuration shared by every place under test.
        self.max_norm = 1.0
        self.max_relative_error = 0.006
        self.grad_shape = (4, 1)
        self.grad_clipped_shape = (3, 1)
        self.grad_rows = [0, 0, 1, 2]
        self.grad_clipped_rows = [0, 1, 2]

    def check_with_place(self, place):
        self.config_test_case()
        scope = core.Scope()

        # Input: SelectedRows variable 'X' filled with random values.
        rows_in = scope.var('X').get_selected_rows()
        rows_in.set_rows(self.grad_rows)
        x_np = np.random.random(self.grad_shape).astype("float32")
        # Replace values below the error threshold with 0.5.
        x_np[np.abs(x_np) < self.max_relative_error] = 0.5
        rows_in.get_tensor().set(x_np, place)

        # Output: SelectedRows variable 'Out', filled by the operator.
        rows_out = scope.var('Out').get_selected_rows()

        op = fluid.op.Operator(
            "clip_by_norm", max_norm=self.max_norm, X='X', Out='Out')
        op.run(scope, place)

        self.assertEqual(rows_out.rows(), self.grad_clipped_rows)

        # Reference result: merge the first two input rows, then rescale
        # when the norm exceeds max_norm.
        expected = np.zeros(self.grad_clipped_shape)
        expected[0] = np.sum(x_np[0:2])
        expected[1] = x_np[2]
        expected[2] = x_np[3]
        norm = np.sqrt(np.sum(np.square(expected)))
        if norm > self.max_norm:
            expected = self.max_norm * expected / norm

        actual = np.array(rows_out.get_tensor())
        self.assertTrue(
            np.allclose(
                actual, expected, atol=1e-5, equal_nan=False))

    def test_clip_by_norm_with_selected_ros(self):
        # Always test on CPU; add the first GPU when CUDA is compiled in.
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for target_place in places:
            self.check_with_place(target_place)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase): ...@@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase):
self._enforce_place = "CPU" self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '0',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place( self.check_with_place(
"dist_simnet_bow.py", "dist_simnet_bow.py",
delta=1e-5, delta=1e-5,
...@@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): ...@@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
self._enforce_place = "CPU" self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '0',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place( self.check_with_place(
"dist_simnet_bow.py", "dist_simnet_bow.py",
delta=100, delta=100,
...@@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase): ...@@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase):
self._enforce_place = "CPU" self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '1',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place( self.check_with_place(
"dist_simnet_bow.py", "dist_simnet_bow.py",
delta=1e-5, delta=1e-5,
...@@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): ...@@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
self._enforce_place = "CPU" self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '1',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place( self.check_with_place(
"dist_simnet_bow.py", "dist_simnet_bow.py",
delta=100, delta=100,
...@@ -75,5 +91,59 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): ...@@ -75,5 +91,59 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
need_envs=need_envs) need_envs=need_envs)
class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
    """Runs dist_simnet_bow.py in sync mode on CPU with IS_DISTRIBUTED=1,
    IS_SPARSE=1 and IS_SELF_CONTAINED_LR=1."""

    def _setup_config(self):
        self._enforce_place = "CPU"
        self._sync_mode = True

    def test_simnet_bow(self):
        # Environment flags handed to the model script.
        self.check_with_place(
            "dist_simnet_bow.py",
            delta=1e-5,
            check_error_log=False,
            need_envs={
                "IS_DISTRIBUTED": '1',
                "IS_SPARSE": '1',
                'IS_SELF_CONTAINED_LR': '1',
            })
class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
    """Runs dist_simnet_bow.py in async mode on CPU with IS_DISTRIBUTED=1,
    IS_SPARSE=1 and IS_SELF_CONTAINED_LR=1, using a loose delta of 100."""

    def _setup_config(self):
        self._enforce_place = "CPU"
        self._sync_mode = False

    def test_simnet_bow(self):
        # Environment flags handed to the model script.
        self.check_with_place(
            "dist_simnet_bow.py",
            delta=100,
            check_error_log=False,
            need_envs={
                "IS_DISTRIBUTED": '1',
                "IS_SPARSE": '1',
                'IS_SELF_CONTAINED_LR': '1',
            })
class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
    """Runs dist_simnet_bow.py in sync mode on CPU with IS_DISTRIBUTED=1,
    IS_SPARSE=1 and IS_SELF_CONTAINED_LR=0."""

    def _setup_config(self):
        self._enforce_place = "CPU"
        self._sync_mode = True

    def test_simnet_bow(self):
        # Environment flags handed to the model script.
        self.check_with_place(
            "dist_simnet_bow.py",
            delta=1e-5,
            check_error_log=False,
            need_envs={
                "IS_DISTRIBUTED": '1',
                "IS_SPARSE": '1',
                'IS_SELF_CONTAINED_LR': '0',
            })
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -194,6 +194,14 @@ class TestBook(unittest.TestCase): ...@@ -194,6 +194,14 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1)) self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1))
print(str(program)) print(str(program))
def test_sequence_unpad(self):
    # Build a sequence_unpad layer in a fresh program, check that it
    # returns something, then print the resulting program.
    program = Program()
    with program_guard(program):
        padded = layers.data(name='x', shape=[10, 5], dtype='float32')
        seq_len = layers.data(name='length', shape=[1], dtype='int64')
        unpadded = layers.sequence_unpad(x=padded, length=seq_len)
        self.assertIsNotNone(unpadded)
    print(str(program))
def test_lstm_unit(self): def test_lstm_unit(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -19,33 +19,76 @@ import unittest ...@@ -19,33 +19,76 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.op import Operator from paddle.fluid.op import Operator
import paddle.fluid as fluid
def create_selected_rows_and_tensor(scope, place, height, row_num,
                                    embedding_size):
    """Fill `scope` with a random sparse gradient and its dense equivalent.

    The scope variable "@selected_rows@" receives `row_num` random rows
    (row ids drawn from [0, height - 1], duplicates allowed) and the scope
    variable "grad" receives a dense [height, embedding_size] array in which
    rows that share an id are summed.

    Args:
        scope: object providing `var(name)`; the returned variables must
            offer `get_selected_rows()` / `get_tensor()`.
        place: passed through unchanged to the tensors' `set` calls.
        height (int): number of rows of the dense tensor.
        row_num (int): number of sparse rows to generate.
        embedding_size (int): width of every row.

    Returns:
        tuple: (dense float32 array of shape [height, embedding_size],
                sparse float32 values of shape [row_num, embedding_size]).
    """
    sr = scope.var("@selected_rows@").get_selected_rows()
    tensor = scope.var("grad").get_tensor()

    # randint excludes the upper bound, so `high=height` yields ids in
    # [0, height - 1] -- the range the deprecated random_integers call with
    # high=height - 1 used to produce.
    rows = np.random.randint(
        low=0, high=height, size=[row_num]).astype('int64')
    sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32')

    sr.set_height(height)
    sr.set_rows(rows)
    sr.get_tensor().set(sr_val, place)

    tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32')
    # Unbuffered scatter-add: rows whose id occurs several times are
    # accumulated instead of overwritten.
    np.add.at(tensor_val, rows, sr_val)
    tensor.set(tensor_val, place)
    return tensor_val, sr_val
class TestBase(unittest.TestCase): class TestBase(unittest.TestCase):
def setup(self, centered, epsilon=1e-6): def setup(self,
place,
is_sparse,
centered,
size,
row_num=None,
epsilon=1e-6):
np.random.seed(5) # fix seed np.random.seed(5) # fix seed
self.scope = fluid.global_scope()
self.place = place
self.param_name = "param" self.param_name = "param"
self.param = np.random.random((123, 321)).astype("float32") self.param = np.random.random(size).astype("float32")
self.mean_square_name = "mean_square" self.mean_square_name = "mean_square"
self.mean_square = np.random.random((123, 321)).astype("float32") self.mean_square = np.random.uniform(
low=1, high=2, size=size).astype("float32")
self.mean_grad_name = "mean_grad" self.mean_grad_name = "mean_grad"
self.mean_grad = np.random.random((123, 321)).astype("float32") self.mean_grad = np.random.random(size).astype("float32")
self.lr_name = "lr" self.lr_name = "lr"
self.learning_rate = np.array([0.01]).astype("float32") self.learning_rate = np.array([0.01]).astype("float32")
self.grad_name = "grad" self.grad_name = "grad"
self.grad = np.random.random((123, 321)).astype("float32")
self.is_sparse = is_sparse
if self.is_sparse:
self.grad_sr_name = "@selected_rows@"
self.grad, self.grad_sr = create_selected_rows_and_tensor(
self.scope, place, size[0], row_num, size[1])
else:
self.grad = np.random.random(size).astype("float32")
grad_tensor = self.scope.var(self.grad_name).get_tensor()
grad_tensor.set(self.grad, place)
self.moment_name = "moment" self.moment_name = "moment"
self.moment = np.zeros((123, 321)).astype("float32") self.moment = np.random.uniform(
low=0, high=1, size=size).astype("float32")
self.epsilon = epsilon self.epsilon = epsilon
self.decay = 0.9 self.decay = 0.9
self.momentum = 0.0 self.momentum = 0.1
self.centered = centered self.centered = centered
self.ms_out = self.decay * self.mean_square + (1 - self.decay self.ms_out = self.decay * self.mean_square + (1 - self.decay
...@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase): ...@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase):
self.param_out = self.param - self.moment_out self.param_out = self.param - self.moment_out
def check(self,
actual_t,
expect_t,
place,
out_name,
atol=1e-5,
equal_nan=False):
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+ str(expect_t) + "\n" + "But Got" + str(actual_t))
class TestRmspropOp(TestBase):
def check_with_place(self, place, centered, epsilon):
self.setup(centered, epsilon)
scope = core.Scope()
# create and initialize Param Variable # create and initialize Param Variable
param = scope.var(self.param_name).get_tensor() self.param_tensor = self.scope.var(self.param_name).get_tensor()
param.set(self.param, place) self.param_tensor.set(self.param, place)
mean_square = scope.var(self.mean_square_name).get_tensor() self.mean_square_tensor = self.scope.var(
mean_square.set(self.mean_square, place) self.mean_square_name).get_tensor()
self.mean_square_tensor.set(self.mean_square, place)
lr = scope.var(self.lr_name).get_tensor() lr = self.scope.var(self.lr_name).get_tensor()
lr.set(self.learning_rate, place) lr.set(self.learning_rate, place)
grad = scope.var(self.grad_name).get_tensor() self.moment_tensor = self.scope.var(self.moment_name).get_tensor()
grad.set(self.grad, place) self.moment_tensor.set(self.moment, place)
moment = scope.var(self.moment_name).get_tensor() if self.centered:
moment.set(self.moment, place) self.mean_grad_tensor = self.scope.var(
self.mean_grad_name).get_tensor()
self.mean_grad_tensor.set(self.mean_grad, place)
# create and run sgd operator def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol),
"Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+ str(expect_t) + "\n" + "But Got" + str(actual_t))
if self.centered:
mean_grad = scope.var(self.mean_grad_name).get_tensor() class TestRmspropOp(TestBase):
mean_grad.set(self.mean_grad, place) def check_with_place(self,
place,
rmsprop_op = Operator( is_sparse,
"rmsprop", centered,
Param=self.param_name, size,
Grad=self.grad_name, row_num=None,
MeanSquare=self.mean_square_name, epsilon=1e-6):
MeanGrad=self.mean_grad_name, self.setup(place, is_sparse, centered, size, row_num, epsilon)
Moment=self.moment_name, self.run_and_check()
LearningRate=self.lr_name,
ParamOut=self.param_name, def run_and_check(self):
MeanSquareOut=self.mean_square_name, grad_name = self.grad_sr_name if self.is_sparse else self.grad_name
MomentOut=self.moment_name,
MeanGradOut=self.mean_grad_name, kwargs = {
epsilon=self.epsilon, 'Param': self.param_name,
decay=self.decay, 'Grad': grad_name,
momentum=self.momentum, 'MeanSquare': self.mean_square_name,
centered=True) 'Moment': self.moment_name,
else: 'LearningRate': self.lr_name,
rmsprop_op = Operator( 'ParamOut': self.param_name,
"rmsprop", 'MeanSquareOut': self.mean_square_name,
Param=self.param_name, 'MomentOut': self.moment_name,
Grad=self.grad_name, 'epsilon': self.epsilon,
MeanSquare=self.mean_square_name, 'decay': self.decay,
Moment=self.moment_name, 'momentum': self.momentum,
LearningRate=self.lr_name, 'centered': self.centered
ParamOut=self.param_name, }
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=False)
rmsprop_op.run(scope, place)
atol = 1e-5
equal_nan = False
if self.centered: if self.centered:
atol = 1e-3 kwargs['MeanGrad'] = self.mean_grad_name
equal_nan = True kwargs['MeanGradOut'] = self.mean_grad_name
rmsprop_op = Operator('rmsprop', **kwargs)
atol = 1e-6
rmsprop_op.run(self.scope, self.place)
self.check( self.check(
np.array(mean_square), self.ms_out, place, self.mean_square_name) np.array(self.mean_square_tensor),
self.ms_out,
self.place,
self.mean_square_name,
atol=atol)
self.check( self.check(
np.array(moment), np.array(self.moment_tensor),
self.moment_out, self.moment_out,
place, self.place,
self.moment_name, self.moment_name,
atol=atol, atol=atol)
equal_nan=equal_nan)
self.check( self.check(
np.array(param), np.array(self.param_tensor),
self.param_out, self.param_out,
place, self.place,
self.param_name, self.param_name,
atol=atol, atol=atol)
equal_nan=equal_nan)
if self.centered: if self.centered:
self.check( self.check(
np.array(mean_grad), self.mg_out, place, self.mean_grad_name) np.array(self.mean_grad_tensor), self.mg_out, self.place,
self.mean_grad_name)
def test_rmsprop(self): def test_rmsprop(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
size = (128, 320)
for place in places: for place in places:
self.check_with_place(place, False, 1e-6) for centered in [False, True]:
self.check_with_place(place, False, 1e-10) with fluid.scope_guard(core.Scope()):
self.check_with_place(place, True, 1e-6) self.check_with_place(
self.check_with_place(place, True, 1e-10) place, is_sparse=False, centered=centered, size=size)
with fluid.scope_guard(core.Scope()):
self.check_with_place(
place,
is_sparse=True,
centered=centered,
row_num=512,
size=size)
with fluid.scope_guard(core.Scope()):
self.check_with_place(
place,
is_sparse=True,
centered=centered,
row_num=60,
size=size)
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import six
import numpy as np
from op_test import OpTest
class TestSequenceUnpadOp(OpTest):
    """Unit test for the ``sequence_unpad`` operator.

    A random padded input ``X`` is generated and the expected output is built
    with NumPy: for every row ``i`` of ``X`` only the first ``length[i]``
    entries along axis 1 are kept, and the kept pieces are joined along
    axis 0.
    """

    def init(self):
        """Set the test configuration; subclasses override this method."""
        self.dtype = "float32"
        self.x_shape = (3, 5)
        # One valid length per row of X (enforced by the assert in compute()).
        self.length = [2, 3, 4]

    def compute(self):
        """Populate ``self.inputs`` and the expected ``self.outputs``."""
        assert len(self.length) == self.x_shape[0]

        x = np.random.random(self.x_shape).astype(self.dtype)

        # Keep the leading `valid_len` entries of each row, then join all of
        # the kept pieces along the first axis.
        pieces = [row[0:valid_len] for row, valid_len in zip(x, self.length)]
        out = np.concatenate(pieces, axis=0)

        # A 2-D input yields a column vector; otherwise the dimensions after
        # the first two of X are carried over unchanged.
        trailing_dims = (1, ) if len(self.x_shape) == 2 else self.x_shape[2:]
        out_shape = (sum(self.length), ) + trailing_dims

        length_column = np.array(self.length).astype('int64').reshape(-1, 1)
        self.inputs = {'X': x, 'Length': length_column}
        # The expected output is paired with a one-level LoD holding the
        # per-sequence lengths.
        self.outputs = {'Out': (out.reshape(out_shape), [self.length])}

    def setUp(self):
        """Select the operator under test and prepare its inputs/outputs."""
        self.op_type = 'sequence_unpad'
        self.init()
        self.compute()

    def test_check_output(self):
        """Run the output check (presumably inherited from OpTest)."""
        self.check_output()

    def test_check_grad(self):
        """Run the gradient check of ``Out`` with respect to ``X``."""
        self.check_grad(["X"], "Out")
class TestSequenceUnpadOp2(TestSequenceUnpadOp):
    """Same test as the base class, but with a 4-D float32 input."""

    def init(self):
        """Configure three sequences padded to length 5 with (4, 3) features."""
        self.dtype = "float32"
        self.x_shape = (3, 5, 4, 3)
        self.length = [2, 3, 4]
class TestSequenceUnpadOp3(TestSequenceUnpadOp):
    """Same test as the base class, but with a 5-D float64 input."""

    def init(self):
        """Configure four sequences padded to length 5 with (3, 3, 6) features."""
        self.dtype = "float64"
        self.x_shape = (4, 5, 3, 3, 6)
        self.length = [5, 2, 3, 4]
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册