diff --git a/Dockerfile b/Dockerfile index 402adee2ea2822250ebc8f6229fd6a44545d58e5..634be18a51bf61e96a8bf6f263b6674a7932d6e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh # and its size is only one-third of the official one. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ +RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ tar -xz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/lib /usr diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index bc36683a9facc253e7b9feb0c5a56e79491fb9b0..f61770514eb05a99c140cdb18575c89aa5235c14 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(module "framework") if (NOT WIN32) -copy(framework_lib DEPS framework_py_proto - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} -) -else() -copy(framework_lib +set(framework_lib_deps framework_py_proto) +endif(NOT WIN32) +copy(framework_lib DEPS ${framework_lib_deps} SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} + ${src_dir}/${module}/ir/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir ) -endif(NOT WIN32) set(module "memory") copy(memory_lib @@ -161,7 +158,8 @@ set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "platform") diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md index fa2b930be0d26d816566599cece8afbedc1157e0..6e5f77fec8a894c390ced8c93ee344fd8d27370e 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md @@ -60,6 +60,7 @@ 图3. 编码器-解码器框架 + #### 编码器 编码阶段分为三步: @@ -81,7 +82,7 @@ 机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: 1. 
每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下: $$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$ -其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 +其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下: $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ @@ -93,6 +94,7 @@ $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ 机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)。 + ### 柱搜索算法 柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`你好`”,就算目标语言字典中只有3个词(``, ``, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md index 9900dfb9a67dc6f8940bd7dd3abfa15ac8a3488f..8477cf32146c33947ced447c8bdd287a3e1e71f5 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md @@ -149,6 +149,8 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): 网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。 + + ### 栈式双向LSTM 栈式双向神经网络`stacked_lstm_net`的代码片段如下: diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md index 2c68cdac4f10319359b74bc92569dfd3f65380b5..904d99fe2ffc9ead69a86c9763568a5c098348d5 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md @@ -50,7 +50,7 @@ similarity: -0.0997506977351 ``` -以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[应用模型](#应用模型)中详细描述用法。 +以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[模型应用](#模型应用)中详细描述用法。 ## 模型概览 @@ -189,6 +189,7 @@ dream that one day 最后,每个输入会按其单词次在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。 + ## 编程实现 本配置的模型结构如下图所示: @@ -349,6 +350,7 @@ Step 20: Average Cost 5.766995 ... 
``` + ## 模型应用 在模型训练后,我们可以用它做一些预测。 diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md index e6f89b23a95d1a07565f3e0a285e9c3f921930df..ac36c4ecf6b9b716fe5f0dbe2346e64918c22242 100644 --- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md @@ -102,7 +102,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层 池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。 -更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。 +更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类]( https://github.com/PaddlePaddle/book/tree/develop/03.image_classification )教程。 ### 常见激活函数介绍 - sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $ diff --git a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md index a2f30823a6fcd379f94e6e98d043b0d00681827f..84987ea5daee9abd0fe2fe71bdfde62ea3388ab5 100644 --- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md +++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md @@ -149,7 +149,7 @@ python setup.py bdist_wheel pip install --upgrade dist/visualdl-*.whl ``` -如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/how_to_dev_frontend_en.md) +如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md) ## SDK diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index bfc649017f19d67660bd11d590134cf56772bb27..f5235f70ad79616801110644999d511eeda33a32 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,20 +1,35 @@ +set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) +file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. 
DO NOT EDIT!\n\n") +file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") +function(pass_library TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass) + file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") + set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE) +endfunction() + cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node graph_helper) -cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) -cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) -cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector) -cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) -cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass) -cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) -cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector) + +pass_library(graph_to_program_pass) +pass_library(graph_viz_pass) +pass_library(fc_fuse_pass) +pass_library(attention_lstm_fuse_pass) +pass_library(infer_clean_graph_pass) +pass_library(fc_lstm_fuse_pass) +pass_library(seq_concat_fc_fuse_pass) +set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) -cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto) +cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 0278ade6763ec614701674691797d766878a378e..d7580a1cfd689c122ea1e7db5918e2cd8711718f 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -99,17 +99,13 @@ void FindWhileOp(Graph* graph) { auto* cell_init = graph->RetriveNode(6); auto* hidden_init = graph->RetriveNode(8); -#define LINK_TO(node0, node1) \ - node0->outputs.push_back(node1); \ - node1->inputs.push_back(node0); - auto* lstm_op = graph->CreateOpNode(&op_desc); PrepareParameters(graph, param); - LINK_TO(X, lstm_op); - LINK_TO(cell_init, lstm_op); - LINK_TO(hidden_init, lstm_op); - LINK_TO(lstm_op, LSTMOUT); + IR_NODE_LINK_TO(X, lstm_op); + IR_NODE_LINK_TO(cell_init, lstm_op); + IR_NODE_LINK_TO(hidden_init, lstm_op); + IR_NODE_LINK_TO(lstm_op, LSTMOUT); GraphSafeRemoveNodes(graph, marked_nodes); } diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 
513742bab69d465aac1bfb7bcef2fe89108c14a0..5a4ebd6f3de555acccd72c61bd377ffd8ce69780 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -21,74 +21,26 @@ namespace paddle { namespace framework { namespace ir { -bool VarOutLinksToOp(Node* node, const std::string& op_type) { - for (auto* out : node->outputs) { - if (out->IsOp() && out->Op()->Type() == op_type) { - return true; - } - } - return false; -} - -void BuildFCPattern(PDPattern* pattern) { - // Create Operators - auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul"); - auto* elementwise_add_op = - pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add"); - // Create variables - // w - auto* mul_weight_var = pattern->NewNode("mul_weight") - ->AsInput() - ->assert_is_op_nth_input("mul", "Y", 0); - // x - auto* mul_tmp_var = pattern->NewNode("mul_tmp_var") - ->AsInput() - ->assert_is_op_nth_input("mul", "X", 0); - // intermediate variable, will be removed in the IR after fuse. - auto* mul_out_var = pattern->NewNode("mul_out") - ->AsIntermediate() - ->assert_is_only_output_of_op("mul") - ->assert_is_op_input("elementwise_add"); - // bias - auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar") - ->assert_is_op_input("elementwise_add") - ->AsInput(); - // output - auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out") - ->AsOutput() - ->assert_is_op_output("elementwise_add"); - - mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var}); - elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var}) - .LinksTo({elementwise_add_out_var}); -} - -// Replace the node `from` in the links to `to` -bool LinksReplace(std::vector* links, Node* from, Node* to) { - for (auto*& n : *links) { - if (n == from) { - n = to; - return true; - } - } - return false; -} - std::unique_ptr FCFusePass::ApplyImpl( std::unique_ptr graph) const { PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("fc", graph.get()); + FusePassBase::Init("fc_fuse", graph.get()); std::unordered_set nodes2delete; GraphPatternDetector gpd; - BuildFCPattern(gpd.mutable_pattern()); - -#define GET_NODE(id) \ - PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \ - "pattern has no Node called %s", #id); \ - auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id)); \ - PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + // BuildFCPattern(gpd.mutable_pattern()); + auto* x = gpd.mutable_pattern() + ->NewNode("fc_fuse/x") + ->AsInput() + ->assert_is_op_input("mul", "X"); + patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/); + +#define GET_NODE(id) \ + PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id); int found_fc_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -98,43 +50,33 @@ std::unique_ptr FCFusePass::ApplyImpl( // scenerio. // FC's fusion is simple, just op fuse, no need to process the // parameters. 
- GET_NODE(mul_tmp_var); // x - GET_NODE(mul_weight); // Y - GET_NODE(elementwise_add_tmpvar); // bias - GET_NODE(elementwise_add_out); // Out - GET_NODE(mul); // MUL op - GET_NODE(elementwise_add); // ELEMENT_ADD op - GET_NODE(mul_out); // tmp + GET_NODE(x); // x + GET_NODE(w); // Y + GET_NODE(fc_bias); // bias + GET_NODE(fc_out); // Out + GET_NODE(mul); // MUL op + GET_NODE(elementwise_add); // ELEMENT_ADD op + GET_NODE(mul_out); // tmp #undef GET_NODE // Create an FC Node. OpDesc desc; - std::string fc_x_in = mul_tmp_var->Name(); - std::string fc_Y_in = mul_weight->Name(); - std::string fc_bias_in = elementwise_add_tmpvar->Name(); - std::string fc_out = elementwise_add_out->Name(); + std::string fc_x_in = x->Name(); + std::string fc_Y_in = w->Name(); + std::string fc_bias_in = fc_bias->Name(); + std::string fc_out_out = fc_out->Name(); desc.SetInput("Input", std::vector({fc_x_in})); desc.SetInput("W", std::vector({fc_Y_in})); desc.SetInput("Bias", std::vector({fc_bias_in})); - desc.SetOutput("Out", std::vector({fc_out})); + desc.SetOutput("Out", std::vector({fc_out_out})); desc.SetType("fc"); auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. - fc_node->inputs = - std::vector({mul_tmp_var, mul_weight, elementwise_add_tmpvar}); - fc_node->outputs.push_back(elementwise_add_out); - - // Update link relatons - PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node)); - PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node)); - PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs, - elementwise_add, fc_node)); - PADDLE_ENFORCE( - LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node)); + GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); - // Drop old nodes - graph->RemoveNode(mul); - graph->RemoveNode(elementwise_add); - graph->RemoveNode(mul_out); // tmp variable + IR_NODE_LINK_TO(x, fc_node); + IR_NODE_LINK_TO(w, fc_node); + IR_NODE_LINK_TO(fc_bias, fc_node); + IR_NODE_LINK_TO(fc_node, fc_out); found_fc_count++; }; diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index c404a6c44ccea8287ddfad976889a9f80cf6bad9..00f5e7fad2ef5d42eb0de9703389e910090d93c1 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -121,15 +121,11 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, #undef TMP_NEW #undef TMP_NAME -#define LINK_TO(a, b) \ - a->outputs.push_back(b); \ - b->inputs.push_back(a); - LINK_TO(input_n, op); - LINK_TO(weight_x_n, op); - LINK_TO(weight_h_n, op); - LINK_TO(bias_n, op); - LINK_TO(op, hidden_n); -#undef LINK_TO + IR_NODE_LINK_TO(input_n, op); + IR_NODE_LINK_TO(weight_x_n, op); + IR_NODE_LINK_TO(weight_h_n, op); + IR_NODE_LINK_TO(bias_n, op); + IR_NODE_LINK_TO(op, hidden_n); return op; }; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index f651ab635eadc9f248964e91dceebf3aa9c42926..a4da69a0a2ea44806b68a27647213759ebd387b1 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -111,6 +111,11 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { return false; } } + for (auto& item : pdnodes2nodes_) { + for (auto& n : item.second) { + GetMarkedNodes(const_cast(&graph)).insert(n); + } + } VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); @@ -278,7 +283,7 @@ void 
GraphPatternDetector::RemoveOverlappedMatch( for (const auto& subgraph : *subgraphs) { bool valid = true; for (auto& item : subgraph) { - if (node_set.count(item.second)) { + if (item.first->IsIntermediate() && node_set.count(item.second)) { valid = false; break; } @@ -334,22 +339,22 @@ PDNode& PDNode::LinksFrom(const std::vector& others) { } PDNode* PDNode::assert_is_op() { - asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); }); + asserts_.emplace_back([](Node* x) { return x && x->IsOp(); }); return this; } PDNode* PDNode::assert_is_op(const std::string& op_type) { - asserts_.emplace_back([this, op_type](Node* x) { + asserts_.emplace_back([op_type](Node* x) { return x && x->IsOp() && x->Op()->Type() == op_type; }); return this; } PDNode* PDNode::assert_is_var() { - asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); }); + asserts_.emplace_back([](Node* x) { return x && x->IsVar(); }); return this; } PDNode* PDNode::assert_var_not_persistable() { assert_is_var(); - asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); }); + asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); }); return this; } PDNode* PDNode::assert_is_persistable_var() { @@ -491,14 +496,16 @@ void GraphSafeRemoveNodes(Graph* graph, for (auto it = node->inputs.begin(); it != node->inputs.end();) { if (nodes.count(*it)) { it = const_cast(node)->inputs.erase(it); - } else + } else { it++; + } } for (auto it = node->outputs.begin(); it != node->outputs.end();) { if (nodes.count(*it)) { it = const_cast(node)->outputs.erase(it); - } else + } else { it++; + } } } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 024ce8ce55616cc5e0eaced4a27a6e1fb004af2c..9d67c4a6997dfe19561f37bf3ea76eba8b59ff35 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -245,6 +245,8 @@ class GraphPatternDetector { void UniquePatterns(std::vector* subgraphs); // Remove overlapped match subgraphs, when overlapped, keep the previous one. + // The intermediate PDNodes will be removed, so can't shared by multiple + // patterns. void RemoveOverlappedMatch(std::vector* subgraphs); // Validate whether the intermediate nodes are linked by external nodes. 
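The fuse-pass changes in this patch all follow the same shape: match a subgraph with GraphPatternDetector, create the fused op node, relink the surviving variables through the shared IR_NODE_LINK_TO macro (added in the next hunk), and drop the replaced nodes via GraphSafeRemoveNodes, which also scrubs the dangling input/output edges as shown above. A minimal, self-contained sketch of that relink-and-remove step — using a simplified Node type rather than Paddle's actual ir::Node — looks roughly like this:

```c++
// Standalone illustration only; the real Node, IR_NODE_LINK_TO and
// GraphSafeRemoveNodes live in paddle/fluid/framework/ir/.
#include <algorithm>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;
};

// Same shape as the IR_NODE_LINK_TO macro added in the next hunk:
// a feeds b, so a gains an output edge and b gains an input edge.
#define IR_NODE_LINK_TO(a, b)  \
  (a)->outputs.push_back(b);   \
  (b)->inputs.push_back(a);

// Mirrors the edge-cleanup loops of GraphSafeRemoveNodes: erase the chosen
// nodes, then drop any edges on the survivors that still point at them.
void SafeRemoveNodes(std::vector<std::unique_ptr<Node>>* graph,
                     const std::unordered_set<Node*>& victims) {
  for (auto& n : *graph) {
    auto prune = [&](std::vector<Node*>* edges) {
      edges->erase(std::remove_if(edges->begin(), edges->end(),
                                  [&](Node* e) { return victims.count(e); }),
                   edges->end());
    };
    prune(&n->inputs);
    prune(&n->outputs);
  }
  graph->erase(std::remove_if(graph->begin(), graph->end(),
                              [&](const std::unique_ptr<Node>& n) {
                                return victims.count(n.get()) > 0;
                              }),
               graph->end());
}

int main() {
  std::vector<std::unique_ptr<Node>> graph;
  graph.emplace_back(new Node{"x"});
  graph.emplace_back(new Node{"mul"});
  graph.emplace_back(new Node{"fc"});
  Node* x = graph[0].get();
  Node* mul = graph[1].get();
  Node* fc = graph[2].get();
  IR_NODE_LINK_TO(x, mul);
  IR_NODE_LINK_TO(x, fc);          // relink the input to the fused op
  SafeRemoveNodes(&graph, {mul});  // drop the old op and its edges
  std::cout << graph.size() << " nodes, x has " << x->outputs.size()
            << " output edge\n";   // prints: 2 nodes, x has 1 output edge
}
```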
@@ -295,6 +297,10 @@ PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x); } // namespace patterns +#define IR_NODE_LINK_TO(a, b) \ + a->outputs.push_back(b); \ + b->inputs.push_back(a); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc index 7e5c86b033a7c69a306491cf4bf8d099018c5f19..6c466fb21fb46e09961dc874e9e39655f83d17c6 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) { return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3"); }, "OP0"); - auto* any_var = x.mutable_pattern()->NewNode( - [](Node* node) { return node->IsVar(); }, "VAR"); + auto* any_var = x.mutable_pattern() + ->NewNode([](Node* node) { return node->IsVar(); }, "VAR") + ->AsIntermediate(); auto* any_op1 = x.mutable_pattern()->NewNode( [](Node* node) { return node->IsOp(); }, "OP1"); diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc index f885567da1965b997b2063e06c839af95b43e1e1..7713ed1eab88ee4fa16d52e7425075ae66f721a3 100644 --- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -13,42 +13,41 @@ // limitations under the License. #include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { namespace ir { -class InferCleanGraphPass : public Pass { +class InferCleanGraphPass : public FusePassBase { public: virtual ~InferCleanGraphPass() {} protected: std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + FusePassBase::Init("original_graph", graph.get()); PADDLE_ENFORCE(graph.get()); auto is_valid_node = [](Node* x) { return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); }; - std::unordered_set invalid_nodes; + std::unordered_set invalid_nodes; + int valid_op = 0; for (auto* node : graph->Nodes()) { if (is_valid_node(node)) { invalid_nodes.insert(node); + } else if (node->IsOp()) { + // Collect all the operators to help tracking number of operators. + ++valid_op; } } - // remove nodes from the graph. - for (auto* node : invalid_nodes) { - graph->RemoveNode(node); - } + GraphSafeRemoveNodes(graph.get(), invalid_nodes); - // clean edges. 
- for (auto* node : graph->Nodes()) { - CleanEdges(&node->inputs, invalid_nodes); - CleanEdges(&node->outputs, invalid_nodes); - } + AddStatis(valid_op); return graph; } diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index a776a898a5ee13b4dde12460dce71433268fb9d4..e1a441d09aaa3647c4b2a582210a2c7e2b64e0da 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -219,16 +219,13 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( op_desc.SetAttr("fc_activation", act->Op()->Type()); auto* op_node = graph->CreateOpNode(&op_desc); -// Add links -#define NODE_LINKS(a, b) \ - a->outputs.push_back(b); \ - b->inputs.push_back(a); - NODE_LINKS(fc_w, op_node); - NODE_LINKS(fc_bias, op_node); - NODE_LINKS(concat_in0, op_node); - NODE_LINKS(sequence_expand0_in, op_node); - NODE_LINKS(sequence_expand1_in, op_node); - NODE_LINKS(op_node, fc_out); + // Add links + IR_NODE_LINK_TO(fc_w, op_node); + IR_NODE_LINK_TO(fc_bias, op_node); + IR_NODE_LINK_TO(concat_in0, op_node); + IR_NODE_LINK_TO(sequence_expand0_in, op_node); + IR_NODE_LINK_TO(sequence_expand1_in, op_node); + IR_NODE_LINK_TO(op_node, fc_out); // Clean nodes. std::unordered_set marked_nodes; @@ -241,7 +238,6 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( marked_nodes.erase(sequence_expand0_in); marked_nodes.erase(sequence_expand1_in); marked_nodes.erase(fc_out); - GraphSafeRemoveNodes(graph, marked_nodes); }); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 86392078b356df774fbc47aed9214e9f10fe33be..2006e3b24f71d0ae32b4e2ae34f1a1e4d3a82f91 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? cc_library(paddle_fluid_api SRCS io.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) @@ -22,7 +22,7 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) #endif() # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api) +cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") @@ -32,6 +32,7 @@ endif() # Create shared library cc_library(paddle_fluid_shared SHARED SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc + ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc DEPS ${fluid_modules} paddle_fluid_api) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index cc0dd0d492d42e9552c9ce081e268330599104f0..dadc8a53706fb9edff884dcf6d49168bfef3aa30 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -33,7 +33,7 @@ function (inference_analysis_test TARGET) endif() cc_test(${TARGET} SRCS "${analysis_test_SRCS}" - DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS} + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) @@ -56,25 +56,13 @@ if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING) endif() inference_analysis_test(test_analyzer SRCS analyzer_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis - analysis_predictor - # ir - fc_fuse_pass - fc_lstm_fuse_pass - seq_concat_fc_fuse_pass - graph_viz_pass - infer_clean_graph_pass - graph_pattern_detector - infer_clean_graph_pass - attention_lstm_fuse_pass - paddle_inference_api - pass + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) -inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api) -inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid) +inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) +inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index ec1f3979a74bd86ee7402bca441e95d3d177d113..59e103e1179240a100d492a2475573c8188bebe7 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/profiler.h" @@ -327,9 +328,20 @@ void TestDituRNNPrediction(const std::string &model_path, LOG(INFO) << "fused " << item.first << " " << item.second; } - ASSERT_TRUE(fuse_statis.count("fc")); - 
EXPECT_EQ(fuse_statis.at("fc"), 1); - EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 1); + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + EXPECT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. } } @@ -357,10 +369,3 @@ TEST(Analyzer, DituRNN_with_analysis_with_IR) { } // namespace analysis } // namespace inference } // namespace paddle - -USE_PASS(fc_fuse_pass); -USE_PASS(seq_concat_fc_fuse_pass); -USE_PASS(fc_lstm_fuse_pass); -USE_PASS(graph_viz_pass); -USE_PASS(infer_clean_graph_pass); -USE_PASS(attention_lstm_fuse_pass); diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc index 6a13c60e7b2ebf645b12d5ddf83ef6ab3a2e83bd..367c25805d05f8d10fb8341158760ac6356a5c48 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" namespace paddle { namespace inference { @@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) { } // namespace analysis } // namespace inference } // namespace paddle - -USE_PASS(graph_viz_pass); -USE_PASS(infer_clean_graph_pass); -USE_PASS(attention_lstm_fuse_pass); -USE_PASS(fc_lstm_fuse_pass); -USE_PASS(seq_concat_fc_fuse_pass); -USE_PASS(fc_fuse_pass); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index adfe4392448557a30cd834022b9a5d21d9086b95..3a43c72e33b3d5d8910b554021bb1c6a626edd93 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,10 +18,7 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager - graph_viz_pass fc_fuse_pass - infer_clean_graph_pass - ) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 33862232bdaae817b9ca72879605386c32ed3e8b..e87abd2feeff7769eb223f83a7a28f5cb3337cdb 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { @@ -133,7 +134,3 @@ std::unique_ptr CreatePaddlePredictor< } } // namespace paddle - -USE_PASS(fc_fuse_pass); -USE_PASS(graph_viz_pass); -USE_PASS(infer_clean_graph_pass); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index bdc9a15d543818da94ac2acf34ecabbbbae3291e..ce4728ab8046f886fc40af3644d855a4f971fb71 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -16,6 +16,7 @@ #include #include +#include #include #include 
#include diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map index 5203784dc1fcb672eb6a26d9dfd3ffbe02e08038..7e5cae04b81e6ce759b92f6c4b921ecf974e8260 100644 --- a/paddle/fluid/inference/paddle_fluid.map +++ b/paddle/fluid/inference/paddle_fluid.map @@ -1,6 +1,7 @@ { global: *paddle*; + *Pass*; local: *; }; diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h new file mode 100644 index 0000000000000000000000000000000000000000..0dee1781623d5a62830545c0952e5aadbe37accb --- /dev/null +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +/* + * transform that computes target bounding-box regression deltas + * given proposal boxes and ground-truth boxes. + */ +template +inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes, + const framework::Tensor& gt_boxes, const T* weights, + const bool normalized, framework::Tensor* box_delta) { + auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); + auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); + auto trg = framework::EigenTensor::From(*box_delta); + T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y; + for (int64_t i = 0; i < box_num; ++i) { + ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + (normalized == false); + ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + (normalized == false); + ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w; + ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h; + + gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + (normalized == false); + gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + (normalized == false); + gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w; + gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h; + + trg(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w; + trg(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h; + trg(i, 2) = std::log(gt_w / ex_w); + trg(i, 3) = std::log(gt_h / ex_h); + + if (weights) { + trg(i, 0) = trg(i, 0) / weights[0]; + trg(i, 1) = trg(i, 1) / weights[1]; + trg(i, 2) = trg(i, 2) / weights[2]; + trg(i, 3) = trg(i, 3) / weights[3]; + } + } +} + +template +void Gather(const T* in, const int in_stride, const int* index, const int num, + T* out) { + const int stride_bytes = in_stride * sizeof(T); + for (int i = 0; i < num; ++i) { + int id = index[i]; + memcpy(out + i * in_stride, in + id * in_stride, stride_bytes); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 0571c46f6be99c9a06b7dd2abb310eeda506ecd5..be06dc19743cfa6f093bcb3f4e9f91af315d4211 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -14,6 +14,7 @@ 
limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/math_function.h" @@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes, } } -template -void BoxToDelta(int box_num, const Tensor& ex_boxes, const Tensor& gt_boxes, - const std::vector& weights, Tensor* box_delta) { - auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); - auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); - auto box_delta_et = framework::EigenTensor::From(*box_delta); - T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y; - for (int64_t i = 0; i < box_num; ++i) { - ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + 1; - ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + 1; - ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w; - ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h; - - gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + 1; - gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + 1; - gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w; - gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h; - - box_delta_et(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]; - box_delta_et(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]; - box_delta_et(i, 2) = log(gt_w / ex_w) / ex_w / weights[2]; - box_delta_et(i, 3) = log(gt_h / ex_h) / ex_h / weights[3]; - } -} - template std::vector> SampleFgBgGt( const platform::CPUDeviceContext& context, Tensor* iou, @@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, Tensor* sampled_labels, Tensor* sampled_gts) { int fg_num = fg_inds.size(); int bg_num = bg_inds.size(); - int gt_num = fg_num + bg_num; Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; int* fg_inds_data = fg_inds_t.mutable_data({fg_num}, context.GetPlace()); int* bg_inds_data = bg_inds_t.mutable_data({bg_num}, context.GetPlace()); int* gt_box_inds_data = - gt_box_inds_t.mutable_data({gt_num}, context.GetPlace()); + gt_box_inds_t.mutable_data({fg_num}, context.GetPlace()); int* gt_label_inds_data = gt_label_inds_t.mutable_data({fg_num}, context.GetPlace()); std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data); @@ -303,18 +278,20 @@ std::vector SampleRoisForOneImage( // Gather boxes and labels Tensor sampled_boxes, sampled_labels, sampled_gts; - int boxes_num = fg_inds.size() + bg_inds.size(); + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + int boxes_num = fg_num + bg_num; framework::DDim bbox_dim({boxes_num, kBoxDim}); sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); sampled_labels.mutable_data({boxes_num}, context.GetPlace()); - sampled_gts.mutable_data(bbox_dim, context.GetPlace()); + sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); GatherBoxesLabels(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); // Compute targets Tensor bbox_targets_single; bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); - BoxToDelta(boxes_num, sampled_boxes, sampled_gts, bbox_reg_weights, + BoxToDelta(fg_num, sampled_boxes, sampled_gts, nullptr, false, &bbox_targets_single); // Scale rois @@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { auto rpn_rois_lod = rpn_rois->lod().back(); auto gt_classes_lod = gt_classes->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back(); - for (size_t i = 0; i < n; ++i) { + for (int 
i = 0; i < n; ++i) { Tensor rpn_rois_slice = rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); Tensor gt_classes_slice = diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index fcdcafae7273afa6887ee531dfc37ef833b92d68..ebe6830eccd87a156768eb0d4b96220bcc9f4edc 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel { rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, context.GetPlace()); - rpn_roi_probs->mutable_data({scores->numel() / 4, 1}, - context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); Tensor bbox_deltas_swap, scores_swap; bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, @@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel { CPUGather(ctx, proposals, keep, &bbox_sel); CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { - return std::make_pair(bbox_sel, scores_sel); + return std::make_pair(bbox_sel, scores_filter); } Tensor keep_nms = NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 177ff7cf187bc9daf69889e99ca57ae18766de90..88757f25cd9a5789758640de2d9cae0b12350b25 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { @@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { auto in_dims = ctx->GetInputDim("DistMat"); PADDLE_ENFORCE_EQ(in_dims.size(), 2, "The rank of Input(DistMat) must be 2."); + + ctx->SetOutputDim("LocationIndex", {-1}); + ctx->SetOutputDim("ScoreIndex", {-1}); + ctx->SetOutputDim("TargetLabel", {-1, 1}); + ctx->SetOutputDim("TargetBBox", {-1, 4}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("DistMat")->type()), + platform::CPUPlace()); } }; template class RpnTargetAssignKernel : public framework::OpKernel { public: + void Compute(const framework::ExecutionContext& context) const override { + auto* anchor_t = context.Input("Anchor"); // (H*W*A) * 4 + auto* gt_bbox_t = context.Input("GtBox"); + auto* dist_t = context.Input("DistMat"); + + auto* loc_index_t = context.Output("LocationIndex"); + auto* score_index_t = context.Output("ScoreIndex"); + auto* tgt_bbox_t = context.Output("TargetBBox"); + auto* tgt_lbl_t = context.Output("TargetLabel"); + + auto lod = dist_t->lod().back(); + int64_t batch_num = static_cast(lod.size() - 1); + int64_t anchor_num = dist_t->dims()[1]; + PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]); + + int rpn_batch_size = context.Attr("rpn_batch_size_per_im"); + float pos_threshold = context.Attr("rpn_positive_overlap"); + float neg_threshold = context.Attr("rpn_negative_overlap"); + float fg_fraction = context.Attr("fg_fraction"); + + int fg_num_per_batch = static_cast(rpn_batch_size * fg_fraction); + + int64_t max_num = batch_num * anchor_num; + auto place = 
context.GetPlace(); + + tgt_bbox_t->mutable_data({max_num, 4}, place); + auto* loc_index = loc_index_t->mutable_data({max_num}, place); + auto* score_index = score_index_t->mutable_data({max_num}, place); + + Tensor tmp_tgt_lbl; + auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data({max_num}, place); + auto& dev_ctx = context.device_context(); + math::SetConstant iset; + iset(dev_ctx, &tmp_tgt_lbl, static_cast(-1)); + + std::random_device rnd; + std::minstd_rand engine; + int seed = + context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + engine.seed(seed); + + int fg_num = 0; + int bg_num = 0; + for (int i = 0; i < batch_num; ++i) { + Tensor dist = dist_t->Slice(lod[i], lod[i + 1]); + Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]); + auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold, + rpn_batch_size, fg_num_per_batch, engine, + tmp_lbl_data + i * anchor_num); + + int cur_fg_num = fg_bg_gt[0].size(); + int cur_bg_num = fg_bg_gt[1].size(); + std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index, + [i, anchor_num](int d) { return d + i * anchor_num; }); + memcpy(score_index, loc_index, cur_fg_num * sizeof(int)); + std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(), + score_index + cur_fg_num, + [i, anchor_num](int d) { return d + i * anchor_num; }); + + // get target bbox deltas + if (cur_fg_num) { + Tensor fg_gt; + T* gt_data = fg_gt.mutable_data({cur_fg_num, 4}, place); + Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num); + T* tgt_data = tgt_bbox.data(); + Gather(anchor_t->data(), 4, + reinterpret_cast(&fg_bg_gt[0][0]), cur_fg_num, + tgt_data); + Gather(gt_bbox.data(), 4, reinterpret_cast(&fg_bg_gt[2][0]), + cur_fg_num, gt_data); + BoxToDelta(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox); + } + + loc_index += cur_fg_num; + score_index += cur_fg_num + cur_bg_num; + fg_num += cur_fg_num; + bg_num += cur_bg_num; + } + + int lbl_num = fg_num + bg_num; + PADDLE_ENFORCE_LE(fg_num, max_num); + PADDLE_ENFORCE_LE(lbl_num, max_num); + + tgt_bbox_t->Resize({fg_num, 4}); + loc_index_t->Resize({fg_num}); + score_index_t->Resize({lbl_num}); + auto* lbl_data = tgt_lbl_t->mutable_data({lbl_num, 1}, place); + Gather(tmp_lbl_data, 1, score_index_t->data(), lbl_num, + lbl_data); + } + + private: void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max, const int row, const int col, const float pos_threshold, - const float neg_threshold, int64_t* target_label_data, + const float neg_threshold, int64_t* target_label, std::vector* fg_inds, std::vector* bg_inds) const { - int fg_offset = fg_inds->size(); - int bg_offset = bg_inds->size(); + float epsilon = 0.0001; for (int64_t i = 0; i < row; ++i) { const T* v = dist_data + i * col; - T max_dist = *std::max_element(v, v + col); + T max = *std::max_element(v, v + col); for (int64_t j = 0; j < col; ++j) { - T val = dist_data[i * col + j]; - if (val == max_dist) target_label_data[j] = 1; + if (std::abs(max - v[j]) < epsilon) { + target_label[j] = 1; + } } } - // Pick the fg/bg and count the number + // Pick the fg/bg + const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); for (int64_t j = 0; j < col; ++j) { - if (anchor_to_gt_max.data()[j] > pos_threshold) { - target_label_data[j] = 1; - } else if (anchor_to_gt_max.data()[j] < neg_threshold) { - target_label_data[j] = 0; + if (anchor_to_gt_max_data[j] >= pos_threshold) { + target_label[j] = 1; + } else if (anchor_to_gt_max_data[j] < neg_threshold) { + target_label[j] = 0; } - if (target_label_data[j] == 1) { - 
fg_inds->push_back(fg_offset + j); - } else if (target_label_data[j] == 0) { - bg_inds->push_back(bg_offset + j); + if (target_label[j] == 1) { + fg_inds->push_back(j); + } else if (target_label[j] == 0) { + bg_inds->push_back(j); } } } - void ReservoirSampling(const int num, const int offset, - std::minstd_rand engine, + void ReservoirSampling(const int num, std::minstd_rand engine, std::vector* inds) const { std::uniform_real_distribution uniform(0, 1); - const int64_t size = static_cast(inds->size() - offset); - if (size > num) { - for (int64_t i = num; i < size; ++i) { + size_t len = inds->size(); + if (len > static_cast(num)) { + for (size_t i = num; i < len; ++i) { int rng_ind = std::floor(uniform(engine) * i); if (rng_ind < num) - std::iter_swap(inds->begin() + rng_ind + offset, - inds->begin() + i + offset); + std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); } + inds->resize(num); } } - void RpnTargetAssign(const framework::ExecutionContext& ctx, - const Tensor& dist, const float pos_threshold, - const float neg_threshold, const int rpn_batch_size, - const int fg_num, std::minstd_rand engine, - std::vector* fg_inds, std::vector* bg_inds, - int64_t* target_label_data) const { + // std::vector> RpnTargetAssign( + std::vector> SampleFgBgGt( + const platform::CPUDeviceContext& ctx, const Tensor& dist, + const float pos_threshold, const float neg_threshold, + const int rpn_batch_size, const int fg_num, std::minstd_rand engine, + int64_t* target_label) const { auto* dist_data = dist.data(); - int64_t row = dist.dims()[0]; - int64_t col = dist.dims()[1]; - int fg_offset = fg_inds->size(); - int bg_offset = bg_inds->size(); + int row = dist.dims()[0]; + int col = dist.dims()[1]; + + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; // Calculate the max IoU between anchors and gt boxes - Tensor anchor_to_gt_max; - anchor_to_gt_max.mutable_data( - framework::make_ddim({static_cast(col), 1}), - platform::CPUPlace()); - auto& place = *ctx.template device_context() - .eigen_device(); - auto x = EigenMatrix::From(dist); - auto x_col_max = EigenMatrix::From(anchor_to_gt_max); - x_col_max.device(place) = - x.maximum(Eigen::DSizes(0)) - .reshape(Eigen::DSizes(static_cast(col), 1)); + // Map from anchor to gt box that has highest overlap + auto place = ctx.GetPlace(); + Tensor anchor_to_gt_max, anchor_to_gt_argmax; + anchor_to_gt_max.mutable_data({col}, place); + int* argmax = anchor_to_gt_argmax.mutable_data({col}, place); + + auto x = framework::EigenMatrix::From(dist); + auto x_col_max = framework::EigenVector::Flatten(anchor_to_gt_max); + auto x_col_argmax = + framework::EigenVector::Flatten(anchor_to_gt_argmax); + x_col_max = x.maximum(Eigen::DSizes(0)); + x_col_argmax = x.argmax(0).template cast(); + // Follow the Faster RCNN's implementation ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold, - neg_threshold, target_label_data, fg_inds, bg_inds); + neg_threshold, target_label, &fg_inds, &bg_inds); // Reservoir Sampling - ReservoirSampling(fg_num, fg_offset, engine, fg_inds); - int bg_num = rpn_batch_size - (fg_inds->size() - fg_offset); - ReservoirSampling(bg_num, bg_offset, engine, bg_inds); - } + ReservoirSampling(fg_num, engine, &fg_inds); + int fg_num2 = static_cast(fg_inds.size()); + int bg_num = rpn_batch_size - fg_num2; + ReservoirSampling(bg_num, engine, &bg_inds); - void Compute(const framework::ExecutionContext& context) const override { - auto* dist = context.Input("DistMat"); - auto* loc_index = context.Output("LocationIndex"); - auto* 
score_index = context.Output("ScoreIndex"); - auto* tgt_lbl = context.Output("TargetLabel"); - - auto col = dist->dims()[1]; - int64_t n = dist->lod().size() == 0UL - ? 1 - : static_cast(dist->lod().back().size() - 1); - if (dist->lod().size()) { - PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL, - "Only support 1 level of LoD."); + gt_inds.reserve(fg_num2); + for (int i = 0; i < fg_num2; ++i) { + gt_inds.emplace_back(argmax[fg_inds[i]]); } - int rpn_batch_size = context.Attr("rpn_batch_size_per_im"); - float pos_threshold = context.Attr("rpn_positive_overlap"); - float neg_threshold = context.Attr("rpn_negative_overlap"); - float fg_fraction = context.Attr("fg_fraction"); - - int fg_num = static_cast(rpn_batch_size * fg_fraction); - - int64_t* target_label_data = - tgt_lbl->mutable_data({n * col, 1}, context.GetPlace()); + std::vector> fg_bg_gt; + fg_bg_gt.emplace_back(fg_inds); + fg_bg_gt.emplace_back(bg_inds); + fg_bg_gt.emplace_back(gt_inds); - auto& dev_ctx = context.device_context(); - math::SetConstant iset; - iset(dev_ctx, tgt_lbl, static_cast(-1)); - - std::vector fg_inds; - std::vector bg_inds; - std::random_device rnd; - std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); - engine.seed(seed); - - if (n == 1) { - RpnTargetAssign(context, *dist, pos_threshold, neg_threshold, - rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds, - target_label_data); - } else { - auto lod = dist->lod().back(); - for (size_t i = 0; i < lod.size() - 1; ++i) { - Tensor one_ins = dist->Slice(lod[i], lod[i + 1]); - RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold, - rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds, - target_label_data + i * col); - } - } - int* loc_index_data = loc_index->mutable_data( - {static_cast(fg_inds.size())}, context.GetPlace()); - int* score_index_data = score_index->mutable_data( - {static_cast(fg_inds.size() + bg_inds.size())}, - context.GetPlace()); - memcpy(loc_index_data, reinterpret_cast(&fg_inds[0]), - fg_inds.size() * sizeof(int)); - memcpy(score_index_data, reinterpret_cast(&fg_inds[0]), - fg_inds.size() * sizeof(int)); - memcpy(score_index_data + fg_inds.size(), - reinterpret_cast(&bg_inds[0]), bg_inds.size() * sizeof(int)); + return fg_bg_gt; } }; class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { + AddInput("Anchor", + "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); + AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4]."); AddInput( "DistMat", "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " @@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { "ScoreIndex", "(Tensor), The indexes of foreground and background anchors in all " "RPN anchors(The rest anchors are ignored). 
The shape of the " - "ScoreIndex is [F + B], F and B depend on the value of input " - "tensor and attributes."); - AddOutput("TargetLabel", - "(Tensor), The target labels of each anchor with shape " - "[K * M, 1], " - "K and M is the same as they are in DistMat."); + "ScoreIndex is [F + B], F and B are sampled foreground and backgroud " + " number."); + AddOutput("TargetBBox", + "(Tensor), The target bbox deltas with shape " + "[F, 4], F is the sampled foreground number."); + AddOutput( + "TargetLabel", + "(Tensor), The target labels of each anchor with shape " + "[F + B, 1], F and B are sampled foreground and backgroud number."); AddComment(R"DOC( This operator can be, for given the IoU between the ground truth bboxes and the anchors, to assign classification and regression targets to each prediction. diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index f18d09d33e9052929b1ff9b36bb2b371fb513d37..451ec61ba1f7239d92c6dfbad0b2961e74e1bc17 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel { gate_data, frame_size * 3); // calculate activited gate - Eigen::array extents = {batch_size, frame_size}; - Eigen::array u_offsets = {0, 0}; + Eigen::array extents{{batch_size, frame_size}}; + Eigen::array u_offsets{{0, 0}}; ActCompute(context.Attr("gate_activation"), place, g.slice(u_offsets, extents), g.slice(u_offsets, extents)); auto u = g.slice(u_offsets, extents); // update gate - Eigen::array r_offsets = {0, frame_size}; + Eigen::array r_offsets{{0, frame_size}}; ActCompute(context.Attr("gate_activation"), place, g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate @@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel { weight_data + frame_size * frame_size * 2, frame_size, 1, gate_data + frame_size * 2, frame_size * 3); - Eigen::array c_offsets = {0, frame_size * 2}; + Eigen::array c_offsets{{0, frame_size * 2}}; ActCompute(context.Attr("activation"), place, g.slice(c_offsets, extents), g.slice(c_offsets, extents)); auto c = g.slice(c_offsets, extents); // output candidate @@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel { int batch_size = input->dims()[0]; int frame_size = hidden_prev->dims()[1]; - Eigen::array extents = {batch_size, frame_size}; - Eigen::array u_offsets = {0, 0}; + Eigen::array extents{{batch_size, frame_size}}; + Eigen::array u_offsets{{0, 0}}; auto u = g.slice(u_offsets, extents); // update gate - Eigen::array r_offsets = {0, frame_size}; + Eigen::array r_offsets{{0, frame_size}}; auto r = g.slice(r_offsets, extents); // reset gate - Eigen::array c_offsets = {0, frame_size * 2}; + Eigen::array c_offsets{{0, frame_size * 2}}; auto c = g.slice(c_offsets, extents); // output candidate // backward for unactivated update gate diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 50450b62f7b1c0b2b5abf01a43581a0e2d2cd01e..46e20285db6d7acd39dead3994409645adddf494 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) { template __global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const int64_t* input_rois, + const int nthreads, const T* input_data, const T* input_rois, const float spatial_scale, const int channels, const int height, const int width, const int pooled_height, 
     int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward(
     int c = (i / pooled_width / pooled_height) % channels;
     int n = i / pooled_width / pooled_height / channels;
 
-    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    const T* offset_input_rois = input_rois + n * kROISize;
     int roi_batch_ind = roi_batch_id_data[n];
     int roi_start_w = round(offset_input_rois[0] * spatial_scale);
     int roi_start_h = round(offset_input_rois[1] * spatial_scale);
@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward(
 
 template <typename T>
 __global__ void GPUROIPoolBackward(
-    const int nthreads, const int64_t* input_rois, const T* output_grad,
+    const int nthreads, const T* input_rois, const T* output_grad,
     const int64_t* argmax_data, const int num_rois, const float spatial_scale,
     const int channels, const int height, const int width,
     const int pooled_height, const int pooled_width, int* roi_batch_id_data,
@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
 
     GPUROIPoolForward<
        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
-        channels, height, width, pooled_height, pooled_width,
+        output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
+        height, width, pooled_height, pooled_width,
         roi_batch_id_list_gpu.data<int>(),
         out->mutable_data<T>(ctx.GetPlace()),
         argmax->mutable_data<int64_t>(ctx.GetPlace()));
   }
@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
     if (output_grad_size > 0) {
       GPUROIPoolBackward<
          T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-          output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
+          output_grad_size, rois->data<T>(), out_grad->data<T>(),
           argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
           width, pooled_height, pooled_width,
           roi_batch_id_list_gpu.data<int>(),
diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
index c4f739b2c6b2d62ebebcc15fd627ebad040e7b3f..07de7c9f0e070cef7c6f38f8d564ab76910842db 100644
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
     T* output_data = out->mutable_data<T>(ctx.GetPlace());
     int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
 
-    const int64_t* rois_data = rois->data<int64_t>();
+    const T* rois_data = rois->data<T>();
     for (int n = 0; n < rois_num; ++n) {
       int roi_batch_id = roi_batch_id_data[n];
       int roi_start_w = round(rois_data[0] * spatial_scale);
@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
       }
     }
 
-    const int64_t* rois_data = rois->data<int64_t>();
+    const T* rois_data = rois->data<T>();
     const T* out_grad_data = out_grad->data<T>();
     const int64_t* argmax_data = argmax->data<int64_t>();
     T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 5757b2798e43dc70b406462a74b4f74eedcf56fa..1bc1dbbecaccd328d84cd3364a50c8f828d823c0 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -145,26 +145,23 @@ def rpn_target_assign(loc,
     """
     helper = LayerHelper('rpn_target_assign', **locals())
 
-    # 1. Compute the regression target bboxes
-    target_bbox = box_coder(
-        prior_box=anchor_box,
-        prior_box_var=anchor_var,
-        target_box=gt_box,
-        code_type='encode_center_size',
-        box_normalized=False)
-    # 2. Compute overlaps between the prior boxes and the gt boxes overlaps
+    # Compute overlaps between the prior boxes and the gt boxes overlaps
     iou = iou_similarity(x=gt_box, y=anchor_box)
-    # 3. Assign target label to anchors
-    loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
-    score_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
-    target_label = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    # Assign target label to anchors
+    loc_index = helper.create_tmp_variable(dtype='int32')
+    score_index = helper.create_tmp_variable(dtype='int32')
+    target_label = helper.create_tmp_variable(dtype='int64')
+    target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
     helper.append_op(
         type="rpn_target_assign",
-        inputs={'DistMat': iou},
+        inputs={'Anchor': anchor_box,
+                'GtBox': gt_box,
+                'DistMat': iou},
         outputs={
             'LocationIndex': loc_index,
             'ScoreIndex': score_index,
-            'TargetLabel': target_label
+            'TargetLabel': target_label,
+            'TargetBBox': target_bbox,
         },
         attrs={
             'rpn_batch_size_per_im': rpn_batch_size_per_im,
@@ -173,16 +170,16 @@ def rpn_target_assign(loc,
             'fg_fraction': fg_fraction
         })
 
-    # 4. Reshape and gather the target entry
-    scores = nn.reshape(x=scores, shape=(-1, 2))
-    loc = nn.reshape(x=loc, shape=(-1, 4))
-    target_label = nn.reshape(x=target_label, shape=(-1, 1))
-    target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
+    loc_index.stop_gradient = True
+    score_index.stop_gradient = True
+    target_label.stop_gradient = True
+    target_bbox.stop_gradient = True
+    scores = nn.reshape(x=scores, shape=(-1, 1))
+    loc = nn.reshape(x=loc, shape=(-1, 4))
     predicted_scores = nn.gather(scores, score_index)
     predicted_location = nn.gather(loc, loc_index)
-    target_label = nn.gather(target_label, score_index)
-    target_bbox = nn.gather(target_bbox, loc_index)
+
     return predicted_scores, predicted_location, target_label, target_bbox
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index ec0bf3ff8d64345111537780aaa5367ed0e1f8ff..e2564763d19d180f7c6933429dddf58c77be7bb8 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase):
             gt_box = layers.data(
                 name='gt_box', shape=[4], lod_level=1, dtype='float32')
 
-            predicted_scores, predicted_location, target_label, target_bbox = layers.rpn_target_assign(
+            pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
                 loc=loc,
                 scores=scores,
                 anchor_box=anchor_box,
@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase):
                 rpn_positive_overlap=0.7,
                 rpn_negative_overlap=0.3)
 
-            self.assertIsNotNone(predicted_scores)
-            self.assertIsNotNone(predicted_location)
-            self.assertIsNotNone(target_label)
-            self.assertIsNotNone(target_bbox)
-            assert predicted_scores.shape[1] == 2
-            assert predicted_location.shape[1] == 4
-            assert predicted_location.shape[1] == target_bbox.shape[1]
-
-            print(str(program))
+            self.assertIsNotNone(pred_scores)
+            self.assertIsNotNone(pred_loc)
+            self.assertIsNotNone(tgt_lbl)
+            self.assertIsNotNone(tgt_bbox)
+            assert pred_scores.shape[1] == 1
+            assert pred_loc.shape[1] == 4
+            assert pred_loc.shape[1] == tgt_bbox.shape[1]
 
 
 class TestGenerateProposals(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
index 764f83b534c8a183dbf21511f0b05741c13c9528..36ebc8fb6ea9efdcd1807f5c8917ab1428b3381e 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
@@ -37,7 +37,7 @@ def fusion_gru(
         h0,
         wh,
         np.zeros(
-            (1, wh.shape[1]), dtype='float64'),
+            (1, wh.shape[1]), dtype='float32'),
         is_reverse,
         act_state,
         act_gate)
@@ -62,15 +62,15 @@ class TestFusionGRUOp(OpTest):
         T = sum(self.lod[0])
         N = len(self.lod[0])
-        x = np.random.rand(T, self.M).astype('float64')
-        wx = np.random.rand(self.M, 3 * self.D).astype('float64')
-        wh = np.random.rand(self.D, 3 * self.D).astype('float64')
+        x = np.random.rand(T, self.M).astype('float32')
+        wx = np.random.rand(self.M, 3 * self.D).astype('float32')
+        wh = np.random.rand(self.D, 3 * self.D).astype('float32')
         bias = np.random.rand(
-            1, 3 * self.D).astype('float64') if self.with_bias else np.zeros(
-                (1, 3 * self.D), dtype='float64')
+            1, 3 * self.D).astype('float32') if self.with_bias else np.zeros(
+                (1, 3 * self.D), dtype='float32')
         h0 = np.random.rand(
-            N, self.D).astype('float64') if self.with_h0 else np.zeros(
-                (N, self.D), dtype='float64')
+            N, self.D).astype('float32') if self.with_h0 else np.zeros(
+                (N, self.D), dtype='float32')
 
         _, _, _, hidden = fusion_gru(
             x, self.lod, h0, wx, wh, bias, self.is_reverse,
@@ -93,7 +93,9 @@ class TestFusionGRUOp(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output(atol=1e-8)
+        for use_seq in {True, False}:
+            self.attrs['use_seq'] = use_seq
+            self.check_output()
 
 
 class TestFusionGRUOpNoInitial(TestFusionGRUOp):
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
index 5805bdf461998e90611dec05b079cd55feda520d..1f1eb37667e304351a6a85edde09e7da32cf1630 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
@@ -114,7 +114,9 @@ class TestFusionLSTMOp(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output()
+        for use_seq in {True, False}:
+            self.attrs['use_seq'] = use_seq
+            self.check_output()
 
 
 class TestFusionLSTMOpInit(TestFusionLSTMOp):
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
index ce766fffbce98a6a2cee4c508d6db85ee0163401..6dc101b6dad8813893c6a891da0e16f952bb4c2d 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights):
 
     dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
     dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
-    dw = (np.log(gt_w / ex_w)) / ex_w / weights[2]
-    dh = (np.log(gt_h / ex_h)) / ex_h / weights[3]
+    dw = (np.log(gt_w / ex_w)) / weights[2]
+    dh = (np.log(gt_h / ex_h)) / weights[3]
 
     targets = np.vstack([dx, dy, dw, dh]).transpose()
     return targets
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
index ed7f467835f32242a9650f226b4a5ad9d6d87af4..ad4cd2e803bfae4c3fbc04503331b9a786b25d17 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest):
 
         for i in range(self.rois_num):
             roi = self.rois[i]
-            roi_batch_id = roi[0]
+            roi_batch_id = int(roi[0])
             roi_start_w = int(cpt.round(roi[1] * self.spatial_scale))
             roi_start_h = int(cpt.round(roi[2] * self.spatial_scale))
             roi_end_w = int(cpt.round(roi[3] * self.spatial_scale))
@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest):
                 roi = [bno, x1, y1, x2, y2]
                 rois.append(roi)
         self.rois_num = len(rois)
-        self.rois = np.array(rois).astype("int64")
+        self.rois = np.array(rois).astype("float32")
 
     def setUp(self):
         self.op_type = "roi_pool"
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
index 08c462d9036cacab81dab7c9ea16664c9159479f..bd548009b3ada9512e4b5f7d7b61b67b0717a39b 100644
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -18,12 +18,17 @@
 import unittest
 import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
+from test_anchor_generator_op import anchor_generator_in_python
+from test_generate_proposal_labels import _generate_groundtruth
+from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta
 
 
-def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
-                      rpn_negative_overlap, fg_fraction):
-    iou = np.transpose(iou)
+def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
+                      rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
+    iou = np.transpose(gt_anchor_iou)
     anchor_to_gt_max = iou.max(axis=1)
+    anchor_to_gt_argmax = iou.argmax(axis=1)
+
     gt_to_anchor_argmax = iou.argmax(axis=0)
     gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
     anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
     num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
     bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
+    tgt_lbl[bg_inds] = 0
     if len(bg_inds) > num_bg:
         enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
         tgt_lbl[enable_inds] = 0
     bg_inds = np.where(tgt_lbl == 0)[0]
+    tgt_lbl[bg_inds] = 0
 
     loc_index = fg_inds
     score_index = np.hstack((fg_inds, bg_inds))
     tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
-    return loc_index, score_index, tgt_lbl
+
+    gt_inds = anchor_to_gt_argmax[fg_inds]
+
+    return loc_index, score_index, tgt_lbl, gt_inds
+
+
+def get_anchor(n, c, h, w):
+    input_feat = np.random.random((n, c, h, w)).astype('float32')
+    anchors, _ = anchor_generator_in_python(
+        input_feat=input_feat,
+        anchor_sizes=[32., 64.],
+        aspect_ratios=[0.5, 1.0],
+        variances=[1.0, 1.0, 1.0, 1.0],
+        stride=[16.0, 16.0],
+        offset=0.5)
+    return anchors
+
+
+def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
+             rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
+
+    loc_indexes = []
+    score_indexes = []
+    tmp_tgt_labels = []
+    tgt_bboxes = []
+    anchor_num = anchor.shape[0]
+
+    batch_size = len(lod) - 1
+    for i in range(batch_size):
+        b, e = lod[i], lod[i + 1]
+        iou_slice = iou[b:e, :]
+        bboxes_slice = gt_boxes[b:e, :]
+
+        loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
+            iou_slice, rpn_batch_size_per_im, rpn_positive_overlap,
+            rpn_negative_overlap, fg_fraction)
+
+        fg_bboxes = bboxes_slice[gt_inds]
+        fg_anchors = anchor[loc_idx]
+        box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])
+
+        if i == 0:
+            loc_indexes = loc_idx
+            score_indexes = score_idx
+            tmp_tgt_labels = tgt_lbl
+            tgt_bboxes = box_deltas
+        else:
+            loc_indexes = np.concatenate(
+                [loc_indexes, loc_idx + i * anchor_num])
+            score_indexes = np.concatenate(
+                [score_indexes, score_idx + i * anchor_num])
+            tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl])
+            tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
+
+    tgt_labels = tmp_tgt_labels[score_indexes]
+    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
 
 
 class TestRpnTargetAssignOp(OpTest):
     def setUp(self):
-        iou = np.random.random((10, 8)).astype("float32")
-        self.op_type = "rpn_target_assign"
-        self.inputs = {'DistMat': iou}
-        self.attrs = {
-            'rpn_batch_size_per_im': 256,
-            'rpn_positive_overlap': 0.95,
-            'rpn_negative_overlap': 0.3,
-            'fg_fraction': 0.25,
-            'fix_seed': True
-        }
-        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3,
-                                                            0.25)
-        self.outputs = {
-            'LocationIndex': loc_index,
-            'ScoreIndex': score_index,
-            'TargetLabel': tgt_lbl,
-        }
+        n, c, h, w = 2, 4, 14, 14
+        anchor = get_anchor(n, c, h, w)
+        gt_num = 10
+        anchor = anchor.reshape(-1, 4)
+        anchor_num = anchor.shape[0]
 
-    def test_check_output(self):
-        self.check_output()
+        im_shapes = [[64, 64], [64, 64]]
+        gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
+        bbox = np.vstack([v['boxes'] for v in gt_box])
+        iou = _bbox_overlaps(bbox, anchor)
+
+        anchor = anchor.astype('float32')
+        bbox = bbox.astype('float32')
+        iou = iou.astype('float32')
+
+        loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
+            anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)
 
-class TestRpnTargetAssignOp2(OpTest):
-    def setUp(self):
-        iou = np.random.random((10, 20)).astype("float32")
         self.op_type = "rpn_target_assign"
-        self.inputs = {'DistMat': iou}
+        self.inputs = {
+            'Anchor': anchor,
+            'GtBox': (bbox, [[4, 4]]),
+            'DistMat': (iou, [[4, 4]]),
+        }
         self.attrs = {
-            'rpn_batch_size_per_im': 128,
-            'rpn_positive_overlap': 0.5,
-            'rpn_negative_overlap': 0.5,
-            'fg_fraction': 0.5,
+            'rpn_batch_size_per_im': 25600,
+            'rpn_positive_overlap': 0.95,
+            'rpn_negative_overlap': 0.03,
+            'fg_fraction': 0.25,
             'fix_seed': True
         }
-        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5,
-                                                            0.5)
         self.outputs = {
-            'LocationIndex': loc_index,
-            'ScoreIndex': score_index,
-            'TargetLabel': tgt_lbl,
+            'LocationIndex': loc_index.astype('int32'),
+            'ScoreIndex': score_index.astype('int32'),
+            'TargetBBox': tgt_bbox.astype('float32'),
+            'TargetLabel': tgt_lbl.astype('int64'),
         }
 
     def test_check_output(self):