提交 12b483c0 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into refine/ut/lac

...@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh ...@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one. # and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \ tar -xz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr cp -rf /usr/local/TensorRT/lib /usr
......
...@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") ...@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework") set(module "framework")
if (NOT WIN32) if (NOT WIN32)
copy(framework_lib DEPS framework_py_proto set(framework_lib_deps framework_py_proto)
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h endif(NOT WIN32)
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} copy(framework_lib DEPS ${framework_lib_deps}
)
else()
copy(framework_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${src_dir}/${module}/ir/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
) )
endif(NOT WIN32)
set(module "memory") set(module "memory")
copy(memory_lib copy(memory_lib
...@@ -161,7 +158,8 @@ set(module "inference") ...@@ -161,7 +158,8 @@ set(module "inference")
copy(inference_lib DEPS ${inference_deps} copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
) )
set(module "platform") set(module "platform")
......
...@@ -60,6 +60,7 @@ ...@@ -60,6 +60,7 @@
图3. 编码器-解码器框架 图3. 编码器-解码器框架
</div> </div>
<a name="编码器"></a>
#### 编码器 #### 编码器
编码阶段分为三步: 编码阶段分为三步:
...@@ -81,7 +82,7 @@ ...@@ -81,7 +82,7 @@
机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: 机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是:
1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下: 1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下:
$$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$ $$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$
其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记`<s>`,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记`<s>`,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。
2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下: 2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下:
$$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
...@@ -93,6 +94,7 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ ...@@ -93,6 +94,7 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法) 机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)
<a name="柱搜索算法"></a>
### 柱搜索算法 ### 柱搜索算法
柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`<s>你好<e>`”,就算目标语言字典中只有3个词(`<s>`, `<e>`, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。 柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`<s>你好<e>`”,就算目标语言字典中只有3个词(`<s>`, `<e>`, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。
......
...@@ -149,6 +149,8 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): ...@@ -149,6 +149,8 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。 网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。
<a name="栈式双向LSTM"></a>
### 栈式双向LSTM ### 栈式双向LSTM
栈式双向神经网络`stacked_lstm_net`的代码片段如下: 栈式双向神经网络`stacked_lstm_net`的代码片段如下:
......
...@@ -50,7 +50,7 @@ similarity: -0.0997506977351 ...@@ -50,7 +50,7 @@ similarity: -0.0997506977351
``` ```
以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[应用模型](#应用模型)中详细描述用法。 以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[模型应用](#模型应用)中详细描述用法。
## 模型概览 ## 模型概览
...@@ -189,6 +189,7 @@ dream that one day <e> ...@@ -189,6 +189,7 @@ dream that one day <e>
最后,每个输入会按其单词在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。 最后,每个输入会按其单词在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。
<a name="训练模型"></a>
## 编程实现 ## 编程实现
本配置的模型结构如下图所示: 本配置的模型结构如下图所示:
...@@ -349,6 +350,7 @@ Step 20: Average Cost 5.766995 ...@@ -349,6 +350,7 @@ Step 20: Average Cost 5.766995
... ...
``` ```
<a name="模型应用"></a>
## 模型应用 ## 模型应用
在模型训练后,我们可以用它做一些预测。 在模型训练后,我们可以用它做一些预测。
......
...@@ -102,7 +102,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层 ...@@ -102,7 +102,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。 池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。
更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。 更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类]( https://github.com/PaddlePaddle/book/tree/develop/03.image_classification )教程。
### 常见激活函数介绍 ### 常见激活函数介绍
- sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $ - sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $
......
...@@ -149,7 +149,7 @@ python setup.py bdist_wheel ...@@ -149,7 +149,7 @@ python setup.py bdist_wheel
pip install --upgrade dist/visualdl-*.whl pip install --upgrade dist/visualdl-*.whl
``` ```
如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/how_to_dev_frontend_en.md) 如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md)
## SDK ## SDK
......
...@@ -4,13 +4,12 @@ Paddle 预测 API ...@@ -4,13 +4,12 @@ Paddle 预测 API
为了更简单方便的预测部署,Fluid 提供了一套高层 API 为了更简单方便的预测部署,Fluid 提供了一套高层 API
用来隐藏底层不同的优化实现。 用来隐藏底层不同的优化实现。
`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__ `预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`_
包括 包括
- 头文件 ``paddle_inference_api.h`` 定义了所有的接口 - 头文件 ``paddle_inference_api.h`` 定义了所有的接口
- 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a`` - 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
- 库文件 ``libpaddle_inference_api.so`` 或
``libpaddle_inference_api.a``
编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
...@@ -97,8 +96,7 @@ engine ...@@ -97,8 +96,7 @@ engine
CHECK(predictor->Run(slots, &outputs)); CHECK(predictor->Run(slots, &outputs));
// 获取 outputs ... // 获取 outputs ...
编译时,联编 ``libpaddle_fluid.a/.so`` 和 编译时,联编 ``libpaddle_fluid.a/.so`` 便可。
``libpaddle_inference_api.a/.so`` 便可。
详细代码参考 详细代码参考
------------ ------------
......
...@@ -312,7 +312,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw ...@@ -312,7 +312,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...@@ -376,7 +376,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l ...@@ -376,7 +376,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5)) paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0)) paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95)) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
......
...@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
ir::Graph &result = *graph; ir::Graph &result = *graph;
for (auto &node : nodes) { for (auto &node : nodes) {
if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) { if (node->IsVar() && node->Var()) {
all_vars_.emplace(node->Name(), node->Var()); all_vars_.emplace(node->Name(), node->Var());
} }
} }
...@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ...@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
} }
} }
// Reports whether `og` is a name held in grad_names_ that has not yet been
// recorded in `og_has_been_broadcast`. When the answer is true the name is
// recorded in that set, so a later call with the same set and name yields
// false. Names absent from grad_names_ never modify the set.
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
    const std::string &og,
    std::unordered_set<std::string> *og_has_been_broadcast) const {
  if (grad_names_.count(og) == 0) {
    return false;
  }
  // insert() is a no-op for an already-present key and reports, through
  // `.second`, whether the key was newly added.
  return og_has_been_broadcast->insert(og).second;
}
int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
ir::Node *node) const { ir::Node *node) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
...@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, ...@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var; return var;
} }
// Makes the current `op` depend on the ops in `result` named `prev_op_name`.
// NOTE(review): there is no `break` in the loop, so a control dependency is
// created for every op whose name matches, not only the first occurrence;
// confirm which behaviour is intended.
void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const {
for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
if (prev_op->Name() == prev_op_name) {
// A dummy variable produced by prev_op and consumed by op carries the
// dependency between the two ops.
auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
prev_op->AddOutput(dep_var);
// The new handle is also recorded in the graph's kGraphDepVars collection.
result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
op->AddInput(dep_var);
}
}
}
void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const { ir::Node *node) const {
int op_dev_id = -1; int op_dev_id = -1;
......
...@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
std::vector<std::string> FindDistTrainRecvVars( std::vector<std::string> FindDistTrainRecvVars(
const std::vector<ir::Node *> &nodes) const; const std::vector<ir::Node *> &nodes) const;
void ConnectOp(ir::Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node, void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const; size_t num_places) const;
...@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateComputationalOp(ir::Graph *result, ir::Node *node, void CreateComputationalOp(ir::Graph *result, ir::Node *node,
int dev_id) const; int dev_id) const;
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
......
# paddle_inference_pass.h is (re)created in the build tree every time CMake
# configures: a "generated" banner followed by the include of ir/pass.h.
# Further lines are appended to the same file later in this CMakeLists.
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(WRITE ${pass_file}
     "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n"
     "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
# pass_library(<TARGET> [SRCS <file>...] [DEPS <target>...])
#
# Declares the IR pass <TARGET> as a cc_library and registers it:
#   - compiles <TARGET>.cc, plus any extra files listed after SRCS;
#   - depends on graph_pattern_detector and pass, plus any targets listed
#     after DEPS;
#   - appends a `USE_PASS(<TARGET>);` line to ${pass_file};
#   - prepends <TARGET> to PASS_LIBRARY in the caller's scope.
#
# SRCS and DEPS used to be parsed and then silently dropped; they are now
# forwarded to cc_library. A call without keyword arguments behaves exactly as
# before.
function(pass_library TARGET)
    set(options "")
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(pass_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    # Keywords that were not supplied expand to nothing here.
    cc_library(${TARGET}
               SRCS ${TARGET}.cc ${pass_library_SRCS}
               DEPS graph_pattern_detector pass ${pass_library_DEPS})
    file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
    # PARENT_SCOPE is required: a plain set() would only change the function-local copy.
    set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
endfunction()
cc_library(node SRCS node.cc DEPS proto_desc) cc_library(node SRCS node.cc DEPS proto_desc)
cc_library(graph SRCS graph.cc DEPS node) cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector)
cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) pass_library(graph_to_program_pass)
cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass) pass_library(graph_viz_pass)
cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) pass_library(fc_fuse_pass)
cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector) pass_library(attention_lstm_fuse_pass)
pass_library(infer_clean_graph_pass)
pass_library(fc_lstm_fuse_pass)
pass_library(seq_concat_fc_fuse_pass)
# Export the pass targets accumulated in PASS_LIBRARY as an INTERNAL cache
# entry, so the list can be read from outside this directory's variable scope.
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
...@@ -99,17 +99,13 @@ void FindWhileOp(Graph* graph) { ...@@ -99,17 +99,13 @@ void FindWhileOp(Graph* graph) {
auto* cell_init = graph->RetriveNode(6); auto* cell_init = graph->RetriveNode(6);
auto* hidden_init = graph->RetriveNode(8); auto* hidden_init = graph->RetriveNode(8);
#define LINK_TO(node0, node1) \
node0->outputs.push_back(node1); \
node1->inputs.push_back(node0);
auto* lstm_op = graph->CreateOpNode(&op_desc); auto* lstm_op = graph->CreateOpNode(&op_desc);
PrepareParameters(graph, param); PrepareParameters(graph, param);
LINK_TO(X, lstm_op); IR_NODE_LINK_TO(X, lstm_op);
LINK_TO(cell_init, lstm_op); IR_NODE_LINK_TO(cell_init, lstm_op);
LINK_TO(hidden_init, lstm_op); IR_NODE_LINK_TO(hidden_init, lstm_op);
LINK_TO(lstm_op, LSTMOUT); IR_NODE_LINK_TO(lstm_op, LSTMOUT);
GraphSafeRemoveNodes(graph, marked_nodes); GraphSafeRemoveNodes(graph, marked_nodes);
} }
......
...@@ -21,74 +21,26 @@ namespace paddle { ...@@ -21,74 +21,26 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
// Returns true when at least one entry of `node->outputs` is an operator node
// whose op type equals `op_type`; returns false otherwise (including when the
// node has no outputs).
bool VarOutLinksToOp(Node* node, const std::string& op_type) {
  for (auto it = node->outputs.begin(); it != node->outputs.end(); ++it) {
    Node* candidate = *it;
    if (!candidate->IsOp()) {
      continue;
    }
    if (candidate->Op()->Type() == op_type) {
      return true;
    }
  }
  return false;
}
// Fills `pattern` with the subgraph to be matched: a "mul" op whose only
// output feeds an "elementwise_add" op, together with the variables around
// those two ops. The node names registered here are the keys used to look the
// nodes up again later.
void BuildFCPattern(PDPattern* pattern) {
// Operator nodes: one "mul" and one "elementwise_add".
auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul");
auto* elementwise_add_op =
pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add");
// Variable nodes.
// w: asserted to be input "Y" (index 0) of a "mul" op.
auto* mul_weight_var = pattern->NewNode("mul_weight")
->AsInput()
->assert_is_op_nth_input("mul", "Y", 0);
// x: asserted to be input "X" (index 0) of a "mul" op.
auto* mul_tmp_var = pattern->NewNode("mul_tmp_var")
->AsInput()
->assert_is_op_nth_input("mul", "X", 0);
// Intermediate variable between the two ops; it will be removed from the IR
// after the fuse.
auto* mul_out_var = pattern->NewNode("mul_out")
->AsIntermediate()
->assert_is_only_output_of_op("mul")
->assert_is_op_input("elementwise_add");
// bias: the other input of the "elementwise_add" op.
auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar")
->assert_is_op_input("elementwise_add")
->AsInput();
// Output of the "elementwise_add" op, i.e. the output of the whole pattern.
auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out")
->AsOutput()
->assert_is_op_output("elementwise_add");
// Edges: {w, x} -> mul -> mul_out, then {mul_out, bias} -> elementwise_add -> out.
mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var});
elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var})
.LinksTo({elementwise_add_out_var});
}
// Substitutes `to` for the first entry of `*links` that equals `from`.
// Returns true if an entry was substituted, false if `from` was not present.
// Only the first match is touched; later duplicates are left as they are.
bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) {
  for (auto it = links->begin(); it != links->end(); ++it) {
    if (*it != from) {
      continue;
    }
    *it = to;
    return true;
  }
  return false;
}
std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get()); PADDLE_ENFORCE(graph.get());
FusePassBase::Init("fc", graph.get()); FusePassBase::Init("fc_fuse", graph.get());
std::unordered_set<Node*> nodes2delete; std::unordered_set<Node*> nodes2delete;
GraphPatternDetector gpd; GraphPatternDetector gpd;
BuildFCPattern(gpd.mutable_pattern()); // BuildFCPattern(gpd.mutable_pattern());
auto* x = gpd.mutable_pattern()
#define GET_NODE(id) \ ->NewNode("fc_fuse/x")
PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \ ->AsInput()
"pattern has no Node called %s", #id); \ ->assert_is_op_input("mul", "X");
auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id)); \ patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/);
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
#define GET_NODE(id) \
PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);
int found_fc_count = 0; int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
...@@ -98,43 +50,33 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( ...@@ -98,43 +50,33 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
// scenerio. // scenerio.
// FC's fusion is simple, just op fuse, no need to process the // FC's fusion is simple, just op fuse, no need to process the
// parameters. // parameters.
GET_NODE(mul_tmp_var); // x GET_NODE(x); // x
GET_NODE(mul_weight); // Y GET_NODE(w); // Y
GET_NODE(elementwise_add_tmpvar); // bias GET_NODE(fc_bias); // bias
GET_NODE(elementwise_add_out); // Out GET_NODE(fc_out); // Out
GET_NODE(mul); // MUL op GET_NODE(mul); // MUL op
GET_NODE(elementwise_add); // ELEMENT_ADD op GET_NODE(elementwise_add); // ELEMENT_ADD op
GET_NODE(mul_out); // tmp GET_NODE(mul_out); // tmp
#undef GET_NODE #undef GET_NODE
// Create an FC Node. // Create an FC Node.
OpDesc desc; OpDesc desc;
std::string fc_x_in = mul_tmp_var->Name(); std::string fc_x_in = x->Name();
std::string fc_Y_in = mul_weight->Name(); std::string fc_Y_in = w->Name();
std::string fc_bias_in = elementwise_add_tmpvar->Name(); std::string fc_bias_in = fc_bias->Name();
std::string fc_out = elementwise_add_out->Name(); std::string fc_out_out = fc_out->Name();
desc.SetInput("Input", std::vector<std::string>({fc_x_in})); desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
desc.SetInput("W", std::vector<std::string>({fc_Y_in})); desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
desc.SetInput("Bias", std::vector<std::string>({fc_bias_in})); desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
desc.SetOutput("Out", std::vector<std::string>({fc_out})); desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
desc.SetType("fc"); desc.SetType("fc");
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
fc_node->inputs = GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
fc_node->outputs.push_back(elementwise_add_out);
// Update link relatons
PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
elementwise_add, fc_node));
PADDLE_ENFORCE(
LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
// Drop old nodes IR_NODE_LINK_TO(x, fc_node);
graph->RemoveNode(mul); IR_NODE_LINK_TO(w, fc_node);
graph->RemoveNode(elementwise_add); IR_NODE_LINK_TO(fc_bias, fc_node);
graph->RemoveNode(mul_out); // tmp variable IR_NODE_LINK_TO(fc_node, fc_out);
found_fc_count++; found_fc_count++;
}; };
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
...@@ -87,15 +86,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -87,15 +86,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
} }
op_desc.SetInput("Bias", {new_bias_var}); op_desc.SetInput("Bias", {new_bias_var});
} }
#undef GET_NODE #undef GET_NODE
// Create temp variables.
scope->Var(name_scope + "/BatchedInput.new")
->GetMutable<framework::LoDTensor>();
scope->Var(name_scope + "/BatchCellPreAct.new")
->GetMutable<framework::LoDTensor>();
scope->Var(name_scope + "/BatchedGate.new")
->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {}); op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {}); op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()}); op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetOutput("Cell", {cell_n->Name()}); op_desc.SetOutput("Cell", {cell_n->Name()});
op_desc.SetOutput("XX", {xx_n->Name()}); op_desc.SetOutput("XX", {xx_n->Name()});
op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"}); op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"});
op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"});
op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse")); op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes")); op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr // TODO(TJ): get from attr
...@@ -121,22 +129,18 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -121,22 +129,18 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
#undef TMP_NEW #undef TMP_NEW
#undef TMP_NAME #undef TMP_NAME
#define LINK_TO(a, b) \ IR_NODE_LINK_TO(input_n, op);
a->outputs.push_back(b); \ IR_NODE_LINK_TO(weight_x_n, op);
b->inputs.push_back(a); IR_NODE_LINK_TO(weight_h_n, op);
LINK_TO(input_n, op); IR_NODE_LINK_TO(bias_n, op);
LINK_TO(weight_x_n, op); IR_NODE_LINK_TO(op, hidden_n);
LINK_TO(weight_h_n, op);
LINK_TO(bias_n, op);
LINK_TO(op, hidden_n);
#undef LINK_TO
return op; return op;
}; };
int fusion_count{0}; int fusion_count{0};
auto fc_no_bias_handler = [&]( auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { Graph* g) {
#define GET_NODE(name__) \ #define GET_NODE(name__) \
std::string name__##key = name_scope + "/" + #name__; \ std::string name__##key = name_scope + "/" + #name__; \
auto* name__##n = pattern->RetrieveNode(name__##key); \ auto* name__##n = pattern->RetrieveNode(name__##key); \
...@@ -157,21 +161,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -157,21 +161,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
if (with_fc_bias) { if (with_fc_bias) {
GET_NODE(fc_bias); GET_NODE(fc_bias);
GET_NODE(elementwise_add);
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias); lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul_n, lstm_n, elementwise_add_n});
GraphSafeRemoveNodes(graph, marked_nodes);
} else { } else {
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1); lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
GraphSafeRemoveNodes(graph, marked_nodes);
} }
#undef GET_NODE #undef GET_NODE
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
GraphSafeRemoveNodes(graph, marked_nodes);
++fusion_count; ++fusion_count;
}; };
gpd(graph, fc_no_bias_handler); gpd(graph, handler);
return fusion_count; return fusion_count;
} }
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
......
...@@ -73,7 +73,6 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) { ...@@ -73,7 +73,6 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
void GraphPatternDetector::operator()(Graph* graph, void GraphPatternDetector::operator()(Graph* graph,
GraphPatternDetector::handle_t handler) { GraphPatternDetector::handle_t handler) {
if (!MarkPDNodesInGraph(*graph)) { if (!MarkPDNodesInGraph(*graph)) {
LOG(INFO) << "Mark failed";
return; return;
} }
...@@ -111,6 +110,11 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { ...@@ -111,6 +110,11 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
return false; return false;
} }
} }
for (auto& item : pdnodes2nodes_) {
for (auto& n : item.second) {
GetMarkedNodes(const_cast<Graph*>(&graph)).insert(n);
}
}
VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
return !pdnodes2nodes_.empty(); return !pdnodes2nodes_.empty();
...@@ -278,7 +282,7 @@ void GraphPatternDetector::RemoveOverlappedMatch( ...@@ -278,7 +282,7 @@ void GraphPatternDetector::RemoveOverlappedMatch(
for (const auto& subgraph : *subgraphs) { for (const auto& subgraph : *subgraphs) {
bool valid = true; bool valid = true;
for (auto& item : subgraph) { for (auto& item : subgraph) {
if (node_set.count(item.second)) { if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false; valid = false;
break; break;
} }
...@@ -334,22 +338,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) { ...@@ -334,22 +338,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
} }
PDNode* PDNode::assert_is_op() { PDNode* PDNode::assert_is_op() {
asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); }); asserts_.emplace_back([](Node* x) { return x && x->IsOp(); });
return this; return this;
} }
PDNode* PDNode::assert_is_op(const std::string& op_type) { PDNode* PDNode::assert_is_op(const std::string& op_type) {
asserts_.emplace_back([this, op_type](Node* x) { asserts_.emplace_back([op_type](Node* x) {
return x && x->IsOp() && x->Op()->Type() == op_type; return x && x->IsOp() && x->Op()->Type() == op_type;
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_var() { PDNode* PDNode::assert_is_var() {
asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); }); asserts_.emplace_back([](Node* x) { return x && x->IsVar(); });
return this; return this;
} }
PDNode* PDNode::assert_var_not_persistable() { PDNode* PDNode::assert_var_not_persistable() {
assert_is_var(); assert_is_var();
asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); }); asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); });
return this; return this;
} }
PDNode* PDNode::assert_is_persistable_var() { PDNode* PDNode::assert_is_persistable_var() {
...@@ -491,14 +495,16 @@ void GraphSafeRemoveNodes(Graph* graph, ...@@ -491,14 +495,16 @@ void GraphSafeRemoveNodes(Graph* graph,
for (auto it = node->inputs.begin(); it != node->inputs.end();) { for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (nodes.count(*it)) { if (nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it); it = const_cast<Node*>(node)->inputs.erase(it);
} else } else {
it++; it++;
}
} }
for (auto it = node->outputs.begin(); it != node->outputs.end();) { for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (nodes.count(*it)) { if (nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it); it = const_cast<Node*>(node)->outputs.erase(it);
} else } else {
it++; it++;
}
} }
} }
} }
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
#endif #endif
#include <numeric> #include <numeric>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
...@@ -245,6 +248,8 @@ class GraphPatternDetector { ...@@ -245,6 +248,8 @@ class GraphPatternDetector {
void UniquePatterns(std::vector<subgraph_t>* subgraphs); void UniquePatterns(std::vector<subgraph_t>* subgraphs);
// Remove overlapped match subgraphs, when overlapped, keep the previous one. // Remove overlapped match subgraphs, when overlapped, keep the previous one.
// The intermediate PDNodes will be removed, so they can't be shared by
// multiple patterns.
void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs); void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
// Validate whether the intermediate nodes are linked by external nodes. // Validate whether the intermediate nodes are linked by external nodes.
...@@ -295,6 +300,10 @@ PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x); ...@@ -295,6 +300,10 @@ PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
} // namespace patterns } // namespace patterns
#define IR_NODE_LINK_TO(a, b) \
a->outputs.push_back(b); \
b->inputs.push_back(a);
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) { ...@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3"); return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
}, },
"OP0"); "OP0");
auto* any_var = x.mutable_pattern()->NewNode( auto* any_var = x.mutable_pattern()
[](Node* node) { return node->IsVar(); }, "VAR"); ->NewNode([](Node* node) { return node->IsVar(); }, "VAR")
->AsIntermediate();
auto* any_op1 = x.mutable_pattern()->NewNode( auto* any_op1 = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsOp(); }, "OP1"); [](Node* node) { return node->IsOp(); }, "OP1");
......
...@@ -13,42 +13,41 @@ ...@@ -13,42 +13,41 @@
// limitations under the License. // limitations under the License.
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
class InferCleanGraphPass : public Pass { class InferCleanGraphPass : public FusePassBase {
public: public:
virtual ~InferCleanGraphPass() {} virtual ~InferCleanGraphPass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init("original_graph", graph.get());
PADDLE_ENFORCE(graph.get()); PADDLE_ENFORCE(graph.get());
auto is_valid_node = [](Node* x) { auto is_valid_node = [](Node* x) {
return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
}; };
std::unordered_set<Node*> invalid_nodes; std::unordered_set<const Node*> invalid_nodes;
int valid_op = 0;
for (auto* node : graph->Nodes()) { for (auto* node : graph->Nodes()) {
if (is_valid_node(node)) { if (is_valid_node(node)) {
invalid_nodes.insert(node); invalid_nodes.insert(node);
} else if (node->IsOp()) {
// Collect all the operators to help tracking number of operators.
++valid_op;
} }
} }
// remove nodes from the graph. GraphSafeRemoveNodes(graph.get(), invalid_nodes);
for (auto* node : invalid_nodes) {
graph->RemoveNode(node);
}
// clean edges. AddStatis(valid_op);
for (auto* node : graph->Nodes()) {
CleanEdges(&node->inputs, invalid_nodes);
CleanEdges(&node->outputs, invalid_nodes);
}
return graph; return graph;
} }
......
...@@ -219,16 +219,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl( ...@@ -219,16 +219,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
op_desc.SetAttr("fc_activation", act->Op()->Type()); op_desc.SetAttr("fc_activation", act->Op()->Type());
auto* op_node = graph->CreateOpNode(&op_desc); auto* op_node = graph->CreateOpNode(&op_desc);
// Add links // Add links
#define NODE_LINKS(a, b) \ IR_NODE_LINK_TO(fc_w, op_node);
a->outputs.push_back(b); \ IR_NODE_LINK_TO(fc_bias, op_node);
b->inputs.push_back(a); IR_NODE_LINK_TO(concat_in0, op_node);
NODE_LINKS(fc_w, op_node); IR_NODE_LINK_TO(sequence_expand0_in, op_node);
NODE_LINKS(fc_bias, op_node); IR_NODE_LINK_TO(sequence_expand1_in, op_node);
NODE_LINKS(concat_in0, op_node); IR_NODE_LINK_TO(op_node, fc_out);
NODE_LINKS(sequence_expand0_in, op_node);
NODE_LINKS(sequence_expand1_in, op_node);
NODE_LINKS(op_node, fc_out);
// Clean nodes. // Clean nodes.
std::unordered_set<const Node*> marked_nodes; std::unordered_set<const Node*> marked_nodes;
...@@ -241,7 +238,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl( ...@@ -241,7 +238,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
marked_nodes.erase(sequence_expand0_in); marked_nodes.erase(sequence_expand0_in);
marked_nodes.erase(sequence_expand1_in); marked_nodes.erase(sequence_expand1_in);
marked_nodes.erase(fc_out); marked_nodes.erase(fc_out);
GraphSafeRemoveNodes(graph, marked_nodes); GraphSafeRemoveNodes(graph, marked_nodes);
}); });
......
...@@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) ...@@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
...@@ -22,7 +22,7 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) ...@@ -22,7 +22,7 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
#endif() #endif()
# Create static library # Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api) cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
if(NOT APPLE) if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
...@@ -32,6 +32,7 @@ endif() ...@@ -32,6 +32,7 @@ endif()
# Create shared library # Create shared library
cc_library(paddle_fluid_shared SHARED cc_library(paddle_fluid_shared SHARED
SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
DEPS ${fluid_modules} paddle_fluid_api) DEPS ${fluid_modules} paddle_fluid_api)
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
......
...@@ -33,7 +33,7 @@ function (inference_analysis_test TARGET) ...@@ -33,7 +33,7 @@ function (inference_analysis_test TARGET)
endif() endif()
cc_test(${TARGET} cc_test(${TARGET}
SRCS "${analysis_test_SRCS}" SRCS "${analysis_test_SRCS}"
DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS} DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING) endif(WITH_TESTING)
...@@ -56,25 +56,13 @@ if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING) ...@@ -56,25 +56,13 @@ if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
endif() endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
analysis_predictor
# ir
fc_fuse_pass
fc_lstm_fuse_pass
seq_concat_fc_fuse_pass
graph_viz_pass
infer_clean_graph_pass
graph_pattern_detector
infer_clean_graph_pass
attention_lstm_fuse_pass
paddle_inference_api
pass
ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
--infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid) inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
...@@ -86,7 +74,7 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) ...@@ -86,7 +74,7 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE) set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING) if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz") inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
endif() endif()
...@@ -99,7 +87,7 @@ inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc ...@@ -99,7 +87,7 @@ inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE) set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING) if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz") inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz") inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
endif() endif()
...@@ -120,3 +108,15 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc ...@@ -120,3 +108,15 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
pass pass
ARGS --infer_model=${LAC_INSTALL_DIR}/model ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt) --infer_data=${LAC_INSTALL_DIR}/data.txt)
# Text classification (Senta) demo: archive URL and the directory the model is
# unpacked into.
set(TEXT_CLASSIFICATION_MODEL_URL
    "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
set(TEXT_CLASSIFICATION_INSTALL_DIR
    "${THIRD_PARTY_PATH}/inference_demo/text_classification"
    CACHE PATH "Text Classification model and data root." FORCE)

# Download the archive only when the install directory does not exist yet and
# both WITH_TESTING and WITH_INFERENCE are enabled.
if(NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
  inference_download_and_uncompress(
    ${TEXT_CLASSIFICATION_INSTALL_DIR}
    ${TEXT_CLASSIFICATION_MODEL_URL}
    "text-classification-Senta.tar.gz")
endif()

# Register the test and hand it the unpacked model directory via --infer_model.
inference_analysis_test(test_text_classification
  SRCS test_text_classification.cc
  EXTRA_DEPS
    paddle_inference_api
    paddle_fluid_api
    analysis_predictor
  ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta)
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
...@@ -41,20 +42,16 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -41,20 +42,16 @@ class DfgPassManagerImpl final : public DfgPassManager {
public: public:
DfgPassManagerImpl() { DfgPassManagerImpl() {
// TODO(Superjomn) set the key with pass reprs. // TODO(Superjomn) set the key with pass reprs.
LOG(INFO) if (!FLAGS_IA_enable_ir) {
<< "-----------------------------------------------------------------";
if (FLAGS_IA_enable_ir) {
AddPass("fluid-to-ir-pass", new FluidToIrPass);
} else {
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
} else {
AddPass("fluid-to-ir-pass", new FluidToIrPass);
} }
TryAddTensorRtPass(); TryAddTensorRtPass();
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
if (!FLAGS_IA_output_storage_path.empty()) { if (!FLAGS_IA_output_storage_path.empty()) {
AddPass("model-store-pass", new ModelStorePass); AddPass("model-store-pass", new ModelStorePass);
} }
LOG(INFO)
<< "-----------------------------------------------------------------";
} }
std::string repr() const override { return "dfg-pass-manager"; } std::string repr() const override { return "dfg-pass-manager"; }
...@@ -101,19 +98,16 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -101,19 +98,16 @@ class DfgPassManagerImpl final : public DfgPassManager {
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) { void Analyzer::Run(Argument* argument) {
std::vector<std::string> passes;
for (auto& pass : all_ir_passes_) {
if (!disabled_ir_passes_.count(pass)) {
passes.push_back(pass);
passes.push_back("graph_viz_pass"); // add graphviz for debug.
}
}
passes.push_back("graph_viz_pass");
// Ugly support fluid-to-ir-pass // Ugly support fluid-to-ir-pass
argument->Set(kFluidToIrPassesAttr, argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
new std::vector<std::string>({
// Manual update the passes here.
"graph_viz_pass", //
"infer_clean_graph_pass", "graph_viz_pass", //
"attention_lstm_fuse_pass", "graph_viz_pass", //
"fc_lstm_fuse_pass", "graph_viz_pass", //
"mul_lstm_fuse_pass", "graph_viz_pass", //
"seq_concat_fc_fuse_pass", "graph_viz_pass", //
"fc_fuse_pass", "graph_viz_pass" //
}));
for (auto& x : data_) { for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument)); PADDLE_ENFORCE(x->Initialize(argument));
...@@ -122,6 +116,11 @@ void Analyzer::Run(Argument* argument) { ...@@ -122,6 +116,11 @@ void Analyzer::Run(Argument* argument) {
} }
} }
// Adds every name in `passes` to the set of disabled IR passes and returns
// this analyzer so that calls can be chained.
Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
  for (const auto& pass_name : passes) {
    disabled_ir_passes_.insert(pass_name);
  }
  return *this;
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -36,16 +36,10 @@ limitations under the License. */ ...@@ -36,16 +36,10 @@ limitations under the License. */
*/ */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/pass_manager.h"
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
...@@ -57,7 +51,26 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -57,7 +51,26 @@ class Analyzer : public OrderedRegistry<PassManager> {
void Run(Argument* argument); void Run(Argument* argument);
Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
DISABLE_COPY_AND_ASSIGN(Analyzer); DISABLE_COPY_AND_ASSIGN(Analyzer);
private:
// All available IR passes.
// The bigger fuse comes first, so that the small operators prefer to be
// merged in a larger fuse op. The small fusion will not break the pattern of
// larger fusion.
const std::vector<std::string> all_ir_passes_{{
// Manual update the passes here.
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
}};
std::unordered_set<std::string> disabled_ir_passes_;
}; };
} // namespace analysis } // namespace analysis
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -270,17 +271,22 @@ void TestDituRNNPrediction(const std::string &model_path, ...@@ -270,17 +271,22 @@ void TestDituRNNPrediction(const std::string &model_path,
const std::string &data_path, int batch_size, const std::string &data_path, int batch_size,
bool use_analysis, bool activate_ir, bool use_analysis, bool activate_ir,
int num_times = 1) { int num_times = 1) {
NativeConfig config; AnalysisConfig config;
config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__"; config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
config.param_file = FLAGS_infer_ditu_rnn_model + "/param"; config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
config.use_gpu = false; config.use_gpu = false;
config.device = 0; config.device = 0;
config.specify_input_name = true; config.specify_input_name = true;
config.enable_ir_optim = activate_ir;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
config.ir_passes.clear(); // Do not exclude any pass.
auto base_predictor = auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor = auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots; std::vector<PaddleTensor> input_slots;
DataRecord data(data_path, batch_size); DataRecord data(data_path, batch_size);
// Prepare inputs. // Prepare inputs.
...@@ -327,9 +333,20 @@ void TestDituRNNPrediction(const std::string &model_path, ...@@ -327,9 +333,20 @@ void TestDituRNNPrediction(const std::string &model_path,
LOG(INFO) << "fused " << item.first << " " << item.second; LOG(INFO) << "fused " << item.first << " " << item.second;
} }
ASSERT_TRUE(fuse_statis.count("fc")); int num_ops = 0;
EXPECT_EQ(fuse_statis.at("fc"), 1); for (auto &node :
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 1); analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
EXPECT_EQ(num_ops,
13); // After graph optimization, only 13 operators exists.
} }
} }
...@@ -357,10 +374,3 @@ TEST(Analyzer, DituRNN_with_analysis_with_IR) { ...@@ -357,10 +374,3 @@ TEST(Analyzer, DituRNN_with_analysis_with_IR) {
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass.h"
...@@ -85,9 +86,11 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -85,9 +86,11 @@ class FluidToIrPass final : public DataFlowGraphPass {
new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr))); new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr)));
} }
const auto &ir_passes_to_apply = if (FLAGS_IA_enable_ir) {
argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr); const auto &ir_passes_to_apply =
ir_passes.Apply(ir_passes_to_apply); argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
ir_passes.Apply(ir_passes_to_apply);
}
PADDLE_ENFORCE(argument_->main_dfg.get()); PADDLE_ENFORCE(argument_->main_dfg.get());
argument_->main_dfg->Build(ir_passes.graph()); argument_->main_dfg->Build(ir_passes.graph());
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) { ...@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) {
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_fuse_pass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
// Command-line flags (gflags) for this test binary.
// In the code shown here: `infer_model` becomes the predictor's model_dir,
// `repeat` is the number of timed Run() calls, and `batch_size` is passed to
// Main(), which only reports it in the profile log. `infer_data` is declared
// but not read anywhere in this file.
DEFINE_string(infer_model, "", "Directory of the inference model.");
DEFINE_string(infer_data, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "How many times to repeat run.");
namespace paddle {
// Formats a vector for logging: each element is streamed with operator<< and
// followed by one space, so {1, 2} becomes "1 2 " (trailing space included)
// and an empty vector becomes "".
template <typename T>
std::string to_string(const std::vector<T> &vec) {
  std::stringstream joined;
  for (auto it = vec.begin(); it != vec.end(); ++it) {
    joined << *it << " ";
  }
  return joined.str();
}
// Logs a three-line profiling banner at INFO level.
//   latency: accumulated latency over all runs (printed with an "ms" suffix
//            after being divided by `repeat`).
//   bs:      batch size, reported verbatim.
//   repeat:  number of runs the latency was accumulated over.
void PrintTime(const double latency, const int bs, const int repeat) {
  const double avg_latency = latency / repeat;
  LOG(INFO) << "===========profile result===========";
  LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
            << ", avg latency: " << avg_latency << "ms";
  LOG(INFO) << "=====================================";
}
// Builds an analysis predictor for the model in --infer_model, runs it
// --repeat times on one fixed three-element int64 sequence, and logs the
// average latency plus a short summary of every output tensor.
//
// batch_size is only reported in the profile log; the input is always the
// single hard-coded sequence below.
void Main(int batch_size) {
  // One sequence input slot.
  std::vector<PaddleTensor> input_slots(1);
  // one batch starts
  // data --
  // data0 lives on the stack for the whole function, so it outlives every
  // Run() call that reads it through input.data.
  int64_t data0[] = {0, 1, 2};
  for (auto &input : input_slots) {
    input.data.Reset(data0, sizeof(data0));
    // shape --
    input.shape = std::vector<int>({3, 1});
    // dtype --
    input.dtype = PaddleDType::INT64;
    // LoD --
    input.lod = std::vector<std::vector<size_t>>({{0, 3}});
  }

  // Create Predictor --
  AnalysisConfig config;
  config.model_dir = FLAGS_infer_model;
  config.use_gpu = false;
  config.enable_ir_optim = true;
  // NOTE(review): with AnalysisConfig's default kExclude mode, `ir_passes`
  // is the list of passes to disable, so this presumably turns
  // fc_lstm_fuse_pass off rather than on — confirm this is intended.
  config.ir_passes.push_back("fc_lstm_fuse_pass");
  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

  // Time only the Run() calls; `sum` accumulates timer.toc() over all repeats.
  inference::Timer timer;
  double sum = 0;
  std::vector<PaddleTensor> output_slots;
  for (int i = 0; i < FLAGS_repeat; i++) {
    timer.tic();
    CHECK(predictor->Run(input_slots, &output_slots));
    sum += timer.toc();
  }
  PrintTime(sum, batch_size, FLAGS_repeat);

  // Get output
  LOG(INFO) << "get outputs " << output_slots.size();
  for (auto &output : output_slots) {
    LOG(INFO) << "output.shape: " << to_string(output.shape);
    // no lod ?
    CHECK_EQ(output.lod.size(), 0UL);
    LOG(INFO) << "output.dtype: " << output.dtype;

    // Summarize at most the first five values. The count is clamped to the
    // buffer's byte length so a small output tensor is never read past its
    // end.
    // NOTE(review): the buffer is interpreted as float regardless of
    // output.dtype — confirm the model's outputs are FLOAT32.
    const size_t num_floats = output.data.length() / sizeof(float);
    const size_t num_shown = num_floats < 5 ? num_floats : 5;
    const float *values = static_cast<float *>(output.data.data());
    std::stringstream ss;
    for (size_t i = 0; i < num_shown; i++) {
      ss << values[i] << " ";
    }
    LOG(INFO) << "output.data summary: " << ss.str();
    // one batch ends
  }
}
// gtest entry point: runs Main() once with the value of --batch_size.
TEST(text_classification, basic) { Main(FLAGS_batch_size); }
} // namespace paddle
// References every IR pass this test may exercise.
// NOTE(review): USE_PASS presumably forces each pass's registration to be
// linked into this binary — confirm against framework/ir/pass.h. Other files
// touched by this change drop their USE_PASS lists in favor of including
// paddle_inference_pass.h; check whether this list is still needed.
USE_PASS(fc_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
...@@ -18,10 +18,7 @@ if(APPLE) ...@@ -18,10 +18,7 @@ if(APPLE)
endif(APPLE) endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})
graph_viz_pass fc_fuse_pass
infer_clean_graph_pass
)
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
...@@ -47,7 +44,19 @@ function(inference_api_test TARGET_NAME) ...@@ -47,7 +44,19 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api
analysis
ir_pass_manager
pass
fc_fuse_pass
fc_lstm_fuse_pass
seq_concat_fc_fuse_pass
graph_viz_pass
infer_clean_graph_pass
graph_pattern_detector
infer_clean_graph_pass
attention_lstm_fuse_pass
)
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
......
...@@ -14,10 +14,13 @@ ...@@ -14,10 +14,13 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include <memory> #include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
namespace paddle { namespace paddle {
...@@ -27,6 +30,8 @@ bool AnalysisPredictor::Init( ...@@ -27,6 +30,8 @@ bool AnalysisPredictor::Init(
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
LOG(WARNING) << "ir optimize only supports CPU currently";
config_.enable_ir_optim = false;
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
} }
...@@ -72,7 +77,7 @@ bool AnalysisPredictor::Init( ...@@ -72,7 +77,7 @@ bool AnalysisPredictor::Init(
void AnalysisPredictor::OptimizeInferenceProgram() { void AnalysisPredictor::OptimizeInferenceProgram() {
LOG(INFO) << "optimize begin"; LOG(INFO) << "optimize begin";
FLAGS_IA_enable_ir = true; FLAGS_IA_enable_ir = config_.enable_ir_optim;
FLAGS_IA_enable_tensorrt_subgraph_engine = false; FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = ""; // Don't output the model. FLAGS_IA_output_storage_path = ""; // Don't output the model.
// Analyze inference_program // Analyze inference_program
...@@ -89,24 +94,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -89,24 +94,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
argument_.origin_program_desc.reset( argument_.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto())); new ProgramDesc(*inference_program_->Proto()));
Analyzer().Run(&argument_); PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude,
"Only kExclude is supported yet.");
Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
CHECK(argument_.transformed_program_desc); CHECK(argument_.transformed_program_desc);
VLOG(5) << "to prepare executor"; VLOG(5) << "to prepare executor";
// LOG(INFO) << "transformed_parogram_desc " <<
// argument.transformed_program_desc->DebugString();
inference_program_.reset( inference_program_.reset(
new framework::ProgramDesc(*argument_.transformed_program_desc)); new framework::ProgramDesc(*argument_.transformed_program_desc));
PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr)); if (argument_.Has(framework::ir::kParamScopeAttr)) {
// Update scope. // Update scope.
scope_.reset( scope_.reset(
argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr)); argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
LOG(INFO) << "optimize end =="; }
LOG(INFO) << "== optimize end ==";
} }
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) { AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) {
VLOG(3) << "create NativePredictor"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memeroy // 1. GPU memeroy
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
...@@ -133,7 +140,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -133,7 +140,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
} }
} // namespace paddle } // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc; ...@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc;
*/ */
class AnalysisPredictor : public NativePaddlePredictor { class AnalysisPredictor : public NativePaddlePredictor {
public: public:
explicit AnalysisPredictor(const NativeConfig& config) explicit AnalysisPredictor(const AnalysisConfig& config)
: NativePaddlePredictor(config), config_(config) {} : NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope); bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
...@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor { ...@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
Argument& analysis_argument() { return argument_; } Argument& analysis_argument() { return argument_; }
private: private:
NativeConfig config_; AnalysisConfig config_;
Argument argument_; Argument argument_;
}; };
......
...@@ -176,7 +176,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -176,7 +176,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::Scope *scope) { framework::Scope *scope) {
VLOG(3) << "Predictor::set_feed"; VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feeds_.size()) { if (inputs.size() != feeds_.size()) {
LOG(ERROR) << "wrong feed input size."; LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
<< inputs.size();
return false; return false;
} }
for (size_t i = 0; i < inputs.size(); ++i) { for (size_t i = 0; i < inputs.size(); ++i) {
......
...@@ -14,7 +14,7 @@ else ...@@ -14,7 +14,7 @@ else
fi fi
PREFIX=inference-vis-demos%2F PREFIX=inference-vis-demos%2F
URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
# download vis_demo data # download vis_demo data
function download() { function download() {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <sys/time.h> #include <sys/time.h>
#include <algorithm> #include <algorithm>
#include <numeric>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
......
...@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig { ...@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig {
int workspace_size{1 << 30}; int workspace_size{1 << 30};
}; };
// NOTE WIP, not stable yet.
// Configuration for the analysis predictor: everything in NativeConfig plus
// switches that control IR optimization of the inference program.
struct AnalysisConfig : public NativeConfig {
// Selects how `ir_passes` is interpreted.
enum class IrPassMode {
kSystem, // Use the system default passes without customization.
kInclude, // `ir_passes` lists the passes to run.
kExclude // `ir_passes` lists the passes to disable.
};
// Enables IR optimization. The analysis predictor in this change resets it
// to false (with a warning) when use_gpu is set.
bool enable_ir_optim = true;
// Defaults to kExclude, which the analysis predictor in this change enforces
// as the only supported mode.
IrPassMode ir_mode{IrPassMode::kExclude};
// attention lstm fuse works only on some specific models, disable as default.
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
};
// A factory to help create different predictors. // A factory to help create different predictors.
// //
// FOR EXTENSION DEVELOPER: // FOR EXTENSION DEVELOPER:
......
{ {
global: global:
*paddle*; *paddle*;
*Pass*;
local: local:
*; *;
}; };
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/auc_op.h" #include "paddle/fluid/operators/auc_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel { ...@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(predict_height, label_height, PADDLE_ENFORCE_EQ(predict_height, label_height,
"Out and Label should have same height."); "Out and Label should have same height.");
int num_thres = ctx->Attrs().Get<int>("num_thresholds"); int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
ctx->SetOutputDim("AUC", {1}); ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("TPOut", {num_thres}); ctx->SetOutputDim("BatchAUC", {1});
ctx->SetOutputDim("TNOut", {num_thres}); ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
ctx->SetOutputDim("FPOut", {num_thres}); ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
ctx->SetOutputDim("FNOut", {num_thres});
ctx->ShareLoD("Predict", /*->*/ "AUC");
} }
protected: protected:
...@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", AddInput("Label",
"A 2D int tensor indicating the label of the training data. " "A 2D int tensor indicating the label of the training data. "
"shape: [batch_size, 1]"); "shape: [batch_size, 1]");
AddInput("TP", "True-Positive value.");
AddInput("FP", "False-Positive value.");
AddInput("TN", "True-Negative value.");
AddInput("FN", "False-Negative value.");
// TODO(typhoonzero): support weight input // TODO(typhoonzero): support weight input
AddInput("StatPos", "Statistic value when label = 1");
AddInput("StatNeg", "Statistic value when label = 0");
AddOutput("AUC", AddOutput("AUC",
"A scalar representing the " "A scalar representing the "
"current area-under-the-curve."); "current area-under-the-curve.");
AddOutput("TPOut", "True-Positive value."); AddOutput("BatchAUC", "The AUC for current batch");
AddOutput("FPOut", "False-Positive value."); AddOutput("StatPosOut", "Statistic value when label = 1");
AddOutput("TNOut", "True-Negative value."); AddOutput("StatNegOut", "Statistic value when label = 0");
AddOutput("FNOut", "False-Negative value.");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.") AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC"); .SetDefault("ROC");
AddAttr<int>("num_thresholds", AddAttr<int>("num_thresholds",
"The number of thresholds to use when discretizing the" "The number of thresholds to use when discretizing the"
" roc curve.") " roc curve.")
.SetDefault(200); .SetDefault((2 << 12) - 1);
AddComment(R"DOC( AddComment(R"DOC(
Area Under The Curve (AUC) Operator. Area Under The Curve (AUC) Operator.
......
...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
...@@ -23,106 +23,85 @@ namespace operators { ...@@ -23,106 +23,85 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class AucKernel : public framework::OpKernel<T> { class AucKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto* predict = ctx.Input<Tensor>("Predict"); auto *predict = ctx.Input<Tensor>("Predict");
auto* label = ctx.Input<Tensor>("Label"); auto *label = ctx.Input<Tensor>("Label");
auto* auc = ctx.Output<Tensor>("AUC");
std::string curve = ctx.Attr<std::string>("curve");
int num_thresholds = ctx.Attr<int>("num_thresholds");
int num_pred_buckets = num_thresholds + 1;
// Only use output var for now, make sure it's persistable and // Only use output var for now, make sure it's persistable and
// not cleaned up for each batch. // not cleaned up for each batch.
auto* true_positive = ctx.Output<Tensor>("TPOut"); auto *auc = ctx.Output<Tensor>("AUC");
auto* false_positive = ctx.Output<Tensor>("FPOut"); auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
auto* true_negative = ctx.Output<Tensor>("TNOut"); auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
auto* false_negative = ctx.Output<Tensor>("FNOut");
auto* auc_data = auc->mutable_data<double>(ctx.GetPlace()); auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
auc);
std::string curve = ctx.Attr<std::string>("curve"); auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
int num_thresholds = ctx.Attr<int>("num_thresholds"); std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
std::vector<double> thresholds_list; std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
thresholds_list.reserve(num_thresholds); calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
for (int i = 1; i < num_thresholds - 1; i++) { num_thresholds, batch_auc);
thresholds_list[i] = static_cast<double>(i) / (num_thresholds - 1); }
}
const double kEpsilon = 1e-7;
thresholds_list[0] = 0.0f - kEpsilon;
thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;
private:
inline static double trapezoidArea(double X1, double X2, double Y1,
double Y2) {
return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
}
inline static void calcAuc(const framework::ExecutionContext &ctx,
const framework::Tensor *label,
const framework::Tensor *predict,
int64_t *stat_pos, int64_t *stat_neg,
int num_thresholds,
framework::Tensor *auc_tensor) {
size_t batch_size = predict->dims()[0]; size_t batch_size = predict->dims()[0];
size_t inference_width = predict->dims()[1]; size_t inference_width = predict->dims()[1];
const T *inference_data = predict->data<T>();
const auto *label_data = label->data<int64_t>();
auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
const T* inference_data = predict->data<T>(); for (size_t i = 0; i < batch_size; i++) {
const auto* label_data = label->data<int64_t>(); uint32_t binIdx = static_cast<uint32_t>(
inference_data[i * inference_width + 1] * num_thresholds);
auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace()); if (label_data[i]) {
auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace()); stat_pos[binIdx] += 1.0;
auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace()); } else {
auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace()); stat_neg[binIdx] += 1.0;
for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
// calculate TP, FN, TN, FP for current thresh
int64_t tp = 0, fn = 0, tn = 0, fp = 0;
for (size_t i = 0; i < batch_size; i++) {
// NOTE: label_data used as bool, labels > 0 will be treated as true.
if (label_data[i]) {
if (inference_data[i * inference_width + 1] >=
(thresholds_list[idx_thresh])) {
tp++;
} else {
fn++;
}
} else {
if (inference_data[i * inference_width + 1] >=
(thresholds_list[idx_thresh])) {
fp++;
} else {
tn++;
}
}
} }
// store rates
tp_data[idx_thresh] += tp;
fn_data[idx_thresh] += fn;
tn_data[idx_thresh] += tn;
fp_data[idx_thresh] += fp;
} }
// epsilon to avoid divide by zero.
double epsilon = 1e-6; *auc = 0.0f;
// Riemann sum to caculate auc.
Tensor tp_rate, fp_rate, rec_rate; double totPos = 0.0;
tp_rate.Resize({num_thresholds}); double totNeg = 0.0;
fp_rate.Resize({num_thresholds}); double totPosPrev = 0.0;
rec_rate.Resize({num_thresholds}); double totNegPrev = 0.0;
auto* tp_rate_data = tp_rate.mutable_data<double>(ctx.GetPlace());
auto* fp_rate_data = fp_rate.mutable_data<double>(ctx.GetPlace()); int idx = num_thresholds;
auto* rec_rate_data = rec_rate.mutable_data<double>(ctx.GetPlace());
for (int i = 0; i < num_thresholds; i++) { while (idx >= 0) {
tp_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) / totPosPrev = totPos;
(tp_data[i] + fn_data[i] + epsilon); totNegPrev = totNeg;
fp_rate_data[i] = totPos += stat_pos[idx];
static_cast<double>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon); totNeg += stat_neg[idx];
rec_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) / *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
(tp_data[i] + fp_data[i] + epsilon);
--idx;
} }
*auc_data = 0.0f;
if (curve == "ROC") { if (totPos > 0.0 && totNeg > 0.0) {
for (int i = 0; i < num_thresholds - 1; i++) { *auc = *auc / totPos / totNeg;
auto dx = fp_rate_data[i] - fp_rate_data[i + 1];
auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f;
*auc_data = *auc_data + dx * y;
}
} else if (curve == "PR") {
for (int i = 1; i < num_thresholds; i++) {
auto dx = tp_rate_data[i] - tp_rate_data[i - 1];
auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f;
*auc_data = *auc_data + dx * y;
}
} }
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
/*
 * Encodes ground-truth boxes as bounding-box regression deltas relative to
 * the given example (proposal) boxes.
 *
 * For each of the first `box_num` rows, columns 0..3 of a box are read as
 * (x0, y0, x1, y1). Width and height get +1 when `normalized` is false.
 * The four outputs written to `box_delta` are:
 *   (gt_ctr_x - ex_ctr_x) / ex_w, (gt_ctr_y - ex_ctr_y) / ex_h,
 *   log(gt_w / ex_w),             log(gt_h / ex_h)
 * and, when `weights` is non-null, each is divided by the matching
 * weights[0..3].
 *
 * NOTE(review): all three tensors are viewed as rank-2 Eigen tensors and are
 * assumed to hold at least `box_num` rows of 4 columns — confirm at callers.
 */
template <typename T>
inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
                       const framework::Tensor& gt_boxes, const T* weights,
                       const bool normalized, framework::Tensor* box_delta) {
  auto proposals = framework::EigenTensor<T, 2>::From(ex_boxes);
  auto targets = framework::EigenTensor<T, 2>::From(gt_boxes);
  auto deltas = framework::EigenTensor<T, 2>::From(*box_delta);

  // True when coordinates are unnormalized; adds one to every extent.
  const bool unnormalized = (normalized == false);

  for (int64_t row = 0; row < box_num; ++row) {
    // Extent and center of the example box.
    const T ex_w = proposals(row, 2) - proposals(row, 0) + unnormalized;
    const T ex_h = proposals(row, 3) - proposals(row, 1) + unnormalized;
    const T ex_ctr_x = proposals(row, 0) + 0.5 * ex_w;
    const T ex_ctr_y = proposals(row, 1) + 0.5 * ex_h;

    // Extent and center of the ground-truth box.
    const T gt_w = targets(row, 2) - targets(row, 0) + unnormalized;
    const T gt_h = targets(row, 3) - targets(row, 1) + unnormalized;
    const T gt_ctr_x = targets(row, 0) + 0.5 * gt_w;
    const T gt_ctr_y = targets(row, 1) + 0.5 * gt_h;

    // Center offsets are scaled by the example box size; sizes use log ratio.
    deltas(row, 0) = (gt_ctr_x - ex_ctr_x) / ex_w;
    deltas(row, 1) = (gt_ctr_y - ex_ctr_y) / ex_h;
    deltas(row, 2) = std::log(gt_w / ex_w);
    deltas(row, 3) = std::log(gt_h / ex_h);

    // Optional per-coordinate rescaling.
    if (weights) {
      for (int col = 0; col < 4; ++col) {
        deltas(row, col) = deltas(row, col) / weights[col];
      }
    }
  }
}
// Copies `num` rows of `in_stride` elements each from `in` into `out`.
// Output row k is a byte-wise copy of input row index[k]; rows are stored
// contiguously in both buffers. The caller must size `out` for
// num * in_stride elements.
template <typename T>
void Gather(const T* in, const int in_stride, const int* index, const int num,
            T* out) {
  // Every row has the same byte size, so compute it once.
  const int row_bytes = in_stride * sizeof(T);
  T* dst = out;
  int row = 0;
  while (row < num) {
    const T* src = in + index[row] * in_stride;
    memcpy(dst, src, row_bytes);
    dst += in_stride;
    ++row;
  }
}
} // namespace operators
} // namespace paddle
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes, ...@@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
} }
} }
template <typename T>
void BoxToDelta(int box_num, const Tensor& ex_boxes, const Tensor& gt_boxes,
const std::vector<float>& weights, Tensor* box_delta) {
auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
auto box_delta_et = framework::EigenTensor<T, 2>::From(*box_delta);
T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y;
for (int64_t i = 0; i < box_num; ++i) {
ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + 1;
ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + 1;
ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w;
ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h;
gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + 1;
gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + 1;
gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w;
gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h;
box_delta_et(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0];
box_delta_et(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1];
box_delta_et(i, 2) = log(gt_w / ex_w) / ex_w / weights[2];
box_delta_et(i, 3) = log(gt_h / ex_h) / ex_h / weights[3];
}
}
template <typename T> template <typename T>
std::vector<std::vector<int>> SampleFgBgGt( std::vector<std::vector<int>> SampleFgBgGt(
const platform::CPUDeviceContext& context, Tensor* iou, const platform::CPUDeviceContext& context, Tensor* iou,
...@@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, ...@@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
Tensor* sampled_labels, Tensor* sampled_gts) { Tensor* sampled_labels, Tensor* sampled_gts) {
int fg_num = fg_inds.size(); int fg_num = fg_inds.size();
int bg_num = bg_inds.size(); int bg_num = bg_inds.size();
int gt_num = fg_num + bg_num;
Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t;
int* fg_inds_data = fg_inds_t.mutable_data<int>({fg_num}, context.GetPlace()); int* fg_inds_data = fg_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
int* bg_inds_data = bg_inds_t.mutable_data<int>({bg_num}, context.GetPlace()); int* bg_inds_data = bg_inds_t.mutable_data<int>({bg_num}, context.GetPlace());
int* gt_box_inds_data = int* gt_box_inds_data =
gt_box_inds_t.mutable_data<int>({gt_num}, context.GetPlace()); gt_box_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
int* gt_label_inds_data = int* gt_label_inds_data =
gt_label_inds_t.mutable_data<int>({fg_num}, context.GetPlace()); gt_label_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data); std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data);
...@@ -303,18 +278,20 @@ std::vector<Tensor> SampleRoisForOneImage( ...@@ -303,18 +278,20 @@ std::vector<Tensor> SampleRoisForOneImage(
// Gather boxes and labels // Gather boxes and labels
Tensor sampled_boxes, sampled_labels, sampled_gts; Tensor sampled_boxes, sampled_labels, sampled_gts;
int boxes_num = fg_inds.size() + bg_inds.size(); int fg_num = fg_inds.size();
int bg_num = bg_inds.size();
int boxes_num = fg_num + bg_num;
framework::DDim bbox_dim({boxes_num, kBoxDim}); framework::DDim bbox_dim({boxes_num, kBoxDim});
sampled_boxes.mutable_data<T>(bbox_dim, context.GetPlace()); sampled_boxes.mutable_data<T>(bbox_dim, context.GetPlace());
sampled_labels.mutable_data<int>({boxes_num}, context.GetPlace()); sampled_labels.mutable_data<int>({boxes_num}, context.GetPlace());
sampled_gts.mutable_data<T>(bbox_dim, context.GetPlace()); sampled_gts.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
GatherBoxesLabels<T>(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, GatherBoxesLabels<T>(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds,
gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts);
// Compute targets // Compute targets
Tensor bbox_targets_single; Tensor bbox_targets_single;
bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace()); bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
BoxToDelta<T>(boxes_num, sampled_boxes, sampled_gts, bbox_reg_weights, BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false,
&bbox_targets_single); &bbox_targets_single);
// Scale rois // Scale rois
...@@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> { ...@@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
auto rpn_rois_lod = rpn_rois->lod().back(); auto rpn_rois_lod = rpn_rois->lod().back();
auto gt_classes_lod = gt_classes->lod().back(); auto gt_classes_lod = gt_classes->lod().back();
auto gt_boxes_lod = gt_boxes->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back();
for (size_t i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
Tensor rpn_rois_slice = Tensor rpn_rois_slice =
rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
Tensor gt_classes_slice = Tensor gt_classes_slice =
......
...@@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4}, rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
context.GetPlace()); context.GetPlace());
rpn_roi_probs->mutable_data<T>({scores->numel() / 4, 1}, rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
context.GetPlace());
Tensor bbox_deltas_swap, scores_swap; Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox}, bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
...@@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
CPUGather<T>(ctx, proposals, keep, &bbox_sel); CPUGather<T>(ctx, proposals, keep, &bbox_sel);
CPUGather<T>(ctx, scores_sel, keep, &scores_filter); CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
if (nms_thresh <= 0) { if (nms_thresh <= 0) {
return std::make_pair(bbox_sel, scores_sel); return std::make_pair(bbox_sel, scores_filter);
} }
Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <random> #include <random>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
...@@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { ...@@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
auto in_dims = ctx->GetInputDim("DistMat"); auto in_dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(in_dims.size(), 2, PADDLE_ENFORCE_EQ(in_dims.size(), 2,
"The rank of Input(DistMat) must be 2."); "The rank of Input(DistMat) must be 2.");
ctx->SetOutputDim("LocationIndex", {-1});
ctx->SetOutputDim("ScoreIndex", {-1});
ctx->SetOutputDim("TargetLabel", {-1, 1});
ctx->SetOutputDim("TargetBBox", {-1, 4});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("DistMat")->type()),
platform::CPUPlace());
} }
}; };
template <typename T> template <typename T>
class RpnTargetAssignKernel : public framework::OpKernel<T> { class RpnTargetAssignKernel : public framework::OpKernel<T> {
public: public:
// Builds the RPN training targets for a whole mini-batch.
//
// Inputs read from the context:
//   Anchor  - anchors, one row of 4 values per anchor.
//   GtBox   - ground-truth boxes; sliced per image with DistMat's LoD.
//   DistMat - LoDTensor whose second dimension equals the number of anchors;
//             its last LoD level splits the rows into images.
// Outputs written:
//   LocationIndex - batch-wide indexes of the sampled foreground anchors.
//   ScoreIndex    - per image: sampled foreground indexes followed by the
//                   sampled background indexes.
//   TargetBBox    - one row of 4 values per sampled foreground anchor.
//   TargetLabel   - label gathered for every entry of ScoreIndex.
//
// All outputs are first allocated for the worst case (batch_num * anchor_num
// entries) and shrunk with Resize() once the real counts are known.
void Compute(const framework::ExecutionContext& context) const override {
  auto* anchor_t = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
  auto* gt_bbox_t = context.Input<Tensor>("GtBox");
  auto* dist_t = context.Input<LoDTensor>("DistMat");

  auto* loc_index_t = context.Output<Tensor>("LocationIndex");
  auto* score_index_t = context.Output<Tensor>("ScoreIndex");
  auto* tgt_bbox_t = context.Output<Tensor>("TargetBBox");
  auto* tgt_lbl_t = context.Output<Tensor>("TargetLabel");

  // The batch is split with the last LoD level of DistMat. Calling back() on
  // an empty LoD is undefined behaviour, so fail with a clear message when
  // DistMat arrives without any LoD information.
  PADDLE_ENFORCE_GT(dist_t->lod().size(), 0UL,
                    "Input(DistMat) must carry LoD information.");
  auto lod = dist_t->lod().back();
  int64_t batch_num = static_cast<int64_t>(lod.size() - 1);
  int64_t anchor_num = dist_t->dims()[1];
  PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]);

  int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
  float pos_threshold = context.Attr<float>("rpn_positive_overlap");
  float neg_threshold = context.Attr<float>("rpn_negative_overlap");
  float fg_fraction = context.Attr<float>("fg_fraction");

  // Upper bound on the foreground samples taken from a single image.
  int fg_num_per_batch = static_cast<int>(rpn_batch_size * fg_fraction);

  // Worst case: every anchor of every image is selected.
  int64_t max_num = batch_num * anchor_num;
  auto place = context.GetPlace();

  tgt_bbox_t->mutable_data<T>({max_num, 4}, place);
  auto* loc_index = loc_index_t->mutable_data<int>({max_num}, place);
  auto* score_index = score_index_t->mutable_data<int>({max_num}, place);

  // Scratch labels for every anchor of every image, initialised to -1.
  // SampleFgBgGt receives a pointer into this buffer for the current image.
  Tensor tmp_tgt_lbl;
  auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data<int64_t>({max_num}, place);
  auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
  math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
  iset(dev_ctx, &tmp_tgt_lbl, static_cast<int64_t>(-1));

  // Use the "seed" attribute when "fix_seed" is set; otherwise draw a seed
  // from std::random_device.
  std::random_device rnd;
  std::minstd_rand engine;
  int seed =
      context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
  engine.seed(seed);

  int fg_num = 0;  // foreground samples accumulated over the batch
  int bg_num = 0;  // background samples accumulated over the batch
  for (int i = 0; i < batch_num; ++i) {
    // Rows [lod[i], lod[i + 1]) of DistMat and GtBox belong to image i.
    Tensor dist = dist_t->Slice(lod[i], lod[i + 1]);
    Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]);
    // fg_bg_gt[0]: sampled foreground anchor indexes (within this image)
    // fg_bg_gt[1]: sampled background anchor indexes (within this image)
    // fg_bg_gt[2]: index used to gather the matching row of gt_bbox for
    //              each foreground anchor
    // NOTE(review): `engine` is handed over as a plain argument; if
    // SampleFgBgGt takes it by value, every image starts from the same
    // generator state - confirm this is intended.
    auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold,
                                 rpn_batch_size, fg_num_per_batch, engine,
                                 tmp_lbl_data + i * anchor_num);

    int cur_fg_num = fg_bg_gt[0].size();
    int cur_bg_num = fg_bg_gt[1].size();
    // Shift per-image anchor indexes by i * anchor_num so that they address
    // the flattened batch-wide anchor list.
    std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index,
                   [i, anchor_num](int d) { return d + i * anchor_num; });
    // ScoreIndex lists this image's foreground indexes first ...
    memcpy(score_index, loc_index, cur_fg_num * sizeof(int));
    // ... followed by its background indexes.
    std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(),
                   score_index + cur_fg_num,
                   [i, anchor_num](int d) { return d + i * anchor_num; });

    // get target bbox deltas
    if (cur_fg_num) {
      Tensor fg_gt;
      T* gt_data = fg_gt.mutable_data<T>({cur_fg_num, 4}, place);
      // This image's rows inside the batch-wide TargetBBox output.
      Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num);
      T* tgt_data = tgt_bbox.data<T>();
      // Stage the sampled anchors in tgt_bbox and their ground-truth boxes
      // in fg_gt; BoxToDelta then writes its result back into tgt_bbox.
      Gather<T>(anchor_t->data<T>(), 4,
                reinterpret_cast<int*>(&fg_bg_gt[0][0]), cur_fg_num,
                tgt_data);
      Gather<T>(gt_bbox.data<T>(), 4, reinterpret_cast<int*>(&fg_bg_gt[2][0]),
                cur_fg_num, gt_data);
      BoxToDelta<T>(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox);
    }

    // Advance the write cursors past what this image produced.
    loc_index += cur_fg_num;
    score_index += cur_fg_num + cur_bg_num;
    fg_num += cur_fg_num;
    bg_num += cur_bg_num;
  }

  int lbl_num = fg_num + bg_num;
  PADDLE_ENFORCE_LE(fg_num, max_num);
  PADDLE_ENFORCE_LE(lbl_num, max_num);

  // Shrink the over-allocated outputs to the number of entries written.
  tgt_bbox_t->Resize({fg_num, 4});
  loc_index_t->Resize({fg_num});
  score_index_t->Resize({lbl_num});
  // TargetLabel holds the scratch label of every anchor listed in ScoreIndex.
  auto* lbl_data = tgt_lbl_t->mutable_data<int64_t>({lbl_num, 1}, place);
  Gather<int64_t>(tmp_lbl_data, 1, score_index_t->data<int>(), lbl_num,
                  lbl_data);
}
private:
void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max, void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
const int row, const int col, const float pos_threshold, const int row, const int col, const float pos_threshold,
const float neg_threshold, int64_t* target_label_data, const float neg_threshold, int64_t* target_label,
std::vector<int>* fg_inds, std::vector<int>* bg_inds) const { std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
int fg_offset = fg_inds->size(); float epsilon = 0.0001;
int bg_offset = bg_inds->size();
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
const T* v = dist_data + i * col; const T* v = dist_data + i * col;
T max_dist = *std::max_element(v, v + col); T max = *std::max_element(v, v + col);
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
T val = dist_data[i * col + j]; if (std::abs(max - v[j]) < epsilon) {
if (val == max_dist) target_label_data[j] = 1; target_label[j] = 1;
}
} }
} }
// Pick the fg/bg and count the number // Pick the fg/bg
const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
if (anchor_to_gt_max.data<T>()[j] > pos_threshold) { if (anchor_to_gt_max_data[j] >= pos_threshold) {
target_label_data[j] = 1; target_label[j] = 1;
} else if (anchor_to_gt_max.data<T>()[j] < neg_threshold) { } else if (anchor_to_gt_max_data[j] < neg_threshold) {
target_label_data[j] = 0; target_label[j] = 0;
} }
if (target_label_data[j] == 1) { if (target_label[j] == 1) {
fg_inds->push_back(fg_offset + j); fg_inds->push_back(j);
} else if (target_label_data[j] == 0) { } else if (target_label[j] == 0) {
bg_inds->push_back(bg_offset + j); bg_inds->push_back(j);
} }
} }
} }
void ReservoirSampling(const int num, const int offset, void ReservoirSampling(const int num, std::minstd_rand engine,
std::minstd_rand engine,
std::vector<int>* inds) const { std::vector<int>* inds) const {
std::uniform_real_distribution<float> uniform(0, 1); std::uniform_real_distribution<float> uniform(0, 1);
const int64_t size = static_cast<int64_t>(inds->size() - offset); size_t len = inds->size();
if (size > num) { if (len > static_cast<size_t>(num)) {
for (int64_t i = num; i < size; ++i) { for (size_t i = num; i < len; ++i) {
int rng_ind = std::floor(uniform(engine) * i); int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < num) if (rng_ind < num)
std::iter_swap(inds->begin() + rng_ind + offset, std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
inds->begin() + i + offset);
} }
inds->resize(num);
} }
} }
void RpnTargetAssign(const framework::ExecutionContext& ctx, // std::vector<std::vector<int>> RpnTargetAssign(
const Tensor& dist, const float pos_threshold, std::vector<std::vector<int>> SampleFgBgGt(
const float neg_threshold, const int rpn_batch_size, const platform::CPUDeviceContext& ctx, const Tensor& dist,
const int fg_num, std::minstd_rand engine, const float pos_threshold, const float neg_threshold,
std::vector<int>* fg_inds, std::vector<int>* bg_inds, const int rpn_batch_size, const int fg_num, std::minstd_rand engine,
int64_t* target_label_data) const { int64_t* target_label) const {
auto* dist_data = dist.data<T>(); auto* dist_data = dist.data<T>();
int64_t row = dist.dims()[0]; int row = dist.dims()[0];
int64_t col = dist.dims()[1]; int col = dist.dims()[1];
int fg_offset = fg_inds->size();
int bg_offset = bg_inds->size(); std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::vector<int> gt_inds;
// Calculate the max IoU between anchors and gt boxes // Calculate the max IoU between anchors and gt boxes
Tensor anchor_to_gt_max; // Map from anchor to gt box that has highest overlap
anchor_to_gt_max.mutable_data<T>( auto place = ctx.GetPlace();
framework::make_ddim({static_cast<int64_t>(col), 1}), Tensor anchor_to_gt_max, anchor_to_gt_argmax;
platform::CPUPlace()); anchor_to_gt_max.mutable_data<T>({col}, place);
auto& place = *ctx.template device_context<platform::CPUDeviceContext>() int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
.eigen_device();
auto x = EigenMatrix<T>::From(dist); auto x = framework::EigenMatrix<T>::From(dist);
auto x_col_max = EigenMatrix<T>::From(anchor_to_gt_max); auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
x_col_max.device(place) = auto x_col_argmax =
x.maximum(Eigen::DSizes<int, 1>(0)) framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
.reshape(Eigen::DSizes<int, 2>(static_cast<int64_t>(col), 1)); x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
x_col_argmax = x.argmax(0).template cast<int>();
// Follow the Faster RCNN's implementation // Follow the Faster RCNN's implementation
ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold, ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
neg_threshold, target_label_data, fg_inds, bg_inds); neg_threshold, target_label, &fg_inds, &bg_inds);
// Reservoir Sampling // Reservoir Sampling
ReservoirSampling(fg_num, fg_offset, engine, fg_inds); ReservoirSampling(fg_num, engine, &fg_inds);
int bg_num = rpn_batch_size - (fg_inds->size() - fg_offset); int fg_num2 = static_cast<int>(fg_inds.size());
ReservoirSampling(bg_num, bg_offset, engine, bg_inds); int bg_num = rpn_batch_size - fg_num2;
} ReservoirSampling(bg_num, engine, &bg_inds);
void Compute(const framework::ExecutionContext& context) const override { gt_inds.reserve(fg_num2);
auto* dist = context.Input<LoDTensor>("DistMat"); for (int i = 0; i < fg_num2; ++i) {
auto* loc_index = context.Output<Tensor>("LocationIndex"); gt_inds.emplace_back(argmax[fg_inds[i]]);
auto* score_index = context.Output<Tensor>("ScoreIndex");
auto* tgt_lbl = context.Output<Tensor>("TargetLabel");
auto col = dist->dims()[1];
int64_t n = dist->lod().size() == 0UL
? 1
: static_cast<int64_t>(dist->lod().back().size() - 1);
if (dist->lod().size()) {
PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL,
"Only support 1 level of LoD.");
} }
int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im"); std::vector<std::vector<int>> fg_bg_gt;
float pos_threshold = context.Attr<float>("rpn_positive_overlap"); fg_bg_gt.emplace_back(fg_inds);
float neg_threshold = context.Attr<float>("rpn_negative_overlap"); fg_bg_gt.emplace_back(bg_inds);
float fg_fraction = context.Attr<float>("fg_fraction"); fg_bg_gt.emplace_back(gt_inds);
int fg_num = static_cast<int>(rpn_batch_size * fg_fraction);
int64_t* target_label_data =
tgt_lbl->mutable_data<int64_t>({n * col, 1}, context.GetPlace());
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>(); return fg_bg_gt;
math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
iset(dev_ctx, tgt_lbl, static_cast<int>(-1));
std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed);
if (n == 1) {
RpnTargetAssign(context, *dist, pos_threshold, neg_threshold,
rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
target_label_data);
} else {
auto lod = dist->lod().back();
for (size_t i = 0; i < lod.size() - 1; ++i) {
Tensor one_ins = dist->Slice(lod[i], lod[i + 1]);
RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold,
rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
target_label_data + i * col);
}
}
int* loc_index_data = loc_index->mutable_data<int>(
{static_cast<int>(fg_inds.size())}, context.GetPlace());
int* score_index_data = score_index->mutable_data<int>(
{static_cast<int>(fg_inds.size() + bg_inds.size())},
context.GetPlace());
memcpy(loc_index_data, reinterpret_cast<int*>(&fg_inds[0]),
fg_inds.size() * sizeof(int));
memcpy(score_index_data, reinterpret_cast<int*>(&fg_inds[0]),
fg_inds.size() * sizeof(int));
memcpy(score_index_data + fg_inds.size(),
reinterpret_cast<int*>(&bg_inds[0]), bg_inds.size() * sizeof(int));
} }
}; };
class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("Anchor",
"(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4].");
AddInput( AddInput(
"DistMat", "DistMat",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
...@@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
"ScoreIndex", "ScoreIndex",
"(Tensor), The indexes of foreground and background anchors in all " "(Tensor), The indexes of foreground and background anchors in all "
"RPN anchors(The rest anchors are ignored). The shape of the " "RPN anchors(The rest anchors are ignored). The shape of the "
"ScoreIndex is [F + B], F and B depend on the value of input " "ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
"tensor and attributes."); " number.");
AddOutput("TargetLabel", AddOutput("TargetBBox",
"(Tensor<int64_t>), The target labels of each anchor with shape " "(Tensor<int64_t>), The target bbox deltas with shape "
"[K * M, 1], " "[F, 4], F is the sampled foreground number.");
"K and M is the same as they are in DistMat."); AddOutput(
"TargetLabel",
"(Tensor<int64_t>), The target labels of each anchor with shape "
"[F + B, 1], F and B are sampled foreground and backgroud number.");
AddComment(R"DOC( AddComment(R"DOC(
This operator can be, for given the IoU between the ground truth bboxes and the This operator can be, for given the IoU between the ground truth bboxes and the
anchors, to assign classification and regression targets to each prediction. anchors, to assign classification and regression targets to each prediction.
......
...@@ -39,19 +39,6 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -39,19 +39,6 @@ bool RequestSendHandler::Handle(const std::string& varname,
const std::string& out_var_name) { const std::string& out_var_name) {
VLOG(4) << "RequestSendHandler:" << varname; VLOG(4) << "RequestSendHandler:" << varname;
// Async
if (!sync_mode_) {
rpc_server_->Profiler().OneStep();
try {
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
scope);
} catch (std::exception& e) {
LOG(ERROR) << "async: run sub program error " << e.what();
return false;
}
return true;
}
// Sync // Sync
if (varname == BATCH_BARRIER_MESSAGE) { if (varname == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
...@@ -60,17 +47,31 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -60,17 +47,31 @@ bool RequestSendHandler::Handle(const std::string& varname,
VLOG(3) << "sync: recv complete message"; VLOG(3) << "sync: recv complete message";
rpc_server_->Complete(); rpc_server_->Complete();
} else { } else {
VLOG(3) << "sync: received var_name: " << varname; // Async
rpc_server_->WaitCond(kRequestSend); if (!sync_mode_) {
VLOG(3) << "sync: processing received var: " << varname; VLOG(3) << "async process var: " << varname;
rpc_server_->Profiler().OneStep();
if (invar == nullptr) { try {
LOG(FATAL) << "sync: Can not find server side var: " << varname; executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
return false; scope);
} } catch (std::exception& e) {
if (invar->IsType<framework::SelectedRows>()) { LOG(ERROR) << "async: run sub program error " << e.what();
std::unique_lock<std::mutex> lock(mutex_sparse_vars_); return false;
sparse_vars_.push_back(invar); }
return true;
} else { // sync
rpc_server_->WaitCond(kRequestSend);
VLOG(3) << "sync: processing received var: " << varname;
if (invar == nullptr) {
LOG(FATAL) << "sync: Can not find server side var: " << varname;
return false;
}
if (invar->IsType<framework::SelectedRows>()) {
std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
sparse_vars_.push_back(invar);
}
} }
} }
return true; return true;
......
...@@ -89,12 +89,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -89,12 +89,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(b_dims[0], 1, PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1."); "The first dimension of Input(Bias) should be 1.");
PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"), auto use_peepholes = ctx->Attrs().Get<bool>("use_peepholes");
"Do not support peephole yet."); PADDLE_ENFORCE_EQ(b_dims[1], (use_peepholes ? 7 : 4) * frame_size,
PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
"The second dimension of Input(Bias) should be " "The second dimension of Input(Bias) should be "
"4 * %d if disable peepholes connection", "7 * %d if enable peepholes connection or"
frame_size); "4 * %d if disable peepholes",
frame_size, frame_size);
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
...@@ -232,16 +232,17 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -232,16 +232,17 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
act_cand = act_functor(act_cand_str); \ act_cand = act_functor(act_cand_str); \
} }
#define INIT_BASE_INPUT_OUTPUT \ #define INIT_BASE_INPUT_OUTPUT \
auto* x = ctx.Input<LoDTensor>("X"); \ auto* x = ctx.Input<LoDTensor>("X"); \
auto* h0 = ctx.Input<Tensor>("H0"); \ auto* h0 = ctx.Input<Tensor>("H0"); \
auto* c0 = ctx.Input<Tensor>("C0"); \ auto* c0 = ctx.Input<Tensor>("C0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \ auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \ auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* bias = ctx.Input<Tensor>("Bias"); \ auto* bias = ctx.Input<Tensor>("Bias"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \ auto* xx = ctx.Output<LoDTensor>("XX"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \ auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \ auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
bool use_peepholes = ctx.Attr<bool>("use_peepholes"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); bool is_reverse = ctx.Attr<bool>("is_reverse");
#define INIT_BASE_SIZES \ #define INIT_BASE_SIZES \
...@@ -266,12 +267,21 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -266,12 +267,21 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : nullptr; const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* c0_data = c0 ? c0->data<T>() : nullptr; const T* c0_data = c0 ? c0->data<T>() : nullptr;
const T* bias_data = bias->data<T>();
const T* wc_data = bias_data + D4; // w_ic, w_fc, w_oc
const T* wx_data = wx->data<T>(); const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>(); const T* wh_data = wh->data<T>();
T* xx_data = xx->mutable_data<T>(ctx.GetPlace()); T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace()); T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace()); T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
// use local variable
framework::DDim check_dims({3, D});
Tensor checked_cell; // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
auto checked_cell_data =
checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data, math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
xx_data, bias->data<T>()); xx_data, bias->data<T>());
...@@ -297,46 +307,86 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -297,46 +307,86 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
const T* prev_c_data = nullptr; const T* prev_c_data = nullptr;
const T* prev_h_data = nullptr; const T* prev_h_data = nullptr;
int tstart = 0; int tstart = 0;
if (h0_data) { if (h0_data) {
prev_h_data = h0_data + bid * D; prev_h_data = h0_data + bid * D;
prev_c_data = c0_data + bid * D; prev_c_data = c0_data + bid * D;
} else { } else {
// W_ch, W_ih, W_fh, W_oh // If step == 0 and there is no initialized hidden state, that is to say
act_gate(D3, xx_data + D, xx_data + D); // the H0 is zeros. Then W_h * H_t-1 can be skipped
// ~C_t
act_cand(D, xx_data, xx_data); act_cand(D, xx_data, xx_data);
// cell out= input*tilde if (use_peepholes) {
// I_t, F_t
act_gate(D2, xx_data + D, xx_data + D);
} else {
// I_t, F_t, O_t
act_gate(D3, xx_data + D, xx_data + D);
}
// C_t = I_t * ~C_t
blas.VMUL(D, xx_data, xx_data + D, cell_out_data); blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
if (use_peepholes) {
// + W_oc * C_t for peephole connection
blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
// O_t
act_gate(D, xx_data + D3, xx_data + D3);
}
// hidden out= act_state(cellout) * outgate // hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2); act_cell(D, cell_out_data, xx_data + D2);
// H_t = O_t * act_state(C_t)
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev // prev
prev_h_data = hidden_out_data; prev_h_data = hidden_out_data;
prev_c_data = cell_out_data; prev_c_data = cell_out_data;
tstart = 1;
tstart = 1;
move_step(); move_step();
} }
for (int step = tstart; step < seq_len; ++step) { for (int step = tstart; step < seq_len; ++step) {
// + W_h * H_t-1
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1), blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_h_data, D, wh_data, D4, static_cast<T>(1), xx_data, D4); prev_h_data, D, wh_data, D4, static_cast<T>(1), xx_data, D4);
// W_ch, W_ih, W_fh, W_oh // ~C_t
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data); act_cand(D, xx_data, xx_data);
// a = forget * prev_cell if (use_peepholes) {
// + W_ic|W_fc * C_t-1 for peephole connection
blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
blas.VADD(D2, xx_data + D, checked_cell_data, xx_data + D);
// I_t, F_t
act_gate(D2, xx_data + D, xx_data + D);
} else {
// I_t, F_t, O_t
act_gate(D3, xx_data + D, xx_data + D);
}
// F_t * C_t-1
blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2); blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2);
// I_t * ~C_t
// b = input * tilde
blas.VMUL(D, xx_data, xx_data + D, xx_data + D); blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
// C_t = F_t * C_t-1 + I_t * ~C_t
// cell out= a+b
blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data); blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
if (use_peepholes) {
// + W_oc * C_t for peephole connection
blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
// O_t
act_gate(D, xx_data + D3, xx_data + D3);
}
// hidden out= act_state(cellout) * outgate // hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2); act_cell(D, cell_out_data, xx_data + D2);
// H_t = O_t * act_state(C_t)
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev // prev
...@@ -344,14 +394,14 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -344,14 +394,14 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
prev_c_data = cell_out_data; prev_c_data = cell_out_data;
move_step(); move_step();
} } // for each step in batch
} } // for each batch
} }
void BatchCompute(const framework::ExecutionContext& ctx) const { void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext; using DeviceContext = platform::CPUDeviceContext;
INIT_BASE_INPUT_OUTPUT INIT_BASE_INPUT_OUTPUT
if (x->lod()[0].size() == 2) { if (x->lod()[0].size() == 2) { // batch size == 1
SeqCompute(ctx); SeqCompute(ctx);
return; return;
} }
...@@ -367,6 +417,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -367,6 +417,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>(); const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>(); const T* wh_data = wh->data<T>();
const T* bias_data = bias->data<T>();
const T* wc_data = bias_data + D4; // w_ic, w_fc, w_oc
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
T* xx_data = xx->mutable_data<T>(place); T* xx_data = xx->mutable_data<T>(place);
T* batched_input_data = batched_input->mutable_data<T>(place); T* batched_input_data = batched_input->mutable_data<T>(place);
...@@ -375,6 +427,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -375,6 +427,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
hidden_out->mutable_data<T>(place); hidden_out->mutable_data<T>(place);
cell_out->mutable_data<T>(place); cell_out->mutable_data<T>(place);
// use local variable
framework::DDim check_dims({3, D});
Tensor checked_cell; // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
auto checked_cell_data =
checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
...@@ -396,17 +454,27 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -396,17 +454,27 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
reordered_h0->Resize({max_bs, D}); reordered_h0->Resize({max_bs, D});
reordered_c0->Resize({max_bs, D}); reordered_c0->Resize({max_bs, D});
T* prev_batch_h_data = nullptr;
T* prev_batch_c_data = nullptr;
T* cur_batch_in_data = batched_input_data;
T* cur_batch_h_out_data = batched_h_out_data;
T* cur_batch_c_out_data = batched_c_out_data;
auto move_step = [&](int bs) {
cur_batch_in_data += bs * D4;
cur_batch_c_out_data += bs * D;
cur_batch_h_out_data += bs * D;
};
int tstart = 0; int tstart = 0;
T* prev_h_data = nullptr;
T* prev_c_data = nullptr;
if (h0) { if (h0) {
// reorder h0, c0 // reorder h0, c0
T* reordered_h0_data = reordered_h0->mutable_data<T>(place); T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
T* reordered_c0_data = reordered_c0->mutable_data<T>(place); T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
const T* h0_data = h0->data<T>(); const T* h0_data = h0->data<T>();
const T* c0_data = c0->data<T>(); const T* c0_data = c0->data<T>();
prev_h_data = reordered_h0_data; prev_batch_h_data = reordered_h0_data;
prev_c_data = reordered_c0_data; prev_batch_c_data = reordered_c0_data;
size_t sz = sizeof(T) * D; size_t sz = sizeof(T) * D;
for (int i = 0; i < max_bs; ++i) { for (int i = 0; i < max_bs; ++i) {
std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
...@@ -415,71 +483,122 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -415,71 +483,122 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
reordered_c0_data += D; reordered_c0_data += D;
} }
} else { } else {
// compute without h0, c0 // Compute with no H0/C0
T* cur_in_data = batched_input_data; T* cur_in_data = cur_batch_in_data;
T* cur_h_out_data = batched_h_out_data; T* cur_c_out_data = cur_batch_c_out_data;
T* cur_c_out_data = batched_c_out_data; T* cur_h_out_data = cur_batch_h_out_data;
// W_ch, W_ih, W_fh, W_oh
for (int i = 0; i < max_bs; ++i) { // If step == 0 and there is no initialized hidden state, that is to say
act_gate(D3, cur_in_data + D, cur_in_data + D); // the H0 is zeros. Then W_h * H_t-1 can be skiped
for (int i = 0; i < max_bs; ++i) { // iterate each data in 1st batch
// ~C_t
act_cand(D, cur_in_data, cur_in_data); act_cand(D, cur_in_data, cur_in_data);
// cell out= input*tilde
if (use_peepholes) {
// I_t, F_t
act_gate(D2, cur_in_data + D, cur_in_data + D);
} else {
// I_t, F_t, O_t
act_gate(D3, cur_in_data + D, cur_in_data + D);
}
// C_t = I_t * ~C_t
blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data); blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data);
if (use_peepholes) {
// + W_oc * C_t for peephole connection
blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
cur_in_data + D3);
// O_t
act_gate(D, cur_in_data + D3, cur_in_data + D3);
}
// hidden out= act_state(cellout) * outgate // hidden out= act_state(cellout) * outgate
act_cell(D, cur_c_out_data, cur_in_data + D2); act_cell(D, cur_c_out_data, cur_in_data + D2);
// H_t = O_t * act_state(C_t)
blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data); blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
// add offset // move to next data in the same batch
cur_in_data += D4; cur_in_data += D4;
cur_c_out_data += D; cur_c_out_data += D;
cur_h_out_data += D; cur_h_out_data += D;
} }
// move to data for next timestep
prev_batch_h_data = cur_batch_h_out_data;
prev_batch_c_data = cur_batch_c_out_data;
move_step(max_bs);
tstart = 1; tstart = 1;
prev_h_data = batched_h_out_data;
prev_c_data = batched_c_out_data;
} }
// Then start from next
const auto& batch_starts = batched_lod[0]; const auto& batch_starts = batched_lod[0];
const int max_seq_len = batch_starts.size() - 1; const int max_seq_len = batch_starts.size() - 1;
const int offset = tstart * max_bs * D;
batched_input_data = batched_input_data + offset * 4;
batched_h_out_data = batched_h_out_data + offset;
batched_c_out_data = batched_c_out_data + offset;
for (int step = tstart; step < max_seq_len; ++step) { for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step]; const int cur_bs = batch_starts[step + 1] - batch_starts[step];
// + W_h * H_t-1
blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast<T>(1), blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast<T>(1),
prev_h_data, D, wh_data, D4, static_cast<T>(1), prev_batch_h_data, D, wh_data, D4, static_cast<T>(1),
batched_input_data, D4); cur_batch_in_data, D4);
T* cur_in_data = batched_input_data; T* cur_in_data = cur_batch_in_data;
T* cur_prev_c_data = prev_c_data; T* cur_c_out_data = cur_batch_c_out_data;
T* cur_c_out_data = batched_c_out_data; T* cur_h_out_data = cur_batch_h_out_data;
T* cur_h_out_data = batched_h_out_data; T* prev_c_data = prev_batch_c_data; // NULL if no C0 in step0
for (int i = 0; i < cur_bs; ++i) { T* prev_h_data = prev_batch_h_data; // NULL if no H0 in step0
// W_ch, W_ih, W_fh, W_oh auto next_data_in_batch = [&]() {
act_gate(D3, cur_in_data + D, cur_in_data + D); cur_in_data += D4;
cur_c_out_data += D;
cur_h_out_data += D;
prev_c_data = prev_c_data ? prev_c_data + D : nullptr;
prev_h_data = prev_h_data ? prev_h_data + D : nullptr;
};
for (int i = 0; i < cur_bs; ++i) { // iterate each data in same batch
// ~C_t
act_cand(D, cur_in_data, cur_in_data); act_cand(D, cur_in_data, cur_in_data);
// a = forget * prev_cell
blas.VMUL(D, cur_in_data + D2, cur_prev_c_data, cur_in_data + D2); if (use_peepholes) {
// b = input * tilde // + W_ic|W_fc * C_t-1 for peephole connection
blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
blas.VADD(D2, cur_in_data + D, checked_cell_data, cur_in_data + D);
// I_t, F_t
act_gate(D2, cur_in_data + D, cur_in_data + D);
} else {
// I_t, F_t, O_t
act_gate(D3, cur_in_data + D, cur_in_data + D);
}
// F_t * C_t-1
blas.VMUL(D, cur_in_data + D2, prev_c_data, cur_in_data + D2);
// I_t * ~C_t
blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D); blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D);
// cell out= a+b // C_t = F_t * C_t-1 + I_t * ~C_t
blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data); blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data);
if (use_peepholes) {
// + W_oc * C_t for peephole connection
blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
cur_in_data + D3);
// O_t
act_gate(D, cur_in_data + D3, cur_in_data + D3);
}
// hidden out= act_state(cellout) * outgate // hidden out= act_state(cellout) * outgate
act_cell(D, cur_c_out_data, cur_in_data + D2); act_cell(D, cur_c_out_data, cur_in_data + D2);
// H_t = O_t * act_state(C_t)
blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data); blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
cur_in_data += D4; // move to next data in same batch
cur_prev_c_data += D; next_data_in_batch();
cur_c_out_data += D;
cur_h_out_data += D;
} }
// move to data for next timestep
prev_c_data = batched_c_out_data; prev_batch_h_data = cur_batch_h_out_data;
prev_h_data = batched_h_out_data; prev_batch_c_data = cur_batch_c_out_data;
batched_c_out_data = cur_c_out_data; move_step(cur_bs);
batched_h_out_data = cur_h_out_data;
batched_input_data = cur_in_data;
} }
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq; math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
......
...@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> { ...@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> {
gate_data, frame_size * 3); gate_data, frame_size * 3);
// calculate activited gate // calculate activited gate
Eigen::array<int, 2> extents = {batch_size, frame_size}; Eigen::array<int, 2> extents{{batch_size, frame_size}};
Eigen::array<int, 2> u_offsets = {0, 0}; Eigen::array<int, 2> u_offsets{{0, 0}};
ActCompute(context.Attr<int>("gate_activation"), place, ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(u_offsets, extents), g.slice(u_offsets, extents)); g.slice(u_offsets, extents), g.slice(u_offsets, extents));
auto u = g.slice(u_offsets, extents); // update gate auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets = {0, frame_size}; Eigen::array<int, 2> r_offsets{{0, frame_size}};
ActCompute(context.Attr<int>("gate_activation"), place, ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(r_offsets, extents), g.slice(r_offsets, extents)); g.slice(r_offsets, extents), g.slice(r_offsets, extents));
auto r = g.slice(r_offsets, extents); // reset gate auto r = g.slice(r_offsets, extents); // reset gate
...@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> { ...@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
weight_data + frame_size * frame_size * 2, frame_size, 1, weight_data + frame_size * frame_size * 2, frame_size, 1,
gate_data + frame_size * 2, frame_size * 3); gate_data + frame_size * 2, frame_size * 3);
Eigen::array<int, 2> c_offsets = {0, frame_size * 2}; Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
ActCompute(context.Attr<int>("activation"), place, ActCompute(context.Attr<int>("activation"), place,
g.slice(c_offsets, extents), g.slice(c_offsets, extents)); g.slice(c_offsets, extents), g.slice(c_offsets, extents));
auto c = g.slice(c_offsets, extents); // output candidate auto c = g.slice(c_offsets, extents); // output candidate
...@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
int batch_size = input->dims()[0]; int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1]; int frame_size = hidden_prev->dims()[1];
Eigen::array<int, 2> extents = {batch_size, frame_size}; Eigen::array<int, 2> extents{{batch_size, frame_size}};
Eigen::array<int, 2> u_offsets = {0, 0}; Eigen::array<int, 2> u_offsets{{0, 0}};
auto u = g.slice(u_offsets, extents); // update gate auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets = {0, frame_size}; Eigen::array<int, 2> r_offsets{{0, frame_size}};
auto r = g.slice(r_offsets, extents); // reset gate auto r = g.slice(r_offsets, extents); // reset gate
Eigen::array<int, 2> c_offsets = {0, frame_size * 2}; Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
auto c = g.slice(c_offsets, extents); // output candidate auto c = g.slice(c_offsets, extents); // output candidate
// backward for unactivated update gate // backward for unactivated update gate
......
...@@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
memset(output + i * row_width, 0, row_width * sizeof(T)); memset(output + i * row_width, 0, row_width * sizeof(T));
} else { } else {
PADDLE_ENFORCE_LT(ids[i], row_number); PADDLE_ENFORCE_LT(ids[i], row_number);
PADDLE_ENFORCE_GE(ids[i], 0); PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
memcpy(output + i * row_width, table + ids[i] * row_width, memcpy(output + i * row_width, table + ids[i] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
......
...@@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel { ...@@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(param_out) of RmspropOp should not be null."); "Output(param_out) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
"Output(Momentum_out) of RmspropOp should not be null."); "Output(MomentOut) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
"Output(MeanSquareOut) of RmspropOp should not be null."); "Output(MeanSquareOut) of RmspropOp should not be null.");
if (ctx->Attrs().Get<bool>("centered")) {
PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
"Output(MeanGradOut) of RmspropOp should not be null.");
}
auto param_dim = ctx->GetInputDim("Param"); auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel { ...@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("MomentOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim);
ctx->SetOutputDim("MeanSquareOut", param_dim); ctx->SetOutputDim("MeanSquareOut", param_dim);
if (ctx->Attrs().Get<bool>("centered")) {
ctx->SetOutputDim("MeanGradOut", param_dim);
}
} }
}; };
...@@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("MeanSquare", AddInput("MeanSquare",
"(Tensor, default Tensor<float>)" "(Tensor, default Tensor<float>)"
" The mean square value that gets updated."); " The mean square value that gets updated.");
AddInput("MeanGrad",
"(Tensor, default Tensor<float>)"
" The moving average of gradient")
.AsDispensable();
AddInput("LearningRate", AddInput("LearningRate",
"(Tensor, default Tensor<float>) " "(Tensor, default Tensor<float>) "
"The learning rate should be a tensor of size 1."); "The learning rate should be a tensor of size 1.");
...@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("ParamOut", "(Tensor) Output updated parameter value."); AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
AddOutput("MomentOut", "(Tensor) Output updated moment."); AddOutput("MomentOut", "(Tensor) Output updated moment.");
AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
AddOutput("MeanGradOut",
"(Tensor) Output moving average of gradient updated value.");
AddAttr<float>("epsilon", AddAttr<float>("epsilon",
"(float, default 1e-10) Constant " "(float, default 1e-10) Constant "
...@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0.9f); .SetDefault(0.9f);
AddAttr<float>("momentum", "(float, default 0.0) Constant value.") AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Rmsprop Optimizer. Rmsprop Optimizer.
...@@ -103,6 +118,14 @@ MomentOut = momentum * Moment + ...@@ -103,6 +118,14 @@ MomentOut = momentum * Moment +
ParamOut = Param - MomentOut ParamOut = Param - MomentOut
$$ $$
if centered is true:
mean_grad = decay * mean_grad{t-1} + (1-decay) * gradient
mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
mom = momentum * mom{t-1} + learning_rate * gradient /
sqrt(mean_square - mean_grad**2 + epsilon)
param -= mom
The original slides that proposed Rmsprop: Slide 29 of The original slides that proposed Rmsprop: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
......
...@@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel<T> {
float epsilon = ctx.Attr<float>("epsilon"); float epsilon = ctx.Attr<float>("epsilon");
float rho = ctx.Attr<float>("decay"); float rho = ctx.Attr<float>("decay");
float momentum = ctx.Attr<float>("momentum"); float momentum = ctx.Attr<float>("momentum");
bool centered = ctx.Attr<bool>("centered");
auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param")); auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare")); auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
...@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto ms_out = EigenVector<T>::Flatten(*mean_square_out); auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 1> grad_dsize(grad->numel()); Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
ms_out.device(place) = rho * ms + (1 - rho) * g * g; ms_out.device(place) = rho * ms + (1 - rho) * g * g;
mom_out.device(place) = if (centered) {
momentum * mom + auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad"));
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut");
mean_grad_out->mutable_data<T>(ctx.GetPlace());
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) = momentum * mom +
lr.broadcast(grad_dsize) * g /
(ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom +
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
}
p_out.device(place) = p - mom_out; p_out.device(place) = p - mom_out;
} }
}; };
......
...@@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) { ...@@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) {
template <typename T> template <typename T>
__global__ void GPUROIPoolForward( __global__ void GPUROIPoolForward(
const int nthreads, const T* input_data, const int64_t* input_rois, const int nthreads, const T* input_data, const T* input_rois,
const float spatial_scale, const int channels, const int height, const float spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width, const int width, const int pooled_height, const int pooled_width,
int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
...@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward( ...@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward(
int c = (i / pooled_width / pooled_height) % channels; int c = (i / pooled_width / pooled_height) % channels;
int n = i / pooled_width / pooled_height / channels; int n = i / pooled_width / pooled_height / channels;
const int64_t* offset_input_rois = input_rois + n * kROISize; const T* offset_input_rois = input_rois + n * kROISize;
int roi_batch_ind = roi_batch_id_data[n]; int roi_batch_ind = roi_batch_id_data[n];
int roi_start_w = round(offset_input_rois[0] * spatial_scale); int roi_start_w = round(offset_input_rois[0] * spatial_scale);
int roi_start_h = round(offset_input_rois[1] * spatial_scale); int roi_start_h = round(offset_input_rois[1] * spatial_scale);
...@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward( ...@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward(
template <typename T> template <typename T>
__global__ void GPUROIPoolBackward( __global__ void GPUROIPoolBackward(
const int nthreads, const int64_t* input_rois, const T* output_grad, const int nthreads, const T* input_rois, const T* output_grad,
const int64_t* argmax_data, const int num_rois, const float spatial_scale, const int64_t* argmax_data, const int num_rois, const float spatial_scale,
const int channels, const int height, const int width, const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, int* roi_batch_id_data, const int pooled_height, const int pooled_width, int* roi_batch_id_data,
...@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
GPUROIPoolForward< GPUROIPoolForward<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>( T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale, output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
channels, height, width, pooled_height, pooled_width, height, width, pooled_height, pooled_width,
roi_batch_id_list_gpu.data<int>(), out->mutable_data<T>(ctx.GetPlace()), roi_batch_id_list_gpu.data<int>(), out->mutable_data<T>(ctx.GetPlace()),
argmax->mutable_data<int64_t>(ctx.GetPlace())); argmax->mutable_data<int64_t>(ctx.GetPlace()));
} }
...@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> { ...@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
if (output_grad_size > 0) { if (output_grad_size > 0) {
GPUROIPoolBackward< GPUROIPoolBackward<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>( T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_grad_size, rois->data<int64_t>(), out_grad->data<T>(), output_grad_size, rois->data<T>(), out_grad->data<T>(),
argmax->data<int64_t>(), rois_num, spatial_scale, channels, height, argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
width, pooled_height, pooled_width, width, pooled_height, pooled_width,
roi_batch_id_list_gpu.data<int>(), roi_batch_id_list_gpu.data<int>(),
......
...@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
T* output_data = out->mutable_data<T>(ctx.GetPlace()); T* output_data = out->mutable_data<T>(ctx.GetPlace());
int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace()); int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
const int64_t* rois_data = rois->data<int64_t>(); const T* rois_data = rois->data<T>();
for (int n = 0; n < rois_num; ++n) { for (int n = 0; n < rois_num; ++n) {
int roi_batch_id = roi_batch_id_data[n]; int roi_batch_id = roi_batch_id_data[n];
int roi_start_w = round(rois_data[0] * spatial_scale); int roi_start_w = round(rois_data[0] * spatial_scale);
...@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> { ...@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
} }
} }
const int64_t* rois_data = rois->data<int64_t>(); const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>(); const T* out_grad_data = out_grad->data<T>();
const int64_t* argmax_data = argmax->data<int64_t>(); const int64_t* argmax_data = argmax->data<int64_t>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace()); T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
......
...@@ -145,26 +145,23 @@ def rpn_target_assign(loc, ...@@ -145,26 +145,23 @@ def rpn_target_assign(loc,
""" """
helper = LayerHelper('rpn_target_assign', **locals()) helper = LayerHelper('rpn_target_assign', **locals())
# 1. Compute the regression target bboxes # Compute overlaps between the prior boxes and the gt boxes overlaps
target_bbox = box_coder(
prior_box=anchor_box,
prior_box_var=anchor_var,
target_box=gt_box,
code_type='encode_center_size',
box_normalized=False)
# 2. Compute overlaps between the prior boxes and the gt boxes overlaps
iou = iou_similarity(x=gt_box, y=anchor_box) iou = iou_similarity(x=gt_box, y=anchor_box)
# 3. Assign target label to anchors # Assign target label to anchors
loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype) loc_index = helper.create_tmp_variable(dtype='int32')
score_index = helper.create_tmp_variable(dtype=anchor_box.dtype) score_index = helper.create_tmp_variable(dtype='int32')
target_label = helper.create_tmp_variable(dtype=anchor_box.dtype) target_label = helper.create_tmp_variable(dtype='int64')
target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
helper.append_op( helper.append_op(
type="rpn_target_assign", type="rpn_target_assign",
inputs={'DistMat': iou}, inputs={'Anchor': anchor_box,
'GtBox': gt_box,
'DistMat': iou},
outputs={ outputs={
'LocationIndex': loc_index, 'LocationIndex': loc_index,
'ScoreIndex': score_index, 'ScoreIndex': score_index,
'TargetLabel': target_label 'TargetLabel': target_label,
'TargetBBox': target_bbox,
}, },
attrs={ attrs={
'rpn_batch_size_per_im': rpn_batch_size_per_im, 'rpn_batch_size_per_im': rpn_batch_size_per_im,
...@@ -173,16 +170,16 @@ def rpn_target_assign(loc, ...@@ -173,16 +170,16 @@ def rpn_target_assign(loc,
'fg_fraction': fg_fraction 'fg_fraction': fg_fraction
}) })
# 4. Reshape and gather the target entry loc_index.stop_gradient = True
scores = nn.reshape(x=scores, shape=(-1, 2)) score_index.stop_gradient = True
loc = nn.reshape(x=loc, shape=(-1, 4)) target_label.stop_gradient = True
target_label = nn.reshape(x=target_label, shape=(-1, 1)) target_bbox.stop_gradient = True
target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
scores = nn.reshape(x=scores, shape=(-1, 1))
loc = nn.reshape(x=loc, shape=(-1, 4))
predicted_scores = nn.gather(scores, score_index) predicted_scores = nn.gather(scores, score_index)
predicted_location = nn.gather(loc, loc_index) predicted_location = nn.gather(loc, loc_index)
target_label = nn.gather(target_label, score_index)
target_bbox = nn.gather(target_bbox, loc_index)
return predicted_scores, predicted_location, target_label, target_bbox return predicted_scores, predicted_location, target_label, target_bbox
......
...@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
return acc_out return acc_out
def auc(input, label, curve='ROC', num_thresholds=200, topk=1): def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
""" """
**Area Under the Curve (AUC) Layer** **Area Under the Curve (AUC) Layer**
...@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): ...@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
""" """
helper = LayerHelper("auc", **locals()) helper = LayerHelper("auc", **locals())
auc_out = helper.create_tmp_variable(dtype="float64") auc_out = helper.create_tmp_variable(dtype="float64")
batch_auc_out = helper.create_tmp_variable(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches. # make tp, tn, fp, fn persistable, so that can accumulate all batches.
tp = helper.create_global_variable( stat_pos = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) persistable=True, dtype='int64', shape=[num_thresholds + 1])
tn = helper.create_global_variable( stat_neg = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) persistable=True, dtype='int64', shape=[num_thresholds + 1])
fp = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) for var in [stat_pos, stat_neg]:
fn = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds])
for var in [tp, tn, fp, fn]:
helper.set_variable_initializer( helper.set_variable_initializer(
var, Constant( var, Constant(
value=0.0, force_cpu=True)) value=0.0, force_cpu=True))
...@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): ...@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
inputs={ inputs={
"Predict": [input], "Predict": [input],
"Label": [label], "Label": [label],
"TP": [tp], "StatPos": [stat_pos],
"TN": [tn], "StatNeg": [stat_neg]
"FP": [fp],
"FN": [fn]
}, },
attrs={"curve": curve, attrs={"curve": curve,
"num_thresholds": num_thresholds}, "num_thresholds": num_thresholds},
outputs={ outputs={
"AUC": [auc_out], "AUC": [auc_out],
"TPOut": [tp], "BatchAUC": [batch_auc_out],
"TNOut": [tn], "StatPosOut": [stat_pos],
"FPOut": [fp], "StatNegOut": [stat_neg]
"FNOut": [fn]
}) })
return auc_out, [tp, tn, fp, fn] return auc_out, batch_auc_out, [stat_pos, stat_neg]
...@@ -3546,11 +3546,6 @@ def topk(input, k, name=None): ...@@ -3546,11 +3546,6 @@ def topk(input, k, name=None):
top5_values, top5_indices = layers.topk(input, k=5) top5_values, top5_indices = layers.topk(input, k=5)
""" """
shape = input.shape
if k < 1 or k >= shape[-1]:
raise ValueError("k must be greater than 0 and less than %d." %
(shape[-1]))
helper = LayerHelper("top_k", **locals()) helper = LayerHelper("top_k", **locals())
values = helper.create_tmp_variable(dtype=input.dtype) values = helper.create_tmp_variable(dtype=input.dtype)
indices = helper.create_tmp_variable(dtype="int64") indices = helper.create_tmp_variable(dtype="int64")
......
...@@ -558,8 +558,6 @@ class Auc(MetricBase): ...@@ -558,8 +558,6 @@ class Auc(MetricBase):
name: metric name name: metric name
curve: Specifies the name of the curve to be computed, 'ROC' [default] or curve: Specifies the name of the curve to be computed, 'ROC' [default] or
'PR' for the Precision-Recall-curve. 'PR' for the Precision-Recall-curve.
num_thresholds: The number of thresholds to use when discretizing the roc
curve.
"NOTE: only implement the ROC curve type via Python now." "NOTE: only implement the ROC curve type via Python now."
...@@ -574,15 +572,14 @@ class Auc(MetricBase): ...@@ -574,15 +572,14 @@ class Auc(MetricBase):
numpy_auc = metric.eval() numpy_auc = metric.eval()
""" """
def __init__(self, name, curve='ROC', num_thresholds=200): def __init__(self, name, curve='ROC', num_thresholds=4095):
super(Auc, self).__init__(name=name) super(Auc, self).__init__(name=name)
self._curve = curve self._curve = curve
self._num_thresholds = num_thresholds self._num_thresholds = num_thresholds
self._epsilon = 1e-6
self.tp_list = np.zeros((num_thresholds, )) _num_pred_buckets = num_thresholds + 1
self.fn_list = np.zeros((num_thresholds, )) self._stat_pos = [0] * _num_pred_buckets
self.tn_list = np.zeros((num_thresholds, )) self._stat_neg = [0] * _num_pred_buckets
self.fp_list = np.zeros((num_thresholds, ))
def update(self, preds, labels): def update(self, preds, labels):
if not _is_numpy_(labels): if not _is_numpy_(labels):
...@@ -590,41 +587,32 @@ class Auc(MetricBase): ...@@ -590,41 +587,32 @@ class Auc(MetricBase):
if not _is_numpy_(preds): if not _is_numpy_(preds):
raise ValueError("The 'predictions' must be a numpy ndarray.") raise ValueError("The 'predictions' must be a numpy ndarray.")
kepsilon = 1e-7 # to account for floating point imprecisions for i, lbl in enumerate(labels):
thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1) value = preds[i, 1]
for i in range(self._num_thresholds - 2)] bin_idx = int(value * self._num_thresholds)
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] assert bin_idx <= self._num_thresholds
if lbl:
# calculate TP, FN, TN, FP count self._stat_pos[bin_idx] += 1.0
for idx_thresh, thresh in enumerate(thresholds): else:
tp, fn, tn, fp = 0, 0, 0, 0 self._stat_neg[bin_idx] += 1.0
for i, lbl in enumerate(labels):
if lbl: @staticmethod
if preds[i, 1] >= thresh: def trapezoid_area(x1, x2, y1, y2):
tp += 1 return abs(x1 - x2) * (y1 + y2) / 2.0
else:
fn += 1
else:
if preds[i, 1] >= thresh:
fp += 1
else:
tn += 1
self.tp_list[idx_thresh] += tp
self.fn_list[idx_thresh] += fn
self.tn_list[idx_thresh] += tn
self.fp_list[idx_thresh] += fp
def eval(self): def eval(self):
epsilon = self._epsilon tot_pos = 0.0
num_thresholds = self._num_thresholds tot_neg = 0.0
tpr = (self.tp_list.astype("float32") + epsilon) / ( auc = 0.0
self.tp_list + self.fn_list + epsilon)
fpr = self.fp_list.astype("float32") / ( idx = self._num_thresholds
self.fp_list + self.tn_list + epsilon) while idx >= 0:
rec = (self.tp_list.astype("float32") + epsilon) / ( tot_pos_prev = tot_pos
self.tp_list + self.fp_list + epsilon) tot_neg_prev = tot_neg
tot_pos += self._stat_pos[idx]
x = fpr[:num_thresholds - 1] - fpr[1:] tot_neg += self._stat_neg[idx]
y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos,
auc_value = np.sum(x * y) tot_pos_prev)
return auc_value idx -= 1
return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
...@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer): ...@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer):
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
\\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t)
if centered is True:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
\\epsilon}} \\nabla Q_{i}(w) \\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t) w & = w - v(w, t)
...@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer): ...@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer):
avoid division by zero, set 1e-6 by default. avoid division by zero, set 1e-6 by default.
momentum(float): :math:`\\beta` in equation is the momentum term, momentum(float): :math:`\\beta` in equation is the momentum term,
set 0.0 by default. set 0.0 by default.
centered(bool): If True, gradients are normalized by the estimated variance of
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
Raises: Raises:
ValueError: If learning_rate, rho, epsilon, momentum are None. ValueError: If learning_rate, rho, epsilon, momentum are None.
...@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer): ...@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer):
_momentum_acc_str = "momentum" _momentum_acc_str = "momentum"
_mean_square_acc_str = "mean_square" _mean_square_acc_str = "mean_square"
_mean_grad_acc_str = "mean_grad"
def __init__(self, def __init__(self,
learning_rate, learning_rate,
rho=0.95, rho=0.95,
epsilon=1.0e-6, epsilon=1.0e-6,
momentum=0.0, momentum=0.0,
centered=False,
**kwargs): **kwargs):
super(RMSPropOptimizer, self).__init__( super(RMSPropOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate, **kwargs)
...@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer): ...@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer):
self._rho = rho self._rho = rho
self._epsilon = epsilon self._epsilon = epsilon
self._momentum = momentum self._momentum = momentum
self._centered = centered
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
...@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer): ...@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer):
for p in parameters: for p in parameters:
self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p)
self._add_accumulator(self._mean_grad_acc_str, p)
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
...@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer): ...@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer):
param_and_grad[0]) param_and_grad[0])
mean_square_acc = self._get_accumulator(self._mean_square_acc_str, mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
param_and_grad[0]) param_and_grad[0])
mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
param_and_grad[0])
rmsprop_op = block.append_op( rmsprop_op = block.append_op(
type=self.type, type=self.type,
inputs={ inputs={
...@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer): ...@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer):
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"Moment": momentum_acc, "Moment": momentum_acc,
"MeanSquare": mean_square_acc, "MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad), "LearningRate": self._create_param_lr(param_and_grad),
}, },
outputs={ outputs={
"ParamOut": param_and_grad[0], "ParamOut": param_and_grad[0],
"MomentOut": momentum_acc, "MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc "MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc
}, },
attrs={ attrs={
"epsilon": self._epsilon, "epsilon": self._epsilon,
"decay": self._rho, "decay": self._rho,
"momentum": self._momentum "momentum": self._momentum,
"centered": self._centered
}) })
return rmsprop_op return rmsprop_op
......
...@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase): ...@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase):
gt_box = layers.data( gt_box = layers.data(
name='gt_box', shape=[4], lod_level=1, dtype='float32') name='gt_box', shape=[4], lod_level=1, dtype='float32')
predicted_scores, predicted_location, target_label, target_bbox = layers.rpn_target_assign( pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
loc=loc, loc=loc,
scores=scores, scores=scores,
anchor_box=anchor_box, anchor_box=anchor_box,
...@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase): ...@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase):
rpn_positive_overlap=0.7, rpn_positive_overlap=0.7,
rpn_negative_overlap=0.3) rpn_negative_overlap=0.3)
self.assertIsNotNone(predicted_scores) self.assertIsNotNone(pred_scores)
self.assertIsNotNone(predicted_location) self.assertIsNotNone(pred_loc)
self.assertIsNotNone(target_label) self.assertIsNotNone(tgt_lbl)
self.assertIsNotNone(target_bbox) self.assertIsNotNone(tgt_bbox)
assert predicted_scores.shape[1] == 2 assert pred_scores.shape[1] == 1
assert predicted_location.shape[1] == 4 assert pred_loc.shape[1] == 4
assert predicted_location.shape[1] == target_bbox.shape[1] assert pred_loc.shape[1] == tgt_bbox.shape[1]
print(str(program))
class TestGenerateProposals(unittest.TestCase): class TestGenerateProposals(unittest.TestCase):
......
...@@ -291,7 +291,7 @@ class OpTest(unittest.TestCase): ...@@ -291,7 +291,7 @@ class OpTest(unittest.TestCase):
return_numpy=False) return_numpy=False)
return outs, fetch_list return outs, fetch_list
def check_output_with_place(self, place, atol): def check_output_with_place(self, place, atol, equal_nan=False):
outs, fetch_list = self._calc_output(place) outs, fetch_list = self._calc_output(place)
for out_name, out_dup in Operator.get_op_outputs(self.op_type): for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs: if out_name not in self.outputs:
...@@ -321,7 +321,7 @@ class OpTest(unittest.TestCase): ...@@ -321,7 +321,7 @@ class OpTest(unittest.TestCase):
if isinstance(expect, tuple) else expect if isinstance(expect, tuple) else expect
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
actual_t, expect_t, atol=atol), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place)) str(place))
if isinstance(expect, tuple): if isinstance(expect, tuple):
...@@ -337,7 +337,7 @@ class OpTest(unittest.TestCase): ...@@ -337,7 +337,7 @@ class OpTest(unittest.TestCase):
expect_t = expect[0] if isinstance(expect, tuple) else expect expect_t = expect[0] if isinstance(expect, tuple) else expect
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
actual_t, expect_t, atol=atol), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t)) str(actual_t))
...@@ -360,10 +360,10 @@ class OpTest(unittest.TestCase): ...@@ -360,10 +360,10 @@ class OpTest(unittest.TestCase):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
return places return places
def check_output(self, atol=1e-5): def check_output(self, atol=1e-5, equal_nan=False):
places = self._get_places() places = self._get_places()
for place in places: for place in places:
self.check_output_with_place(place, atol) self.check_output_with_place(place, atol, equal_nan)
def check_output_customized(self, checker): def check_output_customized(self, checker):
places = self._get_places() places = self._get_places()
......
...@@ -26,18 +26,15 @@ class TestAucOp(OpTest): ...@@ -26,18 +26,15 @@ class TestAucOp(OpTest):
pred = np.random.random((128, 2)).astype("float32") pred = np.random.random((128, 2)).astype("float32")
labels = np.random.randint(0, 2, (128, 1)) labels = np.random.randint(0, 2, (128, 1))
num_thresholds = 200 num_thresholds = 200
tp = np.zeros((num_thresholds, )).astype("int64")
tn = np.zeros((num_thresholds, )).astype("int64") stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
fp = np.zeros((num_thresholds, )).astype("int64") stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")
fn = np.zeros((num_thresholds, )).astype("int64")
self.inputs = { self.inputs = {
'Predict': pred, 'Predict': pred,
'Label': labels, 'Label': labels,
'TP': tp, "StatPos": stat_pos,
'TN': tn, "StatNeg": stat_neg
'FP': fp,
'FN': fn
} }
self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
...@@ -47,11 +44,10 @@ class TestAucOp(OpTest): ...@@ -47,11 +44,10 @@ class TestAucOp(OpTest):
python_auc.update(pred, labels) python_auc.update(pred, labels)
self.outputs = { self.outputs = {
'AUC': python_auc.eval(), 'AUC': np.array(python_auc.eval()),
'TPOut': python_auc.tp_list, 'BatchAUC': np.array(python_auc.eval()),
'FNOut': python_auc.fn_list, 'StatPosOut': np.array(python_auc._stat_pos),
'TNOut': python_auc.tn_list, 'StatNegOut': np.array(python_auc._stat_neg)
'FPOut': python_auc.fp_list
} }
def test_check_output(self): def test_check_output(self):
......
...@@ -58,6 +58,7 @@ class TestFusionLSTMOp(OpTest): ...@@ -58,6 +58,7 @@ class TestFusionLSTMOp(OpTest):
self.act_cell = 'tanh' self.act_cell = 'tanh'
self.act_cand = 'tanh' self.act_cand = 'tanh'
self.use_peepholes = False self.use_peepholes = False
self.use_seq = False
self.set_conf() self.set_conf()
T = sum(self.lod[0]) T = sum(self.lod[0])
...@@ -107,6 +108,7 @@ class TestFusionLSTMOp(OpTest): ...@@ -107,6 +108,7 @@ class TestFusionLSTMOp(OpTest):
} }
self.attrs = { self.attrs = {
'use_peepholes': self.use_peepholes, 'use_peepholes': self.use_peepholes,
'use_seq': self.use_seq,
'is_reverse': self.is_reverse, 'is_reverse': self.is_reverse,
'gate_activation': self.act_gate, 'gate_activation': self.act_gate,
'cell_activation': self.act_cell, 'cell_activation': self.act_cell,
...@@ -159,5 +161,68 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp): ...@@ -159,5 +161,68 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp):
self.D = 16 self.D = 16
class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
    """Base fusion-LSTM test case with the ``use_peepholes`` flag turned on."""

    def set_conf(self):
        """Override the default configuration: enable peepholes only."""
        self.use_peepholes = True


class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
    """Fusion-LSTM test case: peepholes on, ``has_initial_state`` on."""

    def set_conf(self):
        """Override the default configuration for this variant."""
        # The two flags are independent attribute writes; order is irrelevant.
        self.has_initial_state = True
        self.use_peepholes = True


class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
    """Fusion-LSTM test case: peepholes on, ``is_reverse`` on."""

    def set_conf(self):
        """Override the default configuration for this variant."""
        self.is_reverse = True
        self.use_peepholes = True


class TestFusionLSTMOpPoopholesBS1(TestFusionLSTMOp):
    """Fusion-LSTM test case: peepholes on, a single sequence of length 3.

    NOTE(review): "Poopholes" in the class name is presumably a typo for
    "Peepholes"; the name is kept as-is so test discovery is unchanged.
    """

    def set_conf(self):
        """Override the default configuration for this variant."""
        self.use_peepholes = True
        # One sequence with three steps.
        self.lod = [[3]]
        # NOTE(review): D looks like the hidden size - confirm against setUp.
        self.D = 16


class TestFusionLSTMOpSeqInit(TestFusionLSTMOp):
    """Fusion-LSTM test case: ``use_seq`` on, ``has_initial_state`` on."""

    def set_conf(self):
        """Override the default configuration for this variant."""
        self.has_initial_state = True
        self.use_seq = True


class TestFusionLSTMOpSeqReverse(TestFusionLSTMOp):
    """Fusion-LSTM test case: ``use_seq`` on, ``is_reverse`` on."""

    def set_conf(self):
        """Override the default configuration for this variant."""
        self.is_reverse = True
        self.use_seq = True


class TestFusionLSTMOpSeqInitReverse(TestFusionLSTMOp):
    """Fusion-LSTM test case: ``use_seq``, initial state and reverse all on."""

    def set_conf(self):
        """Override the default configuration for this variant."""
        self.is_reverse = True
        self.has_initial_state = True
        self.use_seq = True


class TestFusionLSTMOpSeqPeepholes(TestFusionLSTMOp):
    """Fusion-LSTM test case: ``use_seq`` on, peepholes on."""

    def set_conf(self):
        """Override the default configuration for this variant."""
        self.use_peepholes = True
        self.use_seq = True


class TestFusionLSTMOpSeqPeepholesInit(TestFusionLSTMOp):
    """Fusion-LSTM test case: ``use_seq``, peepholes and initial state on."""

    def set_conf(self):
        """Override the default configuration for this variant."""
        self.has_initial_state = True
        self.use_peepholes = True
        self.use_seq = True


class TestFusionLSTMOpSeqPeepholesReverse(TestFusionLSTMOp):
    """Fusion-LSTM test case: ``use_seq``, peepholes and reverse all on."""

    def set_conf(self):
        """Override the default configuration for this variant."""
        self.is_reverse = True
        self.use_peepholes = True
        self.use_seq = True


if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights): ...@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights):
dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0] dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1] dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
dw = (np.log(gt_w / ex_w)) / ex_w / weights[2] dw = (np.log(gt_w / ex_w)) / weights[2]
dh = (np.log(gt_h / ex_h)) / ex_h / weights[3] dh = (np.log(gt_h / ex_h)) / weights[3]
targets = np.vstack([dx, dy, dw, dh]).transpose() targets = np.vstack([dx, dy, dw, dh]).transpose()
return targets return targets
......
...@@ -15,90 +15,164 @@ ...@@ -15,90 +15,164 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestRmspropOp1(OpTest):
''' Test RMSProp with explicit inputs class TestBase(unittest.TestCase):
''' def setup(self, centered, epsilon=1e-6):
np.random.seed(5) # fix seed
def setUp(self):
self.op_type = "rmsprop" self.param_name = "param"
self.param = np.random.random((123, 321)).astype("float32")
param = np.random.random((123, 321)).astype("float32")
mean_square = np.random.random((123, 321)).astype("float32") self.mean_square_name = "mean_square"
learning_rate = np.array([0.01]).astype("float32") self.mean_square = np.random.random((123, 321)).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32") self.mean_grad_name = "mean_grad"
self.mean_grad = np.random.random((123, 321)).astype("float32")
epsilon = 1e-6
decay = 0.9 self.lr_name = "lr"
momentum = 0.0 self.learning_rate = np.array([0.01]).astype("float32")
self.inputs = { self.grad_name = "grad"
'Param': param, self.grad = np.random.random((123, 321)).astype("float32")
'MeanSquare': mean_square,
'LearningRate': learning_rate, self.moment_name = "moment"
'Grad': grad, self.moment = np.zeros((123, 321)).astype("float32")
'Moment': moment,
} self.epsilon = epsilon
self.decay = 0.9
self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum} self.momentum = 0.0
self.centered = centered
ms_out = decay * mean_square + (1 - decay) * grad * grad
moment_out = momentum * moment + \ self.ms_out = self.decay * self.mean_square + (1 - self.decay
learning_rate * grad / np.sqrt(ms_out + epsilon) ) * self.grad * self.grad
param_out = param - moment_out if centered:
self.mg_out = self.decay * self.mean_grad + (1 - self.decay
self.outputs = { ) * self.grad
'ParamOut': param_out, self.moment_out = self.momentum * self.moment + \
'MomentOut': moment_out, self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon)
'MeanSquareOut': ms_out else:
} self.moment_out = self.momentum * self.moment + \
self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon)
def test_check_output(self):
self.check_output() self.param_out = self.param - self.moment_out
def check(self,
class TestRmspropOp2(OpTest): actual_t,
'''Test RMSProp with default values for attributes expect_t,
''' place,
out_name,
def setUp(self): atol=1e-5,
self.op_type = "rmsprop" equal_nan=False):
self.assertTrue(
param = np.random.random((123, 321)).astype("float32") np.allclose(
mean_square = np.random.random((123, 321)).astype("float32") actual_t, expect_t, atol=atol, equal_nan=equal_nan),
learning_rate = np.array([0.01]).astype("float32") "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
grad = np.random.random((123, 321)).astype("float32") + str(expect_t) + "\n" + "But Got" + str(actual_t))
moment = np.zeros((123, 321)).astype("float32")
epsilon = 1.0e-10 class TestRmspropOp(TestBase):
decay = 0.9 def check_with_place(self, place, centered, epsilon):
momentum = 0.0 self.setup(centered, epsilon)
scope = core.Scope()
self.inputs = {
'Param': param, # create and initialize Param Variable
'MeanSquare': mean_square, param = scope.var(self.param_name).get_tensor()
'LearningRate': learning_rate, param.set(self.param, place)
'Grad': grad,
'Moment': moment, mean_square = scope.var(self.mean_square_name).get_tensor()
} mean_square.set(self.mean_square, place)
ms_out = decay * mean_square + (1 - decay) * grad * grad lr = scope.var(self.lr_name).get_tensor()
moment_out = momentum * moment + \ lr.set(self.learning_rate, place)
learning_rate * grad / np.sqrt(ms_out + epsilon)
param_out = param - moment_out grad = scope.var(self.grad_name).get_tensor()
grad.set(self.grad, place)
self.outputs = {
'ParamOut': param_out, moment = scope.var(self.moment_name).get_tensor()
'MomentOut': moment_out, moment.set(self.moment, place)
'MeanSquareOut': ms_out
} # create and run rmsprop operator
def test_check_output(self): if self.centered:
self.check_output() mean_grad = scope.var(self.mean_grad_name).get_tensor()
mean_grad.set(self.mean_grad, place)
rmsprop_op = Operator(
"rmsprop",
Param=self.param_name,
Grad=self.grad_name,
MeanSquare=self.mean_square_name,
MeanGrad=self.mean_grad_name,
Moment=self.moment_name,
LearningRate=self.lr_name,
ParamOut=self.param_name,
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
MeanGradOut=self.mean_grad_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=True)
else:
rmsprop_op = Operator(
"rmsprop",
Param=self.param_name,
Grad=self.grad_name,
MeanSquare=self.mean_square_name,
Moment=self.moment_name,
LearningRate=self.lr_name,
ParamOut=self.param_name,
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=False)
rmsprop_op.run(scope, place)
atol = 1e-5
equal_nan = False
if self.centered:
atol = 1e-3
equal_nan = True
self.check(
np.array(mean_square), self.ms_out, place, self.mean_square_name)
self.check(
np.array(moment),
self.moment_out,
place,
self.moment_name,
atol=atol,
equal_nan=equal_nan)
self.check(
np.array(param),
self.param_out,
place,
self.param_name,
atol=atol,
equal_nan=equal_nan)
if self.centered:
self.check(
np.array(mean_grad), self.mg_out, place, self.mean_grad_name)
def test_rmsprop(self):
    """Run the rmsprop check on every available place.

    Each place is exercised with four (centered, epsilon) combinations:
    non-centered then centered, each with epsilon 1e-6 followed by 1e-10.
    """
    devices = [core.CPUPlace()]
    if core.is_compiled_with_cuda():
        devices.append(core.CUDAPlace(0))

    # Listed explicitly so the call order matches the hand-written sequence.
    cases = [(False, 1e-6), (False, 1e-10), (True, 1e-6), (True, 1e-10)]
    for device in devices:
        for centered, epsilon in cases:
            self.check_with_place(device, centered, epsilon)

if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest): ...@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest):
for i in range(self.rois_num): for i in range(self.rois_num):
roi = self.rois[i] roi = self.rois[i]
roi_batch_id = roi[0] roi_batch_id = int(roi[0])
roi_start_w = int(cpt.round(roi[1] * self.spatial_scale)) roi_start_w = int(cpt.round(roi[1] * self.spatial_scale))
roi_start_h = int(cpt.round(roi[2] * self.spatial_scale)) roi_start_h = int(cpt.round(roi[2] * self.spatial_scale))
roi_end_w = int(cpt.round(roi[3] * self.spatial_scale)) roi_end_w = int(cpt.round(roi[3] * self.spatial_scale))
...@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest): ...@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest):
roi = [bno, x1, y1, x2, y2] roi = [bno, x1, y1, x2, y2]
rois.append(roi) rois.append(roi)
self.rois_num = len(rois) self.rois_num = len(rois)
self.rois = np.array(rois).astype("int64") self.rois = np.array(rois).astype("float32")
def setUp(self): def setUp(self):
self.op_type = "roi_pool" self.op_type = "roi_pool"
......
...@@ -18,12 +18,17 @@ import unittest ...@@ -18,12 +18,17 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
from test_anchor_generator_op import anchor_generator_in_python
from test_generate_proposal_labels import _generate_groundtruth
from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta
def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap, def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
rpn_negative_overlap, fg_fraction): rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
iou = np.transpose(iou) iou = np.transpose(gt_anchor_iou)
anchor_to_gt_max = iou.max(axis=1) anchor_to_gt_max = iou.max(axis=1)
anchor_to_gt_argmax = iou.argmax(axis=1)
gt_to_anchor_argmax = iou.argmax(axis=0) gt_to_anchor_argmax = iou.argmax(axis=0)
gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])] gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0] anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
...@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap, ...@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1) num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
tgt_lbl[bg_inds] = 0
if len(bg_inds) > num_bg: if len(bg_inds) > num_bg:
enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
tgt_lbl[enable_inds] = 0 tgt_lbl[enable_inds] = 0
bg_inds = np.where(tgt_lbl == 0)[0] bg_inds = np.where(tgt_lbl == 0)[0]
tgt_lbl[bg_inds] = 0
loc_index = fg_inds loc_index = fg_inds
score_index = np.hstack((fg_inds, bg_inds)) score_index = np.hstack((fg_inds, bg_inds))
tgt_lbl = np.expand_dims(tgt_lbl, axis=1) tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
return loc_index, score_index, tgt_lbl
gt_inds = anchor_to_gt_argmax[fg_inds]
return loc_index, score_index, tgt_lbl, gt_inds
def get_anchor(n, c, h, w):
    """Generate anchors for a random float32 feature map of shape (n, c, h, w).

    The feature map is passed to ``anchor_generator_in_python`` with fixed
    settings (sizes 32/64, aspect ratios 0.5/1.0, unit variances, stride 16,
    offset 0.5). Only the first of the two returned values is handed back.
    """
    feature_map = np.random.random((n, c, h, w)).astype('float32')
    generator_args = dict(
        input_feat=feature_map,
        anchor_sizes=[32., 64.],
        aspect_ratios=[0.5, 1.0],
        variances=[1.0, 1.0, 1.0, 1.0],
        stride=[16.0, 16.0],
        offset=0.5)
    # The second returned value is not needed by the caller.
    anchors, _unused = anchor_generator_in_python(**generator_args)
    return anchors


def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
             rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
    """Build the expected RPN targets for a whole batch.

    For every image ``i`` the rows ``lod[i]:lod[i + 1]`` of ``gt_boxes`` and
    ``iou`` are handed to ``rpn_target_assign``; the per-image results are then
    merged into batch-level arrays.

    Args:
        anchor: array indexed by anchor along axis 0; ``anchor.shape[0]`` is
            used as the per-image anchor count.
        gt_boxes: ground-truth boxes of all images stacked along axis 0.
        iou: overlap matrix whose rows line up with ``gt_boxes``.
        lod: row offsets delimiting each image (``len(lod) - 1`` images).
        rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap,
        fg_fraction: forwarded unchanged to ``rpn_target_assign``.

    Returns:
        Tuple ``(loc_indexes, score_indexes, tgt_bboxes, tgt_labels)`` where
        the index arrays are shifted by ``i * anchor_num`` for image ``i``,
        ``tgt_bboxes`` stacks the box deltas of all images, and ``tgt_labels``
        is the concatenated label array gathered at ``score_indexes``.
    """
    anchor_num = anchor.shape[0]
    batch_size = len(lod) - 1

    # Collect per-image pieces and join them once after the loop instead of
    # re-concatenating the growing arrays on every iteration.
    loc_parts = []
    score_parts = []
    label_parts = []
    delta_parts = []

    for i in range(batch_size):
        b, e = lod[i], lod[i + 1]
        loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
            iou[b:e, :], rpn_batch_size_per_im, rpn_positive_overlap,
            rpn_negative_overlap, fg_fraction)

        # Pair each foreground anchor with its matched ground-truth box.
        fg_bboxes = gt_boxes[b:e, :][gt_inds]
        fg_anchors = anchor[loc_idx]
        box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])

        # Shift per-image anchor indices into the flattened batch index space.
        offset = i * anchor_num
        loc_parts.append(loc_idx + offset)
        score_parts.append(score_idx + offset)
        label_parts.append(tgt_lbl)
        delta_parts.append(box_deltas)

    loc_indexes = np.concatenate(loc_parts)
    score_indexes = np.concatenate(score_parts)
    all_labels = np.concatenate(label_parts)
    tgt_bboxes = np.vstack(delta_parts)

    tgt_labels = all_labels[score_indexes]
    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels


class TestRpnTargetAssignOp(OpTest): class TestRpnTargetAssignOp(OpTest):
def setUp(self): def setUp(self):
iou = np.random.random((10, 8)).astype("float32") n, c, h, w = 2, 4, 14, 14
self.op_type = "rpn_target_assign" anchor = get_anchor(n, c, h, w)
self.inputs = {'DistMat': iou} gt_num = 10
self.attrs = { anchor = anchor.reshape(-1, 4)
'rpn_batch_size_per_im': 256, anchor_num = anchor.shape[0]
'rpn_positive_overlap': 0.95,
'rpn_negative_overlap': 0.3,
'fg_fraction': 0.25,
'fix_seed': True
}
loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3,
0.25)
self.outputs = {
'LocationIndex': loc_index,
'ScoreIndex': score_index,
'TargetLabel': tgt_lbl,
}
def test_check_output(self): im_shapes = [[64, 64], [64, 64]]
self.check_output() gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
bbox = np.vstack([v['boxes'] for v in gt_box])
iou = _bbox_overlaps(bbox, anchor)
anchor = anchor.astype('float32')
bbox = bbox.astype('float32')
iou = iou.astype('float32')
loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)
class TestRpnTargetAssignOp2(OpTest):
def setUp(self):
iou = np.random.random((10, 20)).astype("float32")
self.op_type = "rpn_target_assign" self.op_type = "rpn_target_assign"
self.inputs = {'DistMat': iou} self.inputs = {
'Anchor': anchor,
'GtBox': (bbox, [[4, 4]]),
'DistMat': (iou, [[4, 4]]),
}
self.attrs = { self.attrs = {
'rpn_batch_size_per_im': 128, 'rpn_batch_size_per_im': 25600,
'rpn_positive_overlap': 0.5, 'rpn_positive_overlap': 0.95,
'rpn_negative_overlap': 0.5, 'rpn_negative_overlap': 0.03,
'fg_fraction': 0.5, 'fg_fraction': 0.25,
'fix_seed': True 'fix_seed': True
} }
loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5,
0.5)
self.outputs = { self.outputs = {
'LocationIndex': loc_index, 'LocationIndex': loc_index.astype('int32'),
'ScoreIndex': score_index, 'ScoreIndex': score_index.astype('int32'),
'TargetLabel': tgt_lbl, 'TargetBBox': tgt_bbox.astype('float32'),
'TargetLabel': tgt_lbl.astype('int64'),
} }
def test_check_output(self): def test_check_output(self):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册