diff --git a/Dockerfile b/Dockerfile index 402adee2ea2822250ebc8f6229fd6a44545d58e5..634be18a51bf61e96a8bf6f263b6674a7932d6e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh # and its size is only one-third of the official one. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ +RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ tar -xz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/lib /usr diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index bc36683a9facc253e7b9feb0c5a56e79491fb9b0..f61770514eb05a99c140cdb18575c89aa5235c14 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(module "framework") if (NOT WIN32) -copy(framework_lib DEPS framework_py_proto - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} -) -else() -copy(framework_lib +set(framework_lib_deps framework_py_proto) +endif(NOT WIN32) +copy(framework_lib DEPS ${framework_lib_deps} SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} + ${src_dir}/${module}/ir/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir ) -endif(NOT WIN32) set(module "memory") copy(memory_lib @@ -161,7 +158,8 @@ set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "platform") diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md index fa2b930be0d26d816566599cece8afbedc1157e0..6e5f77fec8a894c390ced8c93ee344fd8d27370e 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md @@ -60,6 +60,7 @@ 图3. 编码器-解码器框架 + #### 编码器 编码阶段分为三步: @@ -81,7 +82,7 @@ 机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: 1. 
每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下: $$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$ -其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 +其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下: $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ @@ -93,6 +94,7 @@ $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ 机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)。 + ### 柱搜索算法 柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`你好`”,就算目标语言字典中只有3个词(``, ``, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md index 9900dfb9a67dc6f8940bd7dd3abfa15ac8a3488f..8477cf32146c33947ced447c8bdd287a3e1e71f5 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md @@ -149,6 +149,8 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): 网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。 + + ### 栈式双向LSTM 栈式双向神经网络`stacked_lstm_net`的代码片段如下: diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md index 2c68cdac4f10319359b74bc92569dfd3f65380b5..904d99fe2ffc9ead69a86c9763568a5c098348d5 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md @@ -50,7 +50,7 @@ similarity: -0.0997506977351 ``` -以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[应用模型](#应用模型)中详细描述用法。 +以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[模型应用](#模型应用)中详细描述用法。 ## 模型概览 @@ -189,6 +189,7 @@ dream that one day 最后,每个输入会按其单词次在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。 + ## 编程实现 本配置的模型结构如下图所示: @@ -349,6 +350,7 @@ Step 20: Average Cost 5.766995 ... 
``` + ## 模型应用 在模型训练后,我们可以用它做一些预测。 diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md index e6f89b23a95d1a07565f3e0a285e9c3f921930df..ac36c4ecf6b9b716fe5f0dbe2346e64918c22242 100644 --- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md @@ -102,7 +102,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层 池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。 -更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。 +更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类]( https://github.com/PaddlePaddle/book/tree/develop/03.image_classification )教程。 ### 常见激活函数介绍 - sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $ diff --git a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md index a2f30823a6fcd379f94e6e98d043b0d00681827f..99f8bee5ca1519ccf5d7c35ad2a64da4a8841ada 100644 --- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md +++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md @@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080 # 访问 http://127.0.0.1:8080 ``` +如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上,运行`pip install --upgrade protobuf`就能解决。 如果在虚拟环境下仍然遇到安装问题,请尝试以下方法。 @@ -149,7 +150,7 @@ python setup.py bdist_wheel pip install --upgrade dist/visualdl-*.whl ``` -如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/how_to_dev_frontend_en.md) +如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md) ## SDK diff --git a/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst index 21a6fe5cf54d0c0c760ade4ba602024ffa29675f..6d6f3035c0b5c985cd39d45df9f1bcce50dcefa0 100644 --- a/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst +++ b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst @@ -4,13 +4,12 @@ Paddle 预测 API 为了更简单方便的预测部署,Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。 -`预测库相关代码 `__ +`预测库相关代码 `_ 包括 - 头文件 ``paddle_inference_api.h`` 定义了所有的接口 - 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a`` -- 库文件 ``libpaddle_inference_api.so`` 或 - ``libpaddle_inference_api.a`` + 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。 @@ -97,8 +96,7 @@ engine CHECK(predictor->Run(slots, &outputs)); // 获取 outputs ... 
-编译时,联编 ``libpaddle_fluid.a/.so`` 和 -``libpaddle_inference_api.a/.so`` 便可。 +编译时,联编 ``libpaddle_fluid.a/.so`` 便可。 详细代码参考 ------------ diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index bb5f2894c08b5d8941ad8914f6b83280aa053e37..c2694144d708161a3bed214ceca745505656456f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) +paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None) @@ -312,7 +313,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) -paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1)) +paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -376,7 +377,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5)) paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0)) +paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', 
defaults=(0.95, 1e-06, 0.0, False)) paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 0bfff745493d069e948e6d277ec2bbfb0673a70b..7a99169849debcbc57d6f197b36c5045b211f3ef 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -326,7 +326,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( ir::Graph &result = *graph; for (auto &node : nodes) { - if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) { + if (node->IsVar() && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); } } @@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( } } -bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( - const std::string &og, - std::unordered_set *og_has_been_broadcast) const { - bool is_pg_once = - grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0; - if (is_pg_once) { - // Insert NCCL AllReduce Op - og_has_been_broadcast->insert(og); - } - return is_pg_once; -} - int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { @@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -// Find the first occurence of `prev_op_name` and make current `op` depend -// on it. 
-void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op, - const std::string &prev_op_name) const { - for (auto &prev_op : result->Get(kGraphOps)) { - if (prev_op->Name() == prev_op_name) { - auto *dep_var = new DummyVarHandle(result->CreateControlDepVar()); - prev_op->AddOutput(dep_var); - result->Get(kGraphDepVars).emplace(dep_var); - op->AddInput(dep_var); - } - } -} - void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 7a6f238f9cf7af18cb10ea271e453fec1902c833..ac6d9c5a64cfde60f75c76dae0a30cc7d735e996 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { std::vector FindDistTrainRecvVars( const std::vector &nodes) const; - void ConnectOp(ir::Graph *result, OpHandleBase *op, - const std::string &prev_op_name) const; - void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; @@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const; - bool IsParameterGradientOnce( - const std::string &og, - std::unordered_set *og_has_been_broadcast) const; - int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index bfc649017f19d67660bd11d590134cf56772bb27..f5235f70ad79616801110644999d511eeda33a32 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,20 +1,35 @@ +set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) +file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. 
DO NOT EDIT!\n\n") +file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") +function(pass_library TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass) + file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") + set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE) +endfunction() + cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node graph_helper) -cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) -cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) -cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector) -cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) -cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass) -cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) -cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector) + +pass_library(graph_to_program_pass) +pass_library(graph_viz_pass) +pass_library(fc_fuse_pass) +pass_library(attention_lstm_fuse_pass) +pass_library(infer_clean_graph_pass) +pass_library(fc_lstm_fuse_pass) +pass_library(seq_concat_fc_fuse_pass) +set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) -cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto) +cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 0278ade6763ec614701674691797d766878a378e..bb52d7e498e55c02ddc2cd6d07ccccd51ce4edc5 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -13,13 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" - #include - #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/inference/api/helper.h" namespace paddle { namespace framework { @@ -99,17 +96,13 @@ void FindWhileOp(Graph* graph) { auto* cell_init = graph->RetriveNode(6); auto* hidden_init = graph->RetriveNode(8); -#define LINK_TO(node0, node1) \ - node0->outputs.push_back(node1); \ - node1->inputs.push_back(node0); - auto* lstm_op = graph->CreateOpNode(&op_desc); PrepareParameters(graph, param); - LINK_TO(X, lstm_op); - LINK_TO(cell_init, lstm_op); - LINK_TO(hidden_init, lstm_op); - LINK_TO(lstm_op, LSTMOUT); + IR_NODE_LINK_TO(X, lstm_op); + IR_NODE_LINK_TO(cell_init, lstm_op); + IR_NODE_LINK_TO(hidden_init, lstm_op); + IR_NODE_LINK_TO(lstm_op, LSTMOUT); GraphSafeRemoveNodes(graph, marked_nodes); } diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 513742bab69d465aac1bfb7bcef2fe89108c14a0..5a4ebd6f3de555acccd72c61bd377ffd8ce69780 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -21,74 +21,26 @@ namespace paddle { namespace framework { namespace ir { -bool VarOutLinksToOp(Node* node, const std::string& op_type) { - for (auto* out : node->outputs) { - if (out->IsOp() && out->Op()->Type() == op_type) { - return true; - } - } - return false; -} - -void BuildFCPattern(PDPattern* pattern) { - // Create Operators - auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul"); - auto* elementwise_add_op = - pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add"); - // Create variables - // w - auto* mul_weight_var = pattern->NewNode("mul_weight") - ->AsInput() - ->assert_is_op_nth_input("mul", "Y", 0); - // x - auto* mul_tmp_var = pattern->NewNode("mul_tmp_var") - ->AsInput() - ->assert_is_op_nth_input("mul", "X", 0); - // intermediate variable, will be removed in the IR after fuse. 
- auto* mul_out_var = pattern->NewNode("mul_out") - ->AsIntermediate() - ->assert_is_only_output_of_op("mul") - ->assert_is_op_input("elementwise_add"); - // bias - auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar") - ->assert_is_op_input("elementwise_add") - ->AsInput(); - // output - auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out") - ->AsOutput() - ->assert_is_op_output("elementwise_add"); - - mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var}); - elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var}) - .LinksTo({elementwise_add_out_var}); -} - -// Replace the node `from` in the links to `to` -bool LinksReplace(std::vector* links, Node* from, Node* to) { - for (auto*& n : *links) { - if (n == from) { - n = to; - return true; - } - } - return false; -} - std::unique_ptr FCFusePass::ApplyImpl( std::unique_ptr graph) const { PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("fc", graph.get()); + FusePassBase::Init("fc_fuse", graph.get()); std::unordered_set nodes2delete; GraphPatternDetector gpd; - BuildFCPattern(gpd.mutable_pattern()); - -#define GET_NODE(id) \ - PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \ - "pattern has no Node called %s", #id); \ - auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id)); \ - PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + // BuildFCPattern(gpd.mutable_pattern()); + auto* x = gpd.mutable_pattern() + ->NewNode("fc_fuse/x") + ->AsInput() + ->assert_is_op_input("mul", "X"); + patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/); + +#define GET_NODE(id) \ + PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id); int found_fc_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -98,43 +50,33 @@ std::unique_ptr FCFusePass::ApplyImpl( // scenerio. // FC's fusion is simple, just op fuse, no need to process the // parameters. - GET_NODE(mul_tmp_var); // x - GET_NODE(mul_weight); // Y - GET_NODE(elementwise_add_tmpvar); // bias - GET_NODE(elementwise_add_out); // Out - GET_NODE(mul); // MUL op - GET_NODE(elementwise_add); // ELEMENT_ADD op - GET_NODE(mul_out); // tmp + GET_NODE(x); // x + GET_NODE(w); // Y + GET_NODE(fc_bias); // bias + GET_NODE(fc_out); // Out + GET_NODE(mul); // MUL op + GET_NODE(elementwise_add); // ELEMENT_ADD op + GET_NODE(mul_out); // tmp #undef GET_NODE // Create an FC Node. OpDesc desc; - std::string fc_x_in = mul_tmp_var->Name(); - std::string fc_Y_in = mul_weight->Name(); - std::string fc_bias_in = elementwise_add_tmpvar->Name(); - std::string fc_out = elementwise_add_out->Name(); + std::string fc_x_in = x->Name(); + std::string fc_Y_in = w->Name(); + std::string fc_bias_in = fc_bias->Name(); + std::string fc_out_out = fc_out->Name(); desc.SetInput("Input", std::vector({fc_x_in})); desc.SetInput("W", std::vector({fc_Y_in})); desc.SetInput("Bias", std::vector({fc_bias_in})); - desc.SetOutput("Out", std::vector({fc_out})); + desc.SetOutput("Out", std::vector({fc_out_out})); desc.SetType("fc"); auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. 
- fc_node->inputs = - std::vector({mul_tmp_var, mul_weight, elementwise_add_tmpvar}); - fc_node->outputs.push_back(elementwise_add_out); - - // Update link relatons - PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node)); - PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node)); - PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs, - elementwise_add, fc_node)); - PADDLE_ENFORCE( - LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node)); + GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); - // Drop old nodes - graph->RemoveNode(mul); - graph->RemoveNode(elementwise_add); - graph->RemoveNode(mul_out); // tmp variable + IR_NODE_LINK_TO(x, fc_node); + IR_NODE_LINK_TO(w, fc_node); + IR_NODE_LINK_TO(fc_bias, fc_node); + IR_NODE_LINK_TO(fc_node, fc_out); found_fc_count++; }; diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index c404a6c44ccea8287ddfad976889a9f80cf6bad9..0d69dfa79aa26940f8f56f84b35ffed34f29f703 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -11,7 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" #include #include "paddle/fluid/framework/lod_tensor.h" @@ -87,15 +86,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, } op_desc.SetInput("Bias", {new_bias_var}); } - #undef GET_NODE + // Create temp variables. + scope->Var(name_scope + "/BatchedInput.new") + ->GetMutable(); + scope->Var(name_scope + "/BatchCellPreAct.new") + ->GetMutable(); + scope->Var(name_scope + "/BatchedGate.new") + ->GetMutable(); + op_desc.SetInput("H0", {}); op_desc.SetInput("C0", {}); op_desc.SetOutput("Hidden", {hidden_n->Name()}); op_desc.SetOutput("Cell", {cell_n->Name()}); op_desc.SetOutput("XX", {xx_n->Name()}); - op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"}); + op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"}); + op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"}); + op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"}); op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse")); op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes")); // TODO(TJ): get from attr @@ -121,22 +129,18 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, #undef TMP_NEW #undef TMP_NAME -#define LINK_TO(a, b) \ - a->outputs.push_back(b); \ - b->inputs.push_back(a); - LINK_TO(input_n, op); - LINK_TO(weight_x_n, op); - LINK_TO(weight_h_n, op); - LINK_TO(bias_n, op); - LINK_TO(op, hidden_n); -#undef LINK_TO + IR_NODE_LINK_TO(input_n, op); + IR_NODE_LINK_TO(weight_x_n, op); + IR_NODE_LINK_TO(weight_h_n, op); + IR_NODE_LINK_TO(bias_n, op); + IR_NODE_LINK_TO(op, hidden_n); return op; }; int fusion_count{0}; - auto fc_no_bias_handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { #define GET_NODE(name__) \ std::string name__##key = name_scope + "/" + #name__; \ auto* name__##n = pattern->RetrieveNode(name__##key); \ @@ -157,21 +161,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, if (with_fc_bias) { GET_NODE(fc_bias); + GET_NODE(elementwise_add); lstm_creator(lstm, x, w, 
Weight, Bias, Hidden, Cell, fc_out, fc_bias); + // Remove unneeded nodes. + std::unordered_set marked_nodes( + {mul_n, lstm_n, elementwise_add_n}); + GraphSafeRemoveNodes(graph, marked_nodes); } else { lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1); + // Remove unneeded nodes. + std::unordered_set marked_nodes({mul_n, lstm_n}); + GraphSafeRemoveNodes(graph, marked_nodes); } #undef GET_NODE - // Remove unneeded nodes. - std::unordered_set marked_nodes({mul_n, lstm_n}); - - GraphSafeRemoveNodes(graph, marked_nodes); - ++fusion_count; }; - gpd(graph, fc_no_bias_handler); + gpd(graph, handler); return fusion_count; } diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h index 5a6687872eb3ab4a032227fda9ff0e7f5254670b..3ee32c63a46fcc34bdccd1e14d4bbaf9668c49e9 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index f651ab635eadc9f248964e91dceebf3aa9c42926..731b89423354532f684e19305dfa87e8eb75d4b1 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -73,7 +73,6 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) { void GraphPatternDetector::operator()(Graph* graph, GraphPatternDetector::handle_t handler) { if (!MarkPDNodesInGraph(*graph)) { - LOG(INFO) << "Mark failed"; return; } @@ -86,7 +85,7 @@ void GraphPatternDetector::operator()(Graph* graph, LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; int id = 0; for (auto& g : subgraphs) { - LOG(INFO) << "optimizing #" << id++ << " subgraph"; + VLOG(3) << "optimizing #" << id++ << " subgraph"; handler(g, graph); } } @@ -111,6 +110,11 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { return false; } } + for (auto& item : pdnodes2nodes_) { + for (auto& n : item.second) { + GetMarkedNodes(const_cast(&graph)).insert(n); + } + } VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); @@ -278,7 +282,7 @@ void GraphPatternDetector::RemoveOverlappedMatch( for (const auto& subgraph : *subgraphs) { bool valid = true; for (auto& item : subgraph) { - if (node_set.count(item.second)) { + if (item.first->IsIntermediate() && node_set.count(item.second)) { valid = false; break; } @@ -334,22 +338,22 @@ PDNode& PDNode::LinksFrom(const std::vector& others) { } PDNode* PDNode::assert_is_op() { - asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); }); + asserts_.emplace_back([](Node* x) { return x && x->IsOp(); }); return this; } PDNode* PDNode::assert_is_op(const std::string& op_type) { - asserts_.emplace_back([this, op_type](Node* x) { + asserts_.emplace_back([op_type](Node* x) { return x && x->IsOp() && x->Op()->Type() == op_type; }); return this; } PDNode* PDNode::assert_is_var() { - asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); }); + asserts_.emplace_back([](Node* x) { return x && x->IsVar(); }); return this; } PDNode* PDNode::assert_var_not_persistable() { assert_is_var(); - asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); 
}); + asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); }); return this; } PDNode* PDNode::assert_is_persistable_var() { @@ -491,14 +495,16 @@ void GraphSafeRemoveNodes(Graph* graph, for (auto it = node->inputs.begin(); it != node->inputs.end();) { if (nodes.count(*it)) { it = const_cast(node)->inputs.erase(it); - } else + } else { it++; + } } for (auto it = node->outputs.begin(); it != node->outputs.end();) { if (nodes.count(*it)) { it = const_cast(node)->outputs.erase(it); - } else + } else { it++; + } } } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 024ce8ce55616cc5e0eaced4a27a6e1fb004af2c..eacea1750f6f1e86a8fe79637c3bd757a7275398 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -19,6 +19,9 @@ #endif #include +#include +#include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/inference/analysis/dot.h" @@ -245,6 +248,8 @@ class GraphPatternDetector { void UniquePatterns(std::vector* subgraphs); // Remove overlapped match subgraphs, when overlapped, keep the previous one. + // The intermediate PDNodes will be removed, so can't shared by multiple + // patterns. void RemoveOverlappedMatch(std::vector* subgraphs); // Validate whether the intermediate nodes are linked by external nodes. @@ -295,6 +300,10 @@ PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x); } // namespace patterns +#define IR_NODE_LINK_TO(a, b) \ + a->outputs.push_back(b); \ + b->inputs.push_back(a); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc index 7e5c86b033a7c69a306491cf4bf8d099018c5f19..6c466fb21fb46e09961dc874e9e39655f83d17c6 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) { return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3"); }, "OP0"); - auto* any_var = x.mutable_pattern()->NewNode( - [](Node* node) { return node->IsVar(); }, "VAR"); + auto* any_var = x.mutable_pattern() + ->NewNode([](Node* node) { return node->IsVar(); }, "VAR") + ->AsIntermediate(); auto* any_op1 = x.mutable_pattern()->NewNode( [](Node* node) { return node->IsOp(); }, "OP1"); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 4c7ffe69e933de3d52c8f762a1eeb73de17e0561..31ed98db72c8fd4af8c970861d386687962001ce 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -50,20 +50,37 @@ std::unique_ptr GraphVizPass::ApplyImpl( Dot dot; - std::vector op_attrs({Dot::Attr("style", "filled"), - Dot::Attr("shape", "box"), - Dot::Attr("fillcolor", "red")}); - std::vector var_attrs({Dot::Attr("style", "filled,rounded"), - // Dot::Attr("shape", "diamond"), - Dot::Attr("fillcolor", "yellow")}); - - std::vector marked_op_attrs({Dot::Attr("style", "filled"), - Dot::Attr("shape", "box"), - Dot::Attr("fillcolor", "lightgray")}); - std::vector marked_var_attrs( - {Dot::Attr("style", "filled,rounded"), - // Dot::Attr("shape", "diamond"), - Dot::Attr("fillcolor", "lightgray")}); + const std::vector op_attrs({ + Dot::Attr("style", "rounded,filled,bold"), // + 
Dot::Attr("shape", "box"), // + Dot::Attr("color", "#303A3A"), // + Dot::Attr("fontcolor", "#ffffff"), // + Dot::Attr("width", "1.3"), // + Dot::Attr("height", "0.84"), // + Dot::Attr("fontname", "Arial"), // + }); + const std::vector arg_attrs({ + Dot::Attr("shape", "box"), // + Dot::Attr("style", "rounded,filled,bold"), // + Dot::Attr("fontname", "Arial"), // + Dot::Attr("fillcolor", "#999999"), // + Dot::Attr("color", "#dddddd"), // + }); + + const std::vector param_attrs({ + Dot::Attr("shape", "box"), // + Dot::Attr("style", "rounded,filled,bold"), // + Dot::Attr("fontname", "Arial"), // + Dot::Attr("color", "#148b97"), // + Dot::Attr("fontcolor", "#ffffff"), // + }); + + const std::vector marked_op_attrs( + {Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "yellow")}); + const std::vector marked_var_attrs( + {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "yellow")}); auto marked_nodes = ConsumeMarkedNodes(graph.get()); // Create nodes @@ -74,9 +91,17 @@ std::unique_ptr GraphVizPass::ApplyImpl( marked_nodes.count(n) ? marked_op_attrs : op_attrs; dot.AddNode(node_id, attr, node_id); } else if (n->IsVar()) { - decltype(op_attrs) attr = - marked_nodes.count(n) ? marked_var_attrs : var_attrs; - dot.AddNode(node_id, attr, node_id); + decltype(op_attrs)* attr; + if (marked_nodes.count(n)) { + attr = &marked_var_attrs; + } else if (const_cast(n)->Var() && + const_cast(n)->Var()->Persistable()) { + attr = ¶m_attrs; + } else { + attr = &arg_attrs; + } + + dot.AddNode(node_id, *attr, node_id); } node2dot[n] = node_id; } diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc index f885567da1965b997b2063e06c839af95b43e1e1..7713ed1eab88ee4fa16d52e7425075ae66f721a3 100644 --- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -13,42 +13,41 @@ // limitations under the License. #include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { namespace ir { -class InferCleanGraphPass : public Pass { +class InferCleanGraphPass : public FusePassBase { public: virtual ~InferCleanGraphPass() {} protected: std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + FusePassBase::Init("original_graph", graph.get()); PADDLE_ENFORCE(graph.get()); auto is_valid_node = [](Node* x) { return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); }; - std::unordered_set invalid_nodes; + std::unordered_set invalid_nodes; + int valid_op = 0; for (auto* node : graph->Nodes()) { if (is_valid_node(node)) { invalid_nodes.insert(node); + } else if (node->IsOp()) { + // Collect all the operators to help tracking number of operators. + ++valid_op; } } - // remove nodes from the graph. - for (auto* node : invalid_nodes) { - graph->RemoveNode(node); - } + GraphSafeRemoveNodes(graph.get(), invalid_nodes); - // clean edges. 
- for (auto* node : graph->Nodes()) { - CleanEdges(&node->inputs, invalid_nodes); - CleanEdges(&node->outputs, invalid_nodes); - } + AddStatis(valid_op); return graph; } diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index a776a898a5ee13b4dde12460dce71433268fb9d4..e1a441d09aaa3647c4b2a582210a2c7e2b64e0da 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -219,16 +219,13 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( op_desc.SetAttr("fc_activation", act->Op()->Type()); auto* op_node = graph->CreateOpNode(&op_desc); -// Add links -#define NODE_LINKS(a, b) \ - a->outputs.push_back(b); \ - b->inputs.push_back(a); - NODE_LINKS(fc_w, op_node); - NODE_LINKS(fc_bias, op_node); - NODE_LINKS(concat_in0, op_node); - NODE_LINKS(sequence_expand0_in, op_node); - NODE_LINKS(sequence_expand1_in, op_node); - NODE_LINKS(op_node, fc_out); + // Add links + IR_NODE_LINK_TO(fc_w, op_node); + IR_NODE_LINK_TO(fc_bias, op_node); + IR_NODE_LINK_TO(concat_in0, op_node); + IR_NODE_LINK_TO(sequence_expand0_in, op_node); + IR_NODE_LINK_TO(sequence_expand1_in, op_node); + IR_NODE_LINK_TO(op_node, fc_out); // Clean nodes. std::unordered_set marked_nodes; @@ -241,7 +238,6 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( marked_nodes.erase(sequence_expand0_in); marked_nodes.erase(sequence_expand1_in); marked_nodes.erase(fc_out); - GraphSafeRemoveNodes(graph, marked_nodes); }); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 86392078b356df774fbc47aed9214e9f10fe33be..2006e3b24f71d0ae32b4e2ae34f1a1e4d3a82f91 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? cc_library(paddle_fluid_api SRCS io.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) @@ -22,7 +22,7 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) #endif() # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api) +cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") @@ -32,6 +32,7 @@ endif() # Create shared library cc_library(paddle_fluid_shared SHARED SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc + ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc DEPS ${fluid_modules} paddle_fluid_api) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index cc0dd0d492d42e9552c9ce081e268330599104f0..226645058e85da55b47e26efe5a199f50aef3847 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -33,7 +33,7 @@ function (inference_analysis_test TARGET) endif() cc_test(${TARGET} SRCS "${analysis_test_SRCS}" - DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS} + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) @@ -56,25 +56,13 @@ if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING) endif() inference_analysis_test(test_analyzer SRCS analyzer_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis - analysis_predictor - # ir - fc_fuse_pass - fc_lstm_fuse_pass - seq_concat_fc_fuse_pass - graph_viz_pass - infer_clean_graph_pass - graph_pattern_detector - infer_clean_graph_pass - attention_lstm_fuse_pass - paddle_inference_api - pass + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model - --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) + --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) -inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api) -inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid) +inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) +inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) @@ -86,7 +74,7 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." 
FORCE) -if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING) +if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz") inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") endif() @@ -99,7 +87,7 @@ inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE) -if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING) +if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz") inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz") endif() @@ -108,3 +96,15 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api ARGS --infer_model=${LAC_INSTALL_DIR}/model --infer_data=${LAC_INSTALL_DIR}/data.txt) + + +set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") +set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE) + +if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) + inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz") +endif() + +inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor + ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 192ac2daa6a78efec6db19870f71e80593c62da9..1fd884435d173800563ea37809003ed3aee16c7c 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include +#include #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" @@ -41,20 +42,16 @@ class DfgPassManagerImpl final : public DfgPassManager { public: DfgPassManagerImpl() { // TODO(Superjomn) set the key with pass reprs. 
- LOG(INFO) - << "-----------------------------------------------------------------"; - if (FLAGS_IA_enable_ir) { - AddPass("fluid-to-ir-pass", new FluidToIrPass); - } else { + if (!FLAGS_IA_enable_ir) { AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); + } else { + AddPass("fluid-to-ir-pass", new FluidToIrPass); } TryAddTensorRtPass(); AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); if (!FLAGS_IA_output_storage_path.empty()) { AddPass("model-store-pass", new ModelStorePass); } - LOG(INFO) - << "-----------------------------------------------------------------"; } std::string repr() const override { return "dfg-pass-manager"; } @@ -101,19 +98,15 @@ class DfgPassManagerImpl final : public DfgPassManager { Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } void Analyzer::Run(Argument* argument) { - // Ugly support fluid-to-ir-pass - argument->Set(kFluidToIrPassesAttr, - new std::vector({ - // Manual update the passes here. - "graph_viz_pass", // - "infer_clean_graph_pass", "graph_viz_pass", // - "attention_lstm_fuse_pass", "graph_viz_pass", // - "fc_lstm_fuse_pass", "graph_viz_pass", // - "mul_lstm_fuse_pass", "graph_viz_pass", // - "seq_concat_fc_fuse_pass", "graph_viz_pass", // - "fc_fuse_pass", "graph_viz_pass" // - - })); + std::vector passes; + for (auto& pass : all_ir_passes_) { + if (!disabled_ir_passes_.count(pass)) { + passes.push_back(pass); + passes.push_back("graph_viz_pass"); // add graphviz for debug. + } + } + passes.push_back("graph_viz_pass"); + argument->Set(kFluidToIrPassesAttr, new std::vector(passes)); for (auto& x : data_) { PADDLE_ENFORCE(x->Initialize(argument)); @@ -122,6 +115,11 @@ void Analyzer::Run(Argument* argument) { } } +Analyzer& Analyzer::DisableIrPasses(const std::vector& passes) { + disabled_ir_passes_.insert(passes.begin(), passes.end()); + return *this; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 2e107c82dd50d5cf22797f4c82e69d302514f955..3fdd2b9ec7537c891a04efb3ca9a1d45075ffa5e 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -36,16 +36,10 @@ limitations under the License. */ */ #include +#include "paddle/fluid/inference/analysis/flags.h" #include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" -// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this -// flag if not available. -DECLARE_bool(IA_enable_tensorrt_subgraph_engine); -DECLARE_string(IA_graphviz_log_root); -DECLARE_string(IA_output_storage_path); -DECLARE_bool(IA_enable_ir); - namespace paddle { namespace inference { namespace analysis { @@ -57,7 +51,26 @@ class Analyzer : public OrderedRegistry { void Run(Argument* argument); + Analyzer& DisableIrPasses(const std::vector& passes); + DISABLE_COPY_AND_ASSIGN(Analyzer); + + private: + // All avaiable IR passes. + // The bigger fuse comes first, so that the small operators prefer to be + // merged in a larger fuse op. The small fusion will not break the pattern of + // larger fusion. + const std::vector all_ir_passes_{{ + // Manual update the passes here. 
+ "infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // + }}; + + std::unordered_set disabled_ir_passes_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index ec1f3979a74bd86ee7402bca441e95d3d177d113..4cf26d3c70eafd951d14c26335416ec2c71c001d 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -16,19 +16,21 @@ #include #include +#include // NOLINT #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/platform/profiler.h" DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); DEFINE_int32(batch_size, 10, "batch size."); DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); namespace paddle { namespace inference { @@ -219,39 +221,6 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -std::string DescribeTensor(const PaddleTensor &tensor) { - std::stringstream os; - os << "Tensor [" << tensor.name << "]\n"; - os << " - type: "; - switch (tensor.dtype) { - case PaddleDType::FLOAT32: - os << "float32"; - break; - case PaddleDType::INT64: - os << "int64"; - break; - default: - os << "unset"; - } - os << '\n'; - - os << " - shape: " << to_string(tensor.shape) << '\n'; - os << " - lod: "; - for (auto &l : tensor.lod) { - os << to_string(l) << "; "; - } - os << "\n"; - os << " - data: "; - - int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, - [](int a, int b) { return a * b; }); - for (int i = 0; i < dim; i++) { - os << static_cast(tensor.data.data())[i] << " "; - } - os << '\n'; - return os.str(); -} - } // namespace const float ditu_rnn_target_data[] = { @@ -265,57 +234,97 @@ const float ditu_rnn_target_data[] = { 10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0, 93.5771, 3.84641, 0, 0, 0, 0, 0, 0, 169.426, 0, 0, 0, 0, 0, 0, 0}; +void CompareResult(const std::vector &outputs, + const std::vector &base_outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &base_out = base_outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), + 1, [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_EQ(size, size1); + PADDLE_ENFORCE_GT(size, 0); + float *data = static_cast(out.data.data()); + float *base_data = static_cast(base_out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_data[i], 1e-3); + } + } +} // Test with a really complicate model. 
-void TestDituRNNPrediction(const std::string &model_path, - const std::string &data_path, int batch_size, - bool use_analysis, bool activate_ir, - int num_times = 1) { - NativeConfig config; +void TestDituRNNPrediction(bool use_analysis, bool activate_ir, + int num_threads) { + AnalysisConfig config; config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__"; config.param_file = FLAGS_infer_ditu_rnn_model + "/param"; config.use_gpu = false; config.device = 0; config.specify_input_name = true; + config.enable_ir_optim = activate_ir; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + config.ir_passes.clear(); // Do not exclude any pass. + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; auto base_predictor = CreatePaddlePredictor(config); auto predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor( + config); std::vector input_slots; - DataRecord data(data_path, batch_size); + DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size); // Prepare inputs. PrepareInputs(&input_slots, &data, batch_size); std::vector outputs, base_outputs; base_predictor->Run(input_slots, &base_outputs); - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictor->Run(input_slots, &outputs); - } LOG(INFO) << "===========profile result==========="; - LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times - << ", latency: " << timer.toc() / num_times << "ms"; - LOG(INFO) << "====================================="; - - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); - for (size_t i = 0; i < outputs.size(); i++) { - auto &out = outputs[i]; - auto &base_out = base_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), - 1, [](int a, int b) { return a * b; }); - PADDLE_ENFORCE_EQ(size, size1); - PADDLE_ENFORCE_GT(size, 0); - float *data = static_cast(out.data.data()); - float *base_data = static_cast(base_out.data.data()); - for (size_t j = 0; j < size; j++) { - EXPECT_NEAR(data[j], base_data[j], 1e-3); + if (num_threads == 1) { + // Prepare inputs. + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); + CompareResult(outputs, base_outputs); + } else { + std::vector threads; + std::vector> predictors; + // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled + // because AttentionLSTM's hard code nodeid will be damanged. + for (int tid = 0; tid < num_threads; ++tid) { + predictors.emplace_back( + CreatePaddlePredictor( + config)); + } + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // Each thread should have local input_slots and outputs. 
+ std::vector input_slots; + DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size); + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs; + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + predictors[tid]->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, num_threads, tid, + timer.toc() / num_times); + CompareResult(outputs, base_outputs); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); } } + LOG(INFO) << "====================================="; if (use_analysis && activate_ir) { AnalysisPredictor *analysis_predictor = @@ -327,40 +336,45 @@ void TestDituRNNPrediction(const std::string &model_path, LOG(INFO) << "fused " << item.first << " " << item.second; } - ASSERT_TRUE(fuse_statis.count("fc")); - EXPECT_EQ(fuse_statis.at("fc"), 1); - EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 1); - } -} + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; -// Directly infer with the original model. -TEST(Analyzer, DituRNN_without_analysis) { - TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - FLAGS_batch_size, false, false, FLAGS_repeat); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + EXPECT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. + } } -// Inference with the original model with the analysis turned on, the analysis -// module will transform the program to a data flow graph. -TEST(Analyzer, DituRNN_with_analysis) { - LOG(INFO) << "ditu rnn with analysis"; - TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - FLAGS_batch_size, true, false, FLAGS_repeat); +// Inference with analysis and IR, easy for profiling independently. +TEST(Analyzer, DituRNN) { + TestDituRNNPrediction(true, true, FLAGS_num_threads); } -// Inference with analysis and IR. The IR module will fuse some large kernels. -TEST(Analyzer, DituRNN_with_analysis_with_IR) { - LOG(INFO) << "ditu rnn with analysis and IR fuse"; - TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - FLAGS_batch_size, true, true, FLAGS_repeat); +// Other unit-tests of DituRNN, test different options of use_analysis, +// activate_ir and multi-threads. +TEST(Analyzer, DituRNN_tests) { + int num_threads[2] = {1, 4}; + for (auto i : num_threads) { + // Directly infer with the original model. + TestDituRNNPrediction(false, false, i); + // Inference with the original model with the analysis turned on, the + // analysis + // module will transform the program to a data flow graph. + TestDituRNNPrediction(true, false, i); + // Inference with analysis and IR. The IR module will fuse some large + // kernels. 
+ TestDituRNNPrediction(true, true, i); + } } } // namespace analysis } // namespace inference } // namespace paddle - -USE_PASS(fc_fuse_pass); -USE_PASS(seq_concat_fc_fuse_pass); -USE_PASS(fc_lstm_fuse_pass); -USE_PASS(graph_viz_pass); -USE_PASS(infer_clean_graph_pass); -USE_PASS(attention_lstm_fuse_pass); diff --git a/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc b/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..265e814acd594d6185251cbaa4d6880bb9ee7405 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. +#include +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/api/timer.h" + +DEFINE_string(infer_model, "", "Directory of the inference model."); +DEFINE_string(infer_data, "", "Path of the dataset."); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(repeat, 1, "How many times to repeat run."); + +namespace paddle { + +template +std::string to_string(const std::vector &vec) { + std::stringstream ss; + for (const auto &c : vec) { + ss << c << " "; + } + return ss.str(); +} + +void PrintTime(const double latency, const int bs, const int repeat) { + LOG(INFO) << "===========profile result==========="; + LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat + << ", avg latency: " << latency / repeat << "ms"; + LOG(INFO) << "====================================="; +} + +void Main(int batch_size) { + // Three sequence inputs. 
+ std::vector input_slots(1); + // one batch starts + // data -- + int64_t data0[] = {0, 1, 2}; + for (auto &input : input_slots) { + input.data.Reset(data0, sizeof(data0)); + input.shape = std::vector({3, 1}); + // dtype -- + input.dtype = PaddleDType::INT64; + // LoD -- + input.lod = std::vector>({{0, 3}}); + } + + // shape -- + // Create Predictor -- + AnalysisConfig config; + config.model_dir = FLAGS_infer_model; + config.use_gpu = false; + config.enable_ir_optim = true; + config.ir_passes.push_back("fc_lstm_fuse_pass"); + auto predictor = + CreatePaddlePredictor( + config); + + inference::Timer timer; + double sum = 0; + std::vector output_slots; + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + CHECK(predictor->Run(input_slots, &output_slots)); + sum += timer.toc(); + } + PrintTime(sum, batch_size, FLAGS_repeat); + + // Get output + LOG(INFO) << "get outputs " << output_slots.size(); + + for (auto &output : output_slots) { + LOG(INFO) << "output.shape: " << to_string(output.shape); + // no lod ? + CHECK_EQ(output.lod.size(), 0UL); + LOG(INFO) << "output.dtype: " << output.dtype; + std::stringstream ss; + for (int i = 0; i < 5; i++) { + ss << static_cast(output.data.data())[i] << " "; + } + LOG(INFO) << "output.data summary: " << ss.str(); + // one batch ends + } +} + +TEST(text_classification, basic) { Main(FLAGS_batch_size); } + +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/flags.h b/paddle/fluid/inference/analysis/flags.h new file mode 100644 index 0000000000000000000000000000000000000000..717e543f01dfa071865a5c14c0b7679e65239daf --- /dev/null +++ b/paddle/fluid/inference/analysis/flags.h @@ -0,0 +1,22 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this +// flag if not available. 
+DECLARE_bool(IA_enable_tensorrt_subgraph_engine); +DECLARE_string(IA_graphviz_log_root); +DECLARE_string(IA_output_storage_path); +DECLARE_bool(IA_enable_ir); diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h index 6731b1f759363eec5dd8645783212a72ace67b2f..3086085710d6e850ed27e82d2323690dfdd3ef19 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/flags.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/pass.h" @@ -85,9 +86,11 @@ class FluidToIrPass final : public DataFlowGraphPass { new Scope *(&argument_->Get(ir::kParamScopeAttr))); } - const auto &ir_passes_to_apply = - argument_->Get>(kFluidToIrPassesAttr); - ir_passes.Apply(ir_passes_to_apply); + if (FLAGS_IA_enable_ir) { + const auto &ir_passes_to_apply = + argument_->Get>(kFluidToIrPassesAttr); + ir_passes.Apply(ir_passes_to_apply); + } PADDLE_ENFORCE(argument_->main_dfg.get()); argument_->main_dfg->Build(ir_passes.graph()); diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc index 6a13c60e7b2ebf645b12d5ddf83ef6ab3a2e83bd..367c25805d05f8d10fb8341158760ac6356a5c48 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" namespace paddle { namespace inference { @@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) { } // namespace analysis } // namespace inference } // namespace paddle - -USE_PASS(graph_viz_pass); -USE_PASS(infer_clean_graph_pass); -USE_PASS(attention_lstm_fuse_pass); -USE_PASS(fc_lstm_fuse_pass); -USE_PASS(seq_concat_fc_fuse_pass); -USE_PASS(fc_fuse_pass); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index adfe4392448557a30cd834022b9a5d21d9086b95..6b8278a0395c9ae71e32337d9735409de7ba0c96 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,10 +18,7 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager - graph_viz_pass fc_fuse_pass - infer_clean_graph_pass - ) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) @@ -47,7 +44,7 @@ function(inference_api_test TARGET_NAME) endfunction(inference_api_test) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) cc_test(test_paddle_inference_api SRCS api_tester.cc diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 33862232bdaae817b9ca72879605386c32ed3e8b..79eeea88ea83ad862b5e2ac1390dae377b676685 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -14,10 +14,13 @@ #include 
"paddle/fluid/inference/api/analysis_predictor.h" #include +#include +#include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { @@ -27,10 +30,11 @@ bool AnalysisPredictor::Init( VLOG(3) << "Predictor::init()"; if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); + LOG(WARNING) << "ir optimize only supports CPU currently"; + config_.enable_ir_optim = false; } else { place_ = paddle::platform::CPUPlace(); } - PADDLE_ENFORCE(!parent_scope); if (parent_scope) { scope_ = parent_scope; sub_scope_ = &(parent_scope->NewScope()); @@ -72,7 +76,7 @@ bool AnalysisPredictor::Init( void AnalysisPredictor::OptimizeInferenceProgram() { LOG(INFO) << "optimize begin"; - FLAGS_IA_enable_ir = true; + FLAGS_IA_enable_ir = config_.enable_ir_optim; FLAGS_IA_enable_tensorrt_subgraph_engine = false; FLAGS_IA_output_storage_path = ""; // Don't output the model. // Analyze inference_program @@ -89,24 +93,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } argument_.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); - Analyzer().Run(&argument_); + PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude, + "Only kExclude is supported yet."); + Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_); + CHECK(argument_.transformed_program_desc); VLOG(5) << "to prepare executor"; - // LOG(INFO) << "transformed_parogram_desc " << - // argument.transformed_program_desc->DebugString(); inference_program_.reset( new framework::ProgramDesc(*argument_.transformed_program_desc)); - PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr)); - // Update scope. - scope_.reset( - argument_.Release(framework::ir::kParamScopeAttr)); - LOG(INFO) << "optimize end =="; + if (argument_.Has(framework::ir::kParamScopeAttr)) { + // Update scope. + scope_.reset( + argument_.Release(framework::ir::kParamScopeAttr)); + } + LOG(INFO) << "== optimize end =="; } template <> std::unique_ptr CreatePaddlePredictor< - NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) { - VLOG(3) << "create NativePredictor"; + AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) { + VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. GPU memeroy PADDLE_ENFORCE_GT( @@ -133,7 +139,3 @@ std::unique_ptr CreatePaddlePredictor< } } // namespace paddle - -USE_PASS(fc_fuse_pass); -USE_PASS(graph_viz_pass); -USE_PASS(infer_clean_graph_pass); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index e32b6185f6044ab3577bde0a8f8dcf2391688aa8..e53925366e9214cd60422efe56884751297c15e5 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include +#include #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -28,7 +30,7 @@ using framework::proto::ProgramDesc; */ class AnalysisPredictor : public NativePaddlePredictor { public: - explicit AnalysisPredictor(const NativeConfig& config) + explicit AnalysisPredictor(const AnalysisConfig& config) : NativePaddlePredictor(config), config_(config) {} bool Init(const std::shared_ptr& parent_scope); @@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor { Argument& analysis_argument() { return argument_; } private: - NativeConfig config_; + AnalysisConfig config_; Argument argument_; }; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 38b11d9113e4b03f8365b969009f7a385a683a70..bd9b4b1a814f995e3979105f5b9830b95fd8ea7d 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -176,7 +176,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, framework::Scope *scope) { VLOG(3) << "Predictor::set_feed"; if (inputs.size() != feeds_.size()) { - LOG(ERROR) << "wrong feed input size."; + LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " + << inputs.size(); return false; } for (size_t i = 0; i < inputs.size(); ++i) { diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 7824ef2649af81a2390ff3bc537eb7c93c70e402..0f7d541c5edfc62e80cf50f83b491f06dcb42644 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -14,7 +14,7 @@ else fi PREFIX=inference-vis-demos%2F -URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} +URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX} # download vis_demo data function download() { diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index bdc9a15d543818da94ac2acf34ecabbbbae3291e..2c2ac656e8005369bb0e9033236a431cb09caa15 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -14,8 +14,10 @@ #pragma once +#include #include #include +#include #include #include #include @@ -87,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor, } } +std::string DescribeTensor(const PaddleTensor &tensor) { + std::stringstream os; + os << "Tensor [" << tensor.name << "]\n"; + os << " - type: "; + switch (tensor.dtype) { + case PaddleDType::FLOAT32: + os << "float32"; + break; + case PaddleDType::INT64: + os << "int64"; + break; + default: + os << "unset"; + } + os << '\n'; + + os << " - shape: " << to_string(tensor.shape) << '\n'; + os << " - lod: "; + for (auto &l : tensor.lod) { + os << to_string(l) << "; "; + } + os << "\n"; + os << " - data: "; + + int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, + [](int a, int b) { return a * b; }); + for (int i = 0; i < dim; i++) { + os << static_cast(tensor.data.data())[i] << " "; + } + os << '\n'; + return os.str(); +} + +void PrintTime(int batch_size, int repeat, int num_threads, int tid, + double latency) { + LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat + << ", threads: " << num_threads << ", thread id: " << tid + << ", latency: " << latency << "ms"; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 
1baa64c249f291ec1bc874be5031abe6d4368274..995da11e4a30eca72a91a53d3293aa8b033b012b 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig { int workspace_size{1 << 30}; }; +// NOTE WIP, not stable yet. +struct AnalysisConfig : public NativeConfig { + // + enum class IrPassMode { + kSystem, // Use system default passes, not customize. + kInclude, // Specify the passes in `ir_passes`. + kExclude // Specify the disabled passes in `ir_passes`. + }; + + bool enable_ir_optim = true; + IrPassMode ir_mode{IrPassMode::kExclude}; + // attention lstm fuse works only on some specific models, disable as default. + std::vector ir_passes{"attention_lstm_fuse_pass"}; +}; + // A factory to help create different predictors. // // FOR EXTENSION DEVELOPER: diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map index 5203784dc1fcb672eb6a26d9dfd3ffbe02e08038..7e5cae04b81e6ce759b92f6c4b921ecf974e8260 100644 --- a/paddle/fluid/inference/paddle_fluid.map +++ b/paddle/fluid/inference/paddle_fluid.map @@ -1,6 +1,7 @@ { global: *paddle*; + *Pass*; local: *; }; diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index 5edecd18e673da326ec119cf9a383f24f8045089..dfaa7456f917c1308984b361afed752f96ea6f59 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/auc_op.h" -#include namespace paddle { namespace operators { @@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(predict_height, label_height, "Out and Label should have same height."); - int num_thres = ctx->Attrs().Get("num_thresholds"); + int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; ctx->SetOutputDim("AUC", {1}); - ctx->SetOutputDim("TPOut", {num_thres}); - ctx->SetOutputDim("TNOut", {num_thres}); - ctx->SetOutputDim("FPOut", {num_thres}); - ctx->SetOutputDim("FNOut", {num_thres}); - - ctx->ShareLoD("Predict", /*->*/ "AUC"); + ctx->SetOutputDim("BatchAUC", {1}); + ctx->SetOutputDim("StatPosOut", {num_pred_buckets}); + ctx->SetOutputDim("StatNegOut", {num_pred_buckets}); } protected: @@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "A 2D int tensor indicating the label of the training data. 
" "shape: [batch_size, 1]"); - AddInput("TP", "True-Positive value."); - AddInput("FP", "False-Positive value."); - AddInput("TN", "True-Negative value."); - AddInput("FN", "False-Negative value."); // TODO(typhoonzero): support weight input + AddInput("StatPos", "Statistic value when label = 1"); + AddInput("StatNeg", "Statistic value when label = 0"); + AddOutput("AUC", "A scalar representing the " "current area-under-the-curve."); - AddOutput("TPOut", "True-Positive value."); - AddOutput("FPOut", "False-Positive value."); - AddOutput("TNOut", "True-Negative value."); - AddOutput("FNOut", "False-Negative value."); + AddOutput("BatchAUC", "The AUC for current batch"); + AddOutput("StatPosOut", "Statistic value when label = 1"); + AddOutput("StatNegOut", "Statistic value when label = 0"); AddAttr("curve", "Curve type, can be 'ROC' or 'PR'.") .SetDefault("ROC"); + AddAttr("num_thresholds", "The number of thresholds to use when discretizing the" " roc curve.") - .SetDefault(200); + .SetDefault((2 << 12) - 1); AddComment(R"DOC( Area Under The Curve (AUC) Operator. diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index 0a18585edb54a76aff5ae72ecc71e0eebb9f9361..fb0517d70635e090f8c5b59ff9d8420fc34c747b 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include #include -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -23,106 +23,85 @@ namespace operators { using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - template class AucKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* predict = ctx.Input("Predict"); - auto* label = ctx.Input("Label"); - auto* auc = ctx.Output("AUC"); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *predict = ctx.Input("Predict"); + auto *label = ctx.Input("Label"); + + std::string curve = ctx.Attr("curve"); + int num_thresholds = ctx.Attr("num_thresholds"); + int num_pred_buckets = num_thresholds + 1; + // Only use output var for now, make sure it's persistable and // not cleaned up for each batch. 
- auto* true_positive = ctx.Output("TPOut"); - auto* false_positive = ctx.Output("FPOut"); - auto* true_negative = ctx.Output("TNOut"); - auto* false_negative = ctx.Output("FNOut"); + auto *auc = ctx.Output("AUC"); + auto *stat_pos = ctx.Output("StatPosOut"); + auto *stat_neg = ctx.Output("StatNegOut"); - auto* auc_data = auc->mutable_data(ctx.GetPlace()); + auto *stat_pos_data = stat_pos->mutable_data(ctx.GetPlace()); + auto *stat_neg_data = stat_neg->mutable_data(ctx.GetPlace()); + calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds, + auc); - std::string curve = ctx.Attr("curve"); - int num_thresholds = ctx.Attr("num_thresholds"); - std::vector thresholds_list; - thresholds_list.reserve(num_thresholds); - for (int i = 1; i < num_thresholds - 1; i++) { - thresholds_list[i] = static_cast(i) / (num_thresholds - 1); - } - const double kEpsilon = 1e-7; - thresholds_list[0] = 0.0f - kEpsilon; - thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon; + auto *batch_auc = ctx.Output("BatchAUC"); + std::vector stat_pos_batch(num_pred_buckets, 0); + std::vector stat_neg_batch(num_pred_buckets, 0); + calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(), + num_thresholds, batch_auc); + } + private: + inline static double trapezoidArea(double X1, double X2, double Y1, + double Y2) { + return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; + } + + inline static void calcAuc(const framework::ExecutionContext &ctx, + const framework::Tensor *label, + const framework::Tensor *predict, + int64_t *stat_pos, int64_t *stat_neg, + int num_thresholds, + framework::Tensor *auc_tensor) { size_t batch_size = predict->dims()[0]; size_t inference_width = predict->dims()[1]; + const T *inference_data = predict->data(); + const auto *label_data = label->data(); + + auto *auc = auc_tensor->mutable_data(ctx.GetPlace()); - const T* inference_data = predict->data(); - const auto* label_data = label->data(); - - auto* tp_data = true_positive->mutable_data(ctx.GetPlace()); - auto* fn_data = false_negative->mutable_data(ctx.GetPlace()); - auto* tn_data = true_negative->mutable_data(ctx.GetPlace()); - auto* fp_data = false_positive->mutable_data(ctx.GetPlace()); - - for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) { - // calculate TP, FN, TN, FP for current thresh - int64_t tp = 0, fn = 0, tn = 0, fp = 0; - for (size_t i = 0; i < batch_size; i++) { - // NOTE: label_data used as bool, labels > 0 will be treated as true. - if (label_data[i]) { - if (inference_data[i * inference_width + 1] >= - (thresholds_list[idx_thresh])) { - tp++; - } else { - fn++; - } - } else { - if (inference_data[i * inference_width + 1] >= - (thresholds_list[idx_thresh])) { - fp++; - } else { - tn++; - } - } + for (size_t i = 0; i < batch_size; i++) { + uint32_t binIdx = static_cast( + inference_data[i * inference_width + 1] * num_thresholds); + if (label_data[i]) { + stat_pos[binIdx] += 1.0; + } else { + stat_neg[binIdx] += 1.0; } - // store rates - tp_data[idx_thresh] += tp; - fn_data[idx_thresh] += fn; - tn_data[idx_thresh] += tn; - fp_data[idx_thresh] += fp; } - // epsilon to avoid divide by zero. - double epsilon = 1e-6; - // Riemann sum to caculate auc. 
- Tensor tp_rate, fp_rate, rec_rate; - tp_rate.Resize({num_thresholds}); - fp_rate.Resize({num_thresholds}); - rec_rate.Resize({num_thresholds}); - auto* tp_rate_data = tp_rate.mutable_data(ctx.GetPlace()); - auto* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace()); - auto* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace()); - for (int i = 0; i < num_thresholds; i++) { - tp_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / - (tp_data[i] + fn_data[i] + epsilon); - fp_rate_data[i] = - static_cast(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon); - rec_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / - (tp_data[i] + fp_data[i] + epsilon); + + *auc = 0.0f; + + double totPos = 0.0; + double totNeg = 0.0; + double totPosPrev = 0.0; + double totNegPrev = 0.0; + + int idx = num_thresholds; + + while (idx >= 0) { + totPosPrev = totPos; + totNegPrev = totNeg; + totPos += stat_pos[idx]; + totNeg += stat_neg[idx]; + *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); + + --idx; } - *auc_data = 0.0f; - if (curve == "ROC") { - for (int i = 0; i < num_thresholds - 1; i++) { - auto dx = fp_rate_data[i] - fp_rate_data[i + 1]; - auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f; - *auc_data = *auc_data + dx * y; - } - } else if (curve == "PR") { - for (int i = 1; i < num_thresholds; i++) { - auto dx = tp_rate_data[i] - tp_rate_data[i - 1]; - auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f; - *auc_data = *auc_data + dx * y; - } + + if (totPos > 0.0 && totNeg > 0.0) { + *auc = *auc / totPos / totNeg; } } }; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 66784f0b5149a7c479a90a407709d993f4a40a8b..31159a02592a2aff75f7ecf5be924989f0f47071 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -39,19 +39,6 @@ bool RequestSendHandler::Handle(const std::string& varname, const std::string& out_var_name) { VLOG(4) << "RequestSendHandler:" << varname; - // Async - if (!sync_mode_) { - rpc_server_->Profiler().OneStep(); - try { - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), - scope); - } catch (std::exception& e) { - LOG(ERROR) << "async: run sub program error " << e.what(); - return false; - } - return true; - } - // Sync if (varname == BATCH_BARRIER_MESSAGE) { VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; @@ -60,17 +47,31 @@ bool RequestSendHandler::Handle(const std::string& varname, VLOG(3) << "sync: recv complete message"; rpc_server_->Complete(); } else { - VLOG(3) << "sync: received var_name: " << varname; - rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; - - if (invar == nullptr) { - LOG(FATAL) << "sync: Can not find server side var: " << varname; - return false; - } - if (invar->IsType()) { - std::unique_lock lock(mutex_sparse_vars_); - sparse_vars_.push_back(invar); + // Async + if (!sync_mode_) { + VLOG(3) << "async process var: " << varname; + rpc_server_->Profiler().OneStep(); + try { + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); + } catch (std::exception& e) { + LOG(ERROR) << "async: run sub program error " << e.what(); + return false; + } + return true; + } else { // sync + rpc_server_->WaitCond(kRequestSend); + VLOG(3) << "sync: processing received var: " << varname; + + if (invar == nullptr) { + LOG(FATAL) << "sync: Can not find server side var: " << varname; + return false; + } + + 
if (invar->IsType()) { + std::unique_lock lock(mutex_sparse_vars_); + sparse_vars_.push_back(invar); + } } } return true; diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 7c65d6dba7d67b5d31720bae1f4877dd22210138..a0ff6396210c2b3a7f8bd6b9f274b875d7fd4933 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -119,7 +119,8 @@ struct FindRangeAbsMaxFunctor { const framework::Tensor& last_scale, const framework::Tensor& iter, const int window_size, framework::Tensor* scales_arr, framework::Tensor* out_scale) { - auto& gpu_place = boost::get(ctx.GetPlace()); + const auto gpu_place = boost::get(ctx.GetPlace()); + T* scale_arr = scales_arr->mutable_data(gpu_place); T* out_scale_data = out_scale->mutable_data(gpu_place); diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index fdda01381e117cecffb2a05f8399f3ad82a46339..8e80dc0e641c443923076c31e269689b5bc134a7 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase { } }; +// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten, +// the XShape is used to carry the shape and lod of X which will be used in +// flatten_grad, in this way, the framework can reuse the memory of X +// immediately the flatten2_op is finished. +// Considering compatibility issues, we could not fix flatten2_op +class Flatten2OpInferShape : public FlattenOpInferShape { + public: + void operator()(framework::InferShapeContext *ctx) const override { + FlattenOpInferShape::operator()(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output (XShape) of Flatten op should not be null."); + const auto &in_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(in_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < in_dims.size(); ++i) { + xshape_dims[i + 1] = in_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); + ctx->ShareLoD("X", "XShape"); + } +}; + +class Flatten2Op : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axis = Attr("axis"); + auto in_dims = + scope.FindVar(Input("X"))->Get().dims(); + const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims); + + framework::AttributeMap attrs; + attrs["shape"] = out_dims; + attrs["inplace"] = false; + // Invoke Reshape Op + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class Flatten2OpMaker : public FlattenOpMaker { + public: + void Make() override { + FlattenOpMaker::Make(); + AddOutput("XShape", + "XShape is just used to store the shape and lod of X, which will " + "be used in FlattenGradOp.") + .AsIntermediate(); + } +}; + +class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("flatten2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + 
grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Flatten2GradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("XShape"), + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto xshape_dims = context->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + context->SetOutputDim(framework::GradVarName("X"), x_dims); + context->ShareLoD("XShape", framework::GradVarName("X")); + } +}; + +class Flatten2GradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto xshape_name = Input("XShape"); + auto xshape_dims = + scope.FindVar(xshape_name)->Get().dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + attrs["inplace"] = false; + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {dout_name}}, {"Shape", {}}}, + {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs); + reshape_op->Run(scope, place); + } +}; + } // namespace operators } // namespace paddle @@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, ops::FlattenOpInferShape, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape); + +REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker, + ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker); +REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, + ops::Flatten2GradInferShape); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index f91236975d0cf0c89a464188bd6ea1b5b01e0f6d..104e160e2d7069ec247cc51e927ce8824f1b69e8 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -89,12 +89,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(b_dims[0], 1, "The first dimension of Input(Bias) should be 1."); - PADDLE_ENFORCE(!ctx->Attrs().Get("use_peepholes"), - "Do not support peephole yet."); - PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + auto use_peepholes = ctx->Attrs().Get("use_peepholes"); + PADDLE_ENFORCE_EQ(b_dims[1], (use_peepholes ? 
7 : 4) * frame_size, "The second dimension of Input(Bias) should be " - "4 * %d if disable peepholes connection", - frame_size); + "7 * %d if enable peepholes connection or" + "4 * %d if disable peepholes", + frame_size, frame_size); framework::DDim out_dims({x_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); @@ -232,16 +232,17 @@ class FuisonLSTMKernel : public framework::OpKernel { act_cand = act_functor(act_cand_str); \ } -#define INIT_BASE_INPUT_OUTPUT \ - auto* x = ctx.Input("X"); \ - auto* h0 = ctx.Input("H0"); \ - auto* c0 = ctx.Input("C0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ - auto* xx = ctx.Output("XX"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - auto* cell_out = ctx.Output("Cell"); \ +#define INIT_BASE_INPUT_OUTPUT \ + auto* x = ctx.Input("X"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); \ bool is_reverse = ctx.Attr("is_reverse"); #define INIT_BASE_SIZES \ @@ -266,12 +267,21 @@ class FuisonLSTMKernel : public framework::OpKernel { const T* x_data = x->data(); const T* h0_data = h0 ? h0->data() : nullptr; const T* c0_data = c0 ? c0->data() : nullptr; + const T* bias_data = bias->data(); + const T* wc_data = bias_data + D4; // w_ic, w_fc, w_oc const T* wx_data = wx->data(); const T* wh_data = wh->data(); + T* xx_data = xx->mutable_data(ctx.GetPlace()); T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); T* cell_out_data = cell_out->mutable_data(ctx.GetPlace()); + // use local variable + framework::DDim check_dims({3, D}); + Tensor checked_cell; // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct + auto checked_cell_data = + checked_cell.mutable_data(check_dims, ctx.GetPlace()); + auto blas = math::GetBlas(ctx); math::FCCompute(blas, total_T, D4, M, x_data, wx_data, xx_data, bias->data()); @@ -297,46 +307,86 @@ class FuisonLSTMKernel : public framework::OpKernel { int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; const T* prev_c_data = nullptr; const T* prev_h_data = nullptr; + int tstart = 0; if (h0_data) { prev_h_data = h0_data + bid * D; prev_c_data = c0_data + bid * D; } else { - // W_ch, W_ih, W_fh, W_oh - act_gate(D3, xx_data + D, xx_data + D); + // If step == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros. 
Then W_h * H_t-1 can be skipped + + // ~C_t act_cand(D, xx_data, xx_data); - // cell out= input*tilde + if (use_peepholes) { + // I_t, F_t + act_gate(D2, xx_data + D, xx_data + D); + } else { + // I_t, F_t, O_t + act_gate(D3, xx_data + D, xx_data + D); + } + // C_t = I_t * ~C_t blas.VMUL(D, xx_data, xx_data + D, cell_out_data); + + if (use_peepholes) { + // + W_oc * C_t for peephole connection + blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2); + blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3); + // O_t + act_gate(D, xx_data + D3, xx_data + D3); + } + // hidden out= act_state(cellout) * outgate act_cell(D, cell_out_data, xx_data + D2); + // H_t = O_t * act_state(C_t) blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); // prev prev_h_data = hidden_out_data; prev_c_data = cell_out_data; - tstart = 1; + tstart = 1; move_step(); } + for (int step = tstart; step < seq_len; ++step) { + // + W_h * H_t-1 blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast(1), prev_h_data, D, wh_data, D4, static_cast(1), xx_data, D4); - // W_ch, W_ih, W_fh, W_oh - act_gate(D3, xx_data + D, xx_data + D); + // ~C_t act_cand(D, xx_data, xx_data); - // a = forget * prev_cell + if (use_peepholes) { + // + W_ic|W_fc * C_t-1 for peephole connection + blas.VMUL(D, wc_data, prev_c_data, checked_cell_data); + blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D); + blas.VADD(D2, xx_data + D, checked_cell_data, xx_data + D); + // I_t, F_t + act_gate(D2, xx_data + D, xx_data + D); + } else { + // I_t, F_t, O_t + act_gate(D3, xx_data + D, xx_data + D); + } + + // F_t * C_t-1 blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2); - - // b = input * tilde + // I_t * ~C_t blas.VMUL(D, xx_data, xx_data + D, xx_data + D); - - // cell out= a+b + // C_t = F_t * C_t-1 + I_t * ~C_t blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data); + if (use_peepholes) { + // + W_oc * C_t for peephole connection + blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2); + blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3); + // O_t + act_gate(D, xx_data + D3, xx_data + D3); + } + // hidden out= act_state(cellout) * outgate act_cell(D, cell_out_data, xx_data + D2); + // H_t = O_t * act_state(C_t) blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); // prev @@ -344,14 +394,14 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_c_data = cell_out_data; move_step(); - } - } + } // for each step in batch + } // for each batch } void BatchCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = platform::CPUDeviceContext; INIT_BASE_INPUT_OUTPUT - if (x->lod()[0].size() == 2) { + if (x->lod()[0].size() == 2) { // batch size == 1 SeqCompute(ctx); return; } @@ -367,6 +417,8 @@ class FuisonLSTMKernel : public framework::OpKernel { const T* x_data = x->data(); const T* wx_data = wx->data(); const T* wh_data = wh->data(); + const T* bias_data = bias->data(); + const T* wc_data = bias_data + D4; // w_ic, w_fc, w_oc auto place = ctx.GetPlace(); T* xx_data = xx->mutable_data(place); T* batched_input_data = batched_input->mutable_data(place); @@ -375,6 +427,12 @@ class FuisonLSTMKernel : public framework::OpKernel { hidden_out->mutable_data(place); cell_out->mutable_data(place); + // use local variable + framework::DDim check_dims({3, D}); + Tensor checked_cell; // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct + auto checked_cell_data = + checked_cell.mutable_data(check_dims, ctx.GetPlace()); + math::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = 
ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); @@ -396,17 +454,27 @@ class FuisonLSTMKernel : public framework::OpKernel { reordered_h0->Resize({max_bs, D}); reordered_c0->Resize({max_bs, D}); + T* prev_batch_h_data = nullptr; + T* prev_batch_c_data = nullptr; + T* cur_batch_in_data = batched_input_data; + T* cur_batch_h_out_data = batched_h_out_data; + T* cur_batch_c_out_data = batched_c_out_data; + + auto move_step = [&](int bs) { + cur_batch_in_data += bs * D4; + cur_batch_c_out_data += bs * D; + cur_batch_h_out_data += bs * D; + }; + int tstart = 0; - T* prev_h_data = nullptr; - T* prev_c_data = nullptr; if (h0) { // reorder h0, c0 T* reordered_h0_data = reordered_h0->mutable_data(place); T* reordered_c0_data = reordered_c0->mutable_data(place); const T* h0_data = h0->data(); const T* c0_data = c0->data(); - prev_h_data = reordered_h0_data; - prev_c_data = reordered_c0_data; + prev_batch_h_data = reordered_h0_data; + prev_batch_c_data = reordered_c0_data; size_t sz = sizeof(T) * D; for (int i = 0; i < max_bs; ++i) { std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); @@ -415,71 +483,122 @@ class FuisonLSTMKernel : public framework::OpKernel { reordered_c0_data += D; } } else { - // compute without h0, c0 - T* cur_in_data = batched_input_data; - T* cur_h_out_data = batched_h_out_data; - T* cur_c_out_data = batched_c_out_data; - // W_ch, W_ih, W_fh, W_oh - for (int i = 0; i < max_bs; ++i) { - act_gate(D3, cur_in_data + D, cur_in_data + D); + // Compute with no H0/C0 + T* cur_in_data = cur_batch_in_data; + T* cur_c_out_data = cur_batch_c_out_data; + T* cur_h_out_data = cur_batch_h_out_data; + + // If step == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros. Then W_h * H_t-1 can be skiped + + for (int i = 0; i < max_bs; ++i) { // iterate each data in 1st batch + // ~C_t act_cand(D, cur_in_data, cur_in_data); - // cell out= input*tilde + + if (use_peepholes) { + // I_t, F_t + act_gate(D2, cur_in_data + D, cur_in_data + D); + } else { + // I_t, F_t, O_t + act_gate(D3, cur_in_data + D, cur_in_data + D); + } + + // C_t = I_t * ~C_t blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data); + + if (use_peepholes) { + // + W_oc * C_t for peephole connection + blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2); + blas.VADD(D, cur_in_data + D3, checked_cell_data + D2, + cur_in_data + D3); + // O_t + act_gate(D, cur_in_data + D3, cur_in_data + D3); + } + // hidden out= act_state(cellout) * outgate act_cell(D, cur_c_out_data, cur_in_data + D2); + // H_t = O_t * act_state(C_t) blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data); - // add offset + // move to next data in the same batch cur_in_data += D4; cur_c_out_data += D; cur_h_out_data += D; } + + // move to data for next timestep + prev_batch_h_data = cur_batch_h_out_data; + prev_batch_c_data = cur_batch_c_out_data; + move_step(max_bs); tstart = 1; - prev_h_data = batched_h_out_data; - prev_c_data = batched_c_out_data; } - // Then start from next + const auto& batch_starts = batched_lod[0]; const int max_seq_len = batch_starts.size() - 1; - const int offset = tstart * max_bs * D; - batched_input_data = batched_input_data + offset * 4; - batched_h_out_data = batched_h_out_data + offset; - batched_c_out_data = batched_c_out_data + offset; for (int step = tstart; step < max_seq_len; ++step) { const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + // + W_h * H_t-1 blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast(1), - prev_h_data, 
D, wh_data, D4, static_cast(1), - batched_input_data, D4); - - T* cur_in_data = batched_input_data; - T* cur_prev_c_data = prev_c_data; - T* cur_c_out_data = batched_c_out_data; - T* cur_h_out_data = batched_h_out_data; - for (int i = 0; i < cur_bs; ++i) { - // W_ch, W_ih, W_fh, W_oh - act_gate(D3, cur_in_data + D, cur_in_data + D); + prev_batch_h_data, D, wh_data, D4, static_cast(1), + cur_batch_in_data, D4); + + T* cur_in_data = cur_batch_in_data; + T* cur_c_out_data = cur_batch_c_out_data; + T* cur_h_out_data = cur_batch_h_out_data; + T* prev_c_data = prev_batch_c_data; // NULL if no C0 in step0 + T* prev_h_data = prev_batch_h_data; // NULL if no H0 in step0 + auto next_data_in_batch = [&]() { + cur_in_data += D4; + cur_c_out_data += D; + cur_h_out_data += D; + prev_c_data = prev_c_data ? prev_c_data + D : nullptr; + prev_h_data = prev_h_data ? prev_h_data + D : nullptr; + }; + + for (int i = 0; i < cur_bs; ++i) { // iterate each data in same batch + // ~C_t act_cand(D, cur_in_data, cur_in_data); - // a = forget * prev_cell - blas.VMUL(D, cur_in_data + D2, cur_prev_c_data, cur_in_data + D2); - // b = input * tilde + + if (use_peepholes) { + // + W_ic|W_fc * C_t-1 for peephole connection + blas.VMUL(D, wc_data, prev_c_data, checked_cell_data); + blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D); + blas.VADD(D2, cur_in_data + D, checked_cell_data, cur_in_data + D); + // I_t, F_t + act_gate(D2, cur_in_data + D, cur_in_data + D); + } else { + // I_t, F_t, O_t + act_gate(D3, cur_in_data + D, cur_in_data + D); + } + + // F_t * C_t-1 + blas.VMUL(D, cur_in_data + D2, prev_c_data, cur_in_data + D2); + // I_t * ~C_t blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D); - // cell out= a+b + // C_t = F_t * C_t-1 + I_t * ~C_t blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data); + + if (use_peepholes) { + // + W_oc * C_t for peephole connection + blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2); + blas.VADD(D, cur_in_data + D3, checked_cell_data + D2, + cur_in_data + D3); + // O_t + act_gate(D, cur_in_data + D3, cur_in_data + D3); + } + // hidden out= act_state(cellout) * outgate act_cell(D, cur_c_out_data, cur_in_data + D2); + // H_t = O_t * act_state(C_t) blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data); - cur_in_data += D4; - cur_prev_c_data += D; - cur_c_out_data += D; - cur_h_out_data += D; + // move to next data in same batch + next_data_in_batch(); } - - prev_c_data = batched_c_out_data; - prev_h_data = batched_h_out_data; - batched_c_out_data = cur_c_out_data; - batched_h_out_data = cur_h_out_data; - batched_input_data = cur_in_data; + // move to data for next timestep + prev_batch_h_data = cur_batch_h_out_data; + prev_batch_c_data = cur_batch_c_out_data; + move_step(cur_bs); } math::Batch2LoDTensorFunctor to_seq; diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index f18d09d33e9052929b1ff9b36bb2b371fb513d37..451ec61ba1f7239d92c6dfbad0b2961e74e1bc17 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel { gate_data, frame_size * 3); // calculate activited gate - Eigen::array extents = {batch_size, frame_size}; - Eigen::array u_offsets = {0, 0}; + Eigen::array extents{{batch_size, frame_size}}; + Eigen::array u_offsets{{0, 0}}; ActCompute(context.Attr("gate_activation"), place, g.slice(u_offsets, extents), g.slice(u_offsets, extents)); auto u = g.slice(u_offsets, extents); // update 
gate - Eigen::array r_offsets = {0, frame_size}; + Eigen::array r_offsets{{0, frame_size}}; ActCompute(context.Attr("gate_activation"), place, g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate @@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel { weight_data + frame_size * frame_size * 2, frame_size, 1, gate_data + frame_size * 2, frame_size * 3); - Eigen::array c_offsets = {0, frame_size * 2}; + Eigen::array c_offsets{{0, frame_size * 2}}; ActCompute(context.Attr("activation"), place, g.slice(c_offsets, extents), g.slice(c_offsets, extents)); auto c = g.slice(c_offsets, extents); // output candidate @@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel { int batch_size = input->dims()[0]; int frame_size = hidden_prev->dims()[1]; - Eigen::array extents = {batch_size, frame_size}; - Eigen::array u_offsets = {0, 0}; + Eigen::array extents{{batch_size, frame_size}}; + Eigen::array u_offsets{{0, 0}}; auto u = g.slice(u_offsets, extents); // update gate - Eigen::array r_offsets = {0, frame_size}; + Eigen::array r_offsets{{0, frame_size}}; auto r = g.slice(r_offsets, extents); // reset gate - Eigen::array c_offsets = {0, frame_size * 2}; + Eigen::array c_offsets{{0, frame_size * 2}}; auto c = g.slice(c_offsets, extents); // output candidate // backward for unactivated update gate diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 0886c41a1b582881faf24f5531d414db4e4db71c..22343d7724b2f0dc01bff8c2274e3dd914bf70ef 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -67,27 +67,27 @@ template __global__ void LayerNormForward(const T *x, const T *scale, const T *bias, T *y, T *mean, T *var, float epsilon, int feature_size) { - using BlockReduce = cub::BlockReduce, BlockDim>; + using BlockReduce = cub::BlockReduce, BlockDim>; __shared__ typename BlockReduce::TempStorage temp_storage; int beg_idx = blockIdx.x * feature_size + threadIdx.x; int end_idx = (blockIdx.x + 1) * feature_size; // Step 1: Reduce to calculate mean and var - T mean_val = static_cast(0); - T var_val = static_cast(0); + double mean_val = 0; + double var_val = 0; for (int i = beg_idx; i < end_idx; i += BlockDim) { T tmp = x[i]; mean_val += tmp; var_val += (tmp * tmp); } auto pair = BlockReduce(temp_storage) - .Reduce(PairForLayerNorm(mean_val, var_val), - PairForLayerNormAddFunctor()); + .Reduce(PairForLayerNorm(mean_val, var_val), + PairForLayerNormAddFunctor()); if (threadIdx.x == 0) { auto tmp = pair.first_ / feature_size; - mean[blockIdx.x] = tmp; - var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp; + mean[blockIdx.x] = static_cast(tmp); + var[blockIdx.x] = static_cast(pair.second_ / feature_size - tmp * tmp); } __syncthreads(); mean_val = mean[blockIdx.x]; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index f5c10ced8305b64c6386c5051804f8c9a8f71802..58463dc4d6fd7cc3454de766814a947fee161070 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel { memset(output + i * row_width, 0, row_width * sizeof(T)); } else { PADDLE_ENFORCE_LT(ids[i], row_number); - PADDLE_ENFORCE_GE(ids[i], 0); + PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); memcpy(output + i * row_width, table + ids[i] * row_width, row_width * sizeof(T)); } diff --git a/paddle/fluid/operators/reshape_op.cc 
b/paddle/fluid/operators/reshape_op.cc index a1dfe39c3a4f84f5e4aaa2306813a7decf0e49ea..d72f85f2c44db2fa887732cfc05e1376a6a79e4a 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -246,6 +246,88 @@ class ReshapeGradKernel { } }; +// FIXME(zcd): reshape2 adds an intermediate output(XShape) based on reshape, +// the XShape is used to carry the shape and lod of X which will be used in +// reshape_grad, in this way, the framework can reuse the memory of X +// immediately the reshape_op is finished. +// Considering compatibility issues, we could not fix reshape_op +class Reshape2Op : public ReshapeOp { + public: + Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReshapeOp(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + ReshapeOp::InferShape(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) of ReshapeOp should not be null."); + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); + } +}; + +class Reshape2OpMaker : public ReshapeOpMaker { + public: + void Make() override { + ReshapeOpMaker::Make(); + AddOutput("XShape", + "XShape is just used to store the shape and lod of X, which will " + "be used in FlattenGradOp.") + .AsIntermediate(); + } +}; + +class Reshape2GradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("reshape2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Reshape2GradOp : public framework::OperatorWithKernel { + public: + Reshape2GradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto xshape_dims = ctx->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("XShape", framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Out")) + ->type()), + ctx.device_context()); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; @@ -261,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel); +REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, + 
ops::Reshape2GradMaker); +REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); + #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, @@ -269,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc index 919ebe48ca38040274bd2052b95ef96eccff4db6..2f773f222e50a440801b06a4fd997bf237b34772 100644 --- a/paddle/fluid/operators/rmsprop_op.cc +++ b/paddle/fluid/operators/rmsprop_op.cc @@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(param_out) of RmspropOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), - "Output(Momentum_out) of RmspropOp should not be null."); + "Output(MomentOut) of RmspropOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), "Output(MeanSquareOut) of RmspropOp should not be null."); + if (ctx->Attrs().Get("centered")) { + PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"), + "Output(MeanGradOut) of RmspropOp should not be null."); + } auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( @@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim); ctx->SetOutputDim("MeanSquareOut", param_dim); + if (ctx->Attrs().Get("centered")) { + ctx->SetOutputDim("MeanGradOut", param_dim); + } } }; @@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("MeanSquare", "(Tensor, default Tensor)" " The mean square value that gets updated."); + AddInput("MeanGrad", + "(Tensor, default Tensor)" + " The moving average of gradient") + .AsDispensable(); AddInput("LearningRate", "(Tensor, default Tensor) " "The learning rate should be a tensor of size 1."); @@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("ParamOut", "(Tensor) Output updated parameter value."); AddOutput("MomentOut", "(Tensor) Output updated moment."); AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); + AddOutput("MeanGradOut", + "(Tensor) Output moving average of gradient updated value."); AddAttr("epsilon", "(float, default 1e-10) Constant " @@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0.9f); AddAttr("momentum", "(float, default 0.0) Constant value.") .SetDefault(0.0f); + AddAttr("centered", "(bool, default false) use centered rmsprop.") + .SetDefault(false); AddComment(R"DOC( Rmsprop Optimizer. 
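
The rmsprop_op.cc / rmsprop_op.h hunks in this patch add an optional centered mode to the RMSProp operator: a new dispensable MeanGrad input, a MeanGradOut output, and a bool "centered" attribute that switches the update rule. As a rough illustration only, the following is a minimal scalar sketch of both update rules, using hypothetical names and the attribute defaults shown in the hunks (epsilon 1e-10, decay 0.9, momentum 0.0); the real kernel in rmsprop_op.h operates on Eigen tensors rather than scalars.

#include <cmath>

// Hedged sketch of the RMSProp update this patch documents; names are
// illustrative and do not come from the Paddle code base.
struct RmspropState {
  double mean_square = 0.0;  // running average of grad^2 (MeanSquare)
  double mean_grad = 0.0;    // running average of grad (MeanGrad, centered only)
  double moment = 0.0;       // momentum buffer (Moment)
};

double RmspropStep(double param, double grad, double lr, RmspropState* s,
                   double decay = 0.9, double momentum = 0.0,
                   double epsilon = 1e-10, bool centered = false) {
  // ms_t = decay * ms_{t-1} + (1 - decay) * g^2
  s->mean_square = decay * s->mean_square + (1.0 - decay) * grad * grad;
  if (centered) {
    // Centered variant: also track the running mean of the gradient and
    // subtract its square before taking the square root.
    s->mean_grad = decay * s->mean_grad + (1.0 - decay) * grad;
    s->moment = momentum * s->moment +
                lr * grad /
                    std::sqrt(s->mean_square - s->mean_grad * s->mean_grad +
                              epsilon);
  } else {
    s->moment = momentum * s->moment +
                lr * grad / std::sqrt(s->mean_square + epsilon);
  }
  return param - s->moment;
}

With centered == false this reduces to the existing update; the centered branch matches the mg_out / mom_out computation added to rmsprop_op.h further down in this patch.
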
@@ -103,6 +118,14 @@ MomentOut = momentum * Moment + ParamOut = Param - MomentOut $$ +if centered is true: + +mean_grad = decay * mean_square{t-1} + (1-decay) * gradient +mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2 +mom = momentum * mom{t-1} + learning_rate * g_t / + sqrt(mean_square - mean_grad**2 + epsilon) +param -= mom + The original slides that proposed Rmsprop: Slide 29 of http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h index 12836f43bde47ac87eb0af33dea501593b659a5d..25ed32c5ebb2ff5be962ac1e3e38c970623d705c 100644 --- a/paddle/fluid/operators/rmsprop_op.h +++ b/paddle/fluid/operators/rmsprop_op.h @@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel { float epsilon = ctx.Attr("epsilon"); float rho = ctx.Attr("decay"); float momentum = ctx.Attr("momentum"); + bool centered = ctx.Attr("centered"); auto p = EigenVector::Flatten(*ctx.Input("Param")); auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); @@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel { auto ms_out = EigenVector::Flatten(*mean_square_out); auto& place = *ctx.template device_context().eigen_device(); - Eigen::DSizes grad_dsize(grad->numel()); + Eigen::DSizes grad_dsize(static_cast(grad->numel())); ms_out.device(place) = rho * ms + (1 - rho) * g * g; - mom_out.device(place) = - momentum * mom + - lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + if (centered) { + auto mg = EigenVector::Flatten(*ctx.Input("MeanGrad")); + auto* mean_grad_out = ctx.Output("MeanGradOut"); + mean_grad_out->mutable_data(ctx.GetPlace()); + auto mg_out = EigenVector::Flatten(*mean_grad_out); + + mg_out.device(place) = rho * mg + (1 - rho) * g; + mom_out.device(place) = momentum * mom + + lr.broadcast(grad_dsize) * g / + (ms_out - mg_out.square() + epsilon).sqrt(); + } else { + mom_out.device(place) = + momentum * mom + + lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + } p_out.device(place) = p - mom_out; } }; diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 8a683116b8054de12fc4419b5aa5fbc019b675bb..e389c6a65e1e8220685294931c4d08e6fd928b7f 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -126,15 +126,15 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault({}); AddComment(R"DOC( Squeeze Operator. - - Remove single-dimensional entries from the shape of a tensor. - Takes a parameter axes with a list of axes to squeeze. - If axes is not provided, all the single dimensions will be removed from the shape. + + Remove single-dimensional entries from the shape of a tensor. + Takes a parameter axes with a list of axes to squeeze. + If axes is not provided, all the single dimensions will be removed from the shape. If an axis is selected with shape entry not equal to one, an error is raised. 
- + Examples: Case 1: - Given + Given X.shape = (1, 3, 1, 5) and axes = [0] @@ -144,7 +144,7 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker { Case 2: Given X.shape = (1, 3, 1, 5) - and + and axes = [] we get: Out.shape = (3, 5) @@ -181,6 +181,113 @@ class SqueezeGradOp : public framework::OperatorBase { } }; +// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze, +// the XShape is used to carry the shape and lod of X which will be used in +// squeeze_grad, in this way, the framework can reuse the memory of X +// immediately the squeeze2_op is finished. +// Considering compatibility issues, we could not fix squeeze2_op +class Squeeze2OpMaker : public SqueezeOpMaker { + public: + void Make() override { + SqueezeOpMaker::Make(); + AddOutput("XShape", + "XShape is just used to store the shape and lod of X, which will " + "be used in SqueezeGradOp.") + .AsIntermediate(); + } +}; + +class Squeeze2OpInferShape : public SqueezeOpInferShape { + public: + void operator()(framework::InferShapeContext *ctx) const override { + SqueezeOpInferShape::operator()(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) of Squeeze operator should not be null."); + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); + } +}; + +class Squeeze2Op : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axes = Attr>("axes"); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(out_dims); + // Invoke Reshape Op + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("squeeze2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Squeeze2GradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("XShape"), + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto xshape_dims = context->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + context->SetOutputDim(framework::GradVarName("X"), x_dims); + context->ShareLoD("XShape", framework::GradVarName("X")); + } +}; + +class Squeeze2GradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) 
const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto xshape_name = Input("XShape"); + auto xshape_dims = + scope.FindVar(xshape_name)->Get().dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {dout_name}}, {"Shape", {}}}, + {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs); + reshape_op->Run(scope, place); + } +}; + } // namespace operators } // namespace paddle @@ -192,3 +299,8 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, ops::SqueezeOpInferShape, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape); + +REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker, + ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker); +REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp, + ops::Squeeze2GradInferShape); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 60556a564c25c08612447ebd47a4b432b8a12d29..6a9fc6611a8f8eaa6749aefac0673ccabaebbcfe 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/transpose_op.h" +#include #include namespace paddle { @@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); auto x_dims = ctx->GetInputDim("X"); @@ -90,7 +91,7 @@ The behavior of this operator is similar to how `numpy.transpose` works. 2 &5 \end{pmatrix}$$ -- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is +- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is $[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$. )DOC"); @@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); @@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel { } }; +// FIXME(zcd): transpose2 adds an intermediate output(XShape) based on +// transpose, the XShape is used to carry the shape and lod of X which +// will be used in transpose_grad, in this way, the framework can reuse +// the memory of X immediately the transpose2_op is finished. 
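Each of the new `*2` ops in this change (reshape2, squeeze2, unsqueeze2, transpose2) follows the pattern described in the FIXME comments: the forward op emits an intermediate `XShape` output holding `[0] + x.shape` plus X's LoD, and the grad op recovers the input shape from `XShape` alone, so the framework can reuse X's memory as soon as the forward op finishes. A rough Python sketch of that shape round-trip, with NumPy arrays standing in for tensors:

```python
import numpy as np

def make_xshape(x):
    # Leading 0 is a placeholder; only dims [1:] carry X's real shape.
    return [0] + list(x.shape)

def reshape2_grad(d_out, xshape):
    # The grad op never reads X itself; the stashed XShape is enough to
    # restore d_X's shape, which is what lets X's memory be reused early.
    return d_out.reshape(xshape[1:])

x = np.random.rand(2, 3, 4)
xshape = make_xshape(x)            # [0, 2, 3, 4]
d_out = np.random.rand(x.size)     # gradient w.r.t. the flattened output
d_x = reshape2_grad(d_out, xshape)
assert d_x.shape == x.shape
```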
+// Considering compatibility issues, we could not fix transpose2_op +class Transpose2Op : public TransposeOp { + public: + Transpose2Op(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : TransposeOp(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + TransposeOp::InferShape(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) should not be null"); + const auto &in_dims = ctx->GetInputDim("X"); + std::vector x_shape_dim(in_dims.size() + 1); + x_shape_dim[0] = 0; + for (int i = 0; i < in_dims.size(); ++i) { + x_shape_dim[i + 1] = in_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(x_shape_dim)); + ctx->ShareLoD("X", /*->*/ "XShape"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class Transpose2OpMaker : public TransposeOpMaker { + public: + void Make() override { + TransposeOpMaker::Make(); + AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate(); + } +}; + +class Transpose2GradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("transpose2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Transpose2OpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + auto xshape_dim = ctx->GetInputDim("XShape"); + auto x_shape_dim = + framework::slice_ddim(xshape_dim, 1, xshape_dim.size()); + ctx->SetOutputDim(framework::GradVarName("X"), x_shape_dim); + ctx->ShareLoD("XShape", framework::GradVarName("X")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Out")) + ->type()), + ctx.device_context()); + } +}; + } // namespace operators } // namespace paddle @@ -120,8 +208,20 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); + REGISTER_OP_CPU_KERNEL( transpose, ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose_grad, ops::TransposeGradKernel); + +REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, + ops::Transpose2GradMaker); +REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad); + +REGISTER_OP_CPU_KERNEL( + transpose2, + ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose2_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu.cc 
b/paddle/fluid/operators/transpose_op.cu.cc index bcd1fb631394bc33b6fc162cfa7cbb20d55a654b..c1b5a8b31be243fab3af06a18c8e51986c953700 100644 --- a/paddle/fluid/operators/transpose_op.cu.cc +++ b/paddle/fluid/operators/transpose_op.cu.cc @@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL( transpose_grad, ops::TransposeGradKernel); + +REGISTER_OP_CUDA_KERNEL( + transpose2, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( + transpose2_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 0fc8d54f6400c9dfb6af1e764ed44e95195bfe6e..405943add238ac2d245df11127bfadb4899e855f 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -127,13 +127,13 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { }); AddComment(R"DOC( Unsqueeze Operator. - - Insert single-dimensional entries to the shape of a tensor. - Takes one required argument axes, a list of dimensions that will be inserted. - Dimension indices in axes are as seen in the output tensor. - For example: - Given a tensor such that tensor with shape [3, 4, 5], + Insert single-dimensional entries to the shape of a tensor. + Takes one required argument axes, a list of dimensions that will be inserted. + Dimension indices in axes are as seen in the output tensor. + + For example: + Given a tensor such that tensor with shape [3, 4, 5], then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1] )DOC"); } @@ -168,6 +168,112 @@ class UnsqueezeGradOp : public framework::OperatorBase { } }; +// FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on +// unsqueeze, the XShape is used to carry the shape and lod of X which +// will be used in unsqueeze_grad, in this way, the framework can reuse +// the memory of X immediately the unsqueeze2_op is finished. +// Considering compatibility issues, we could not fix unsqueeze2_op +class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape { + public: + void operator()(framework::InferShapeContext *ctx) const override { + UnsqueezeOpInferShape::operator()(ctx); + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) of Unsqueeze operator should not be null."); + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); + } +}; + +class Unsqueeze2OpMaker : public UnsqueezeOpMaker { + public: + void Make() override { + UnsqueezeOpMaker::Make(); + AddOutput("XShape", + "XShape is just used to store the shape and lod of X, which will " + "be used in UnsqueezeGradOp.") + .AsIntermediate(); + } +}; + +class Unsqueeze2Op : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axes = Attr>("axes"); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + auto out_dims = Unsqueeze2OpInferShape::GetOutputShape(axes, x_dims); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(out_dims); + // Invoke Reshape op. 
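Like `Squeeze2Op` above, `Unsqueeze2Op` only computes the target shape from `axes` and then delegates the actual data movement to `reshape2`. A small NumPy sketch of that shape computation (the helper name is illustrative), using the example from the DOC string:

```python
import numpy as np

def unsqueeze_shape(x_shape, axes):
    # Axes index positions in the *output* tensor; each one gets a size-1 dim.
    out_rank = len(x_shape) + len(axes)
    src = iter(x_shape)
    return [1 if i in axes else next(src) for i in range(out_rank)]

x = np.random.rand(3, 4, 5)
new_shape = unsqueeze_shape(x.shape, [0, 4])   # [1, 3, 4, 5, 1]
out = x.reshape(new_shape)                     # what the delegated reshape2 does
assert out.shape == (1, 3, 4, 5, 1)
```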
+ auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("unsqueeze2_grad"); + grad_op->SetInput("XShape", Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class Unsqueeze2GradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("XShape"), + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto xshape_dims = context->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + context->SetOutputDim(framework::GradVarName("X"), x_dims); + context->ShareLoD("XShape", framework::GradVarName("X")); + } +}; + +class Unsqueeze2GradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto xshape_name = Input("XShape"); + auto xshape_dims = + scope.FindVar(xshape_name)->Get().dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape2", {{"X", {dout_name}}, {"Shape", {}}}, + {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs); + reshape_op->Run(scope, place); + } +}; } // namespace operators } // namespace paddle @@ -180,3 +286,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, ops::UnsqueezeGradInferShape); + +REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, + ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker); +REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, + ops::Unsqueeze2GradInferShape); diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 4fbfa6354ab45fed4839227a2a4be8fe147e5fd9..6a3ad2151081504fda2a3818c5f99ad47039d91d 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -121,6 +121,12 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, if (nullptr == dso_handle) { LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" << dlerror() << ")"; + if (dlPath.find("nccl") != std::string::npos) { + std::cout + << "You may need to install 'nccl2' from NVIDIA official website: " + << "https://developer.nvidia.com/nccl/nccl-download" + << "before install PaddlePaddle" << std::endl; + } dlPath = dso_name; dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags); } diff --git 
a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7199424b4709fbe9fc962cf98aea6223b9f3e51d..9ffde5df9673f192b8970ea832fd0328950969b2 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -115,6 +115,7 @@ function cmake_gen() { -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} + -DWITH_INFERENCE=${WITH_INFERENCE:-ON} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DPY_VERSION=${PY_VERSION:-2.7} ======================================== @@ -144,6 +145,7 @@ EOF -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ + -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DPY_VERSION=${PY_VERSION:-2.7} } @@ -498,7 +500,7 @@ EOF EOF if [[ ${WITH_GPU} == "ON" ]]; then - NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&" + NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&" else NCCL_DEPS="" fi diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 920dbf3b4ebb0bc3d98c9ea986d7d039deed4a4c..19fc229e6fa84792f58aeeb00be09eb2401b19c7 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -104,7 +104,7 @@ def batch_images_from_tar(data_file, pickle.dump( output, open('%s/batch_%d' % (out_path, file_id), 'wb'), - protocol=pickle.HIGHEST_PROTOCOL) + protocol=2) file_id += 1 data = [] labels = [] @@ -113,9 +113,7 @@ def batch_images_from_tar(data_file, output['label'] = labels output['data'] = data pickle.dump( - output, - open('%s/batch_%d' % (out_path, file_id), 'wb'), - protocol=pickle.HIGHEST_PROTOCOL) + output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2) with open(meta_file, 'a') as meta: for file in os.listdir(out_path): diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 0182bbeb637ec7b6a341a4822a1cc5fb5aef077d..b1598bfec210474ae1e17f9f88e8b57aa80b8452 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None): return acc_out -def auc(input, label, curve='ROC', num_thresholds=200, topk=1): +def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): """ **Area Under the Curve (AUC) Layer** @@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): """ helper = LayerHelper("auc", **locals()) auc_out = helper.create_tmp_variable(dtype="float64") + batch_auc_out = helper.create_tmp_variable(dtype="float64") # make tp, tn, fp, fn persistable, so that can accumulate all batches. 
- tp = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds]) - tn = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds]) - fp = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds]) - fn = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds]) - for var in [tp, tn, fp, fn]: + stat_pos = helper.create_global_variable( + persistable=True, dtype='int64', shape=[num_thresholds + 1]) + stat_neg = helper.create_global_variable( + persistable=True, dtype='int64', shape=[num_thresholds + 1]) + + for var in [stat_pos, stat_neg]: helper.set_variable_initializer( var, Constant( value=0.0, force_cpu=True)) @@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): inputs={ "Predict": [input], "Label": [label], - "TP": [tp], - "TN": [tn], - "FP": [fp], - "FN": [fn] + "StatPos": [stat_pos], + "StatNeg": [stat_neg] }, attrs={"curve": curve, "num_thresholds": num_thresholds}, outputs={ "AUC": [auc_out], - "TPOut": [tp], - "TNOut": [tn], - "FPOut": [fp], - "FNOut": [fn] + "BatchAUC": [batch_auc_out], + "StatPosOut": [stat_pos], + "StatNegOut": [stat_neg] }) - return auc_out, [tp, tn, fp, fn] + return auc_out, batch_auc_out, [stat_pos, stat_neg] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index a0d92fd1462acb18cdb2463b51138c9ff33b08a8..5f49d5bbff53096ece140a185f73722870924677 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3546,11 +3546,6 @@ def topk(input, k, name=None): top5_values, top5_indices = layers.topk(input, k=5) """ - shape = input.shape - if k < 1 or k >= shape[-1]: - raise ValueError("k must be greater than 0 and less than %d." 
% - (shape[-1])) - helper = LayerHelper("top_k", **locals()) values = helper.create_tmp_variable(dtype=input.dtype) indices = helper.create_tmp_variable(dtype="int64") @@ -4030,10 +4025,12 @@ def transpose(x, perm, name=None): helper = LayerHelper('transpose', **locals()) out = helper.create_tmp_variable(x.dtype) + x_shape = helper.create_tmp_variable(x.dtype) helper.append_op( - type='transpose', + type='transpose2', inputs={'X': [x]}, - outputs={'Out': [out]}, + outputs={'Out': [out], + 'XShape': [x_shape]}, attrs={'axis': perm}) return out @@ -4525,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): "Each dimension size given in shape must not be negtive " "except one unknown dimension.") - helper = LayerHelper("reshape", **locals()) + helper = LayerHelper("reshape2", **locals()) out = helper.create_tmp_variable(dtype=x.dtype) + x_shape = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( - type="reshape", + type="reshape2", inputs=inputs, attrs={"shape": shape}, - outputs={"Out": out}) + outputs={"Out": out, + "XShape": x_shape}) return helper.append_activation(out) @@ -4575,11 +4574,13 @@ def squeeze(input, axes, name=None): """ helper = LayerHelper("squeeze", **locals()) out = helper.create_tmp_variable(dtype=input.dtype) + x_shape = helper.create_tmp_variable(dtype=input.dtype) helper.append_op( - type="squeeze", + type="squeeze2", inputs={"X": input}, attrs={"axes": axes}, - outputs={"Out": out}) + outputs={"Out": out, + "XShape": x_shape}) return out @@ -4610,11 +4611,13 @@ def unsqueeze(input, axes, name=None): """ helper = LayerHelper("unsqueeze", **locals()) out = helper.create_tmp_variable(dtype=input.dtype) + x_shape = helper.create_tmp_variable(dtype=input.dtype) helper.append_op( - type="unsqueeze", + type="unsqueeze2", inputs={"X": input}, attrs={"axes": axes}, - outputs={"Out": out}) + outputs={"Out": out, + "XShape": x_shape}) return out @@ -5816,10 +5819,12 @@ def flatten(x, axis=1, name=None): raise ValueError("The axis should be a int, and in range [0, rank(x)]") out = helper.create_tmp_variable(x.dtype) + x_shape = helper.create_tmp_variable(x.dtype) helper.append_op( - type='flatten', + type='flatten2', inputs={"X": x}, - outputs={'Out': out}, + outputs={'Out': out, + 'XShape': x_shape}, attrs={"axis": axis}) return out diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 592cb23eb9319658f8542ed5bc6ab3e95cfdb118..0c2800dcf35ed156b71625babea2724f520575e5 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -558,8 +558,6 @@ class Auc(MetricBase): name: metric name curve: Specifies the name of the curve to be computed, 'ROC' [default] or 'PR' for the Precision-Recall-curve. - num_thresholds: The number of thresholds to use when discretizing the roc - curve. "NOTE: only implement the ROC curve type via Python now." 
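The `Auc` metric below is rewritten to bucket predictions instead of looping over an explicit threshold list: each positive/negative example increments a per-bucket counter, and `eval` sweeps the buckets from the highest score downward, integrating the ROC curve with the trapezoid rule. A self-contained sketch of the same computation, assuming `preds[:, 1]` is the positive-class probability and `labels` are 0/1:

```python
import numpy as np

def streaming_auc(preds, labels, num_thresholds=4095):
    # One bucket per quantized prediction value in [0, 1].
    stat_pos = np.zeros(num_thresholds + 1)
    stat_neg = np.zeros(num_thresholds + 1)
    for p, lbl in zip(preds[:, 1], labels.ravel()):
        bin_idx = int(p * num_thresholds)
        if lbl:
            stat_pos[bin_idx] += 1
        else:
            stat_neg[bin_idx] += 1

    # Accumulate TP/FP counts from the top bucket down and sum trapezoid
    # areas under the (false positives, true positives) curve.
    tot_pos = tot_neg = auc = 0.0
    for idx in range(num_thresholds, -1, -1):
        prev_pos, prev_neg = tot_pos, tot_neg
        tot_pos += stat_pos[idx]
        tot_neg += stat_neg[idx]
        auc += abs(tot_neg - prev_neg) * (tot_pos + prev_pos) / 2.0
    return auc / (tot_pos * tot_neg) if tot_pos > 0 and tot_neg > 0 else 0.0

preds = np.random.random((128, 2)).astype("float32")
labels = np.random.randint(0, 2, (128, 1))
print(streaming_auc(preds, labels))
```

Because only the two counter arrays need to persist across batches, the fluid layer keeps just `stat_pos`/`stat_neg` as persistable variables instead of the four TP/TN/FP/FN lists.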
@@ -574,15 +572,14 @@ class Auc(MetricBase): numpy_auc = metric.eval() """ - def __init__(self, name, curve='ROC', num_thresholds=200): + def __init__(self, name, curve='ROC', num_thresholds=4095): super(Auc, self).__init__(name=name) self._curve = curve self._num_thresholds = num_thresholds - self._epsilon = 1e-6 - self.tp_list = np.zeros((num_thresholds, )) - self.fn_list = np.zeros((num_thresholds, )) - self.tn_list = np.zeros((num_thresholds, )) - self.fp_list = np.zeros((num_thresholds, )) + + _num_pred_buckets = num_thresholds + 1 + self._stat_pos = [0] * _num_pred_buckets + self._stat_neg = [0] * _num_pred_buckets def update(self, preds, labels): if not _is_numpy_(labels): @@ -590,41 +587,32 @@ class Auc(MetricBase): if not _is_numpy_(preds): raise ValueError("The 'predictions' must be a numpy ndarray.") - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1) - for i in range(self._num_thresholds - 2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] - - # calculate TP, FN, TN, FP count - for idx_thresh, thresh in enumerate(thresholds): - tp, fn, tn, fp = 0, 0, 0, 0 - for i, lbl in enumerate(labels): - if lbl: - if preds[i, 1] >= thresh: - tp += 1 - else: - fn += 1 - else: - if preds[i, 1] >= thresh: - fp += 1 - else: - tn += 1 - self.tp_list[idx_thresh] += tp - self.fn_list[idx_thresh] += fn - self.tn_list[idx_thresh] += tn - self.fp_list[idx_thresh] += fp + for i, lbl in enumerate(labels): + value = preds[i, 1] + bin_idx = int(value * self._num_thresholds) + assert bin_idx <= self._num_thresholds + if lbl: + self._stat_pos[bin_idx] += 1.0 + else: + self._stat_neg[bin_idx] += 1.0 + + @staticmethod + def trapezoid_area(x1, x2, y1, y2): + return abs(x1 - x2) * (y1 + y2) / 2.0 def eval(self): - epsilon = self._epsilon - num_thresholds = self._num_thresholds - tpr = (self.tp_list.astype("float32") + epsilon) / ( - self.tp_list + self.fn_list + epsilon) - fpr = self.fp_list.astype("float32") / ( - self.fp_list + self.tn_list + epsilon) - rec = (self.tp_list.astype("float32") + epsilon) / ( - self.tp_list + self.fp_list + epsilon) - - x = fpr[:num_thresholds - 1] - fpr[1:] - y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 - auc_value = np.sum(x * y) - return auc_value + tot_pos = 0.0 + tot_neg = 0.0 + auc = 0.0 + + idx = self._num_thresholds + while idx >= 0: + tot_pos_prev = tot_pos + tot_neg_prev = tot_neg + tot_pos += self._stat_pos[idx] + tot_neg += self._stat_neg[idx] + auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos, + tot_pos_prev) + idx -= 1 + + return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0 diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 33d6311b9717c66f0d6782eb6b3e348cd4c02a69..215f0cf2fc5ab4fbd06719ac4790a01dd00080eb 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer): r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 - v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) + + \\epsilon}} \\nabla Q_{i}(w) + + w & = w - v(w, t) + + if centered is True: + + .. 
math:: + + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + + g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w) + + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 + \\epsilon}} \\nabla Q_{i}(w) w & = w - v(w, t) @@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer): avoid division by zero, set 1e-6 by default. momentum(float): :math:`\\beta` in equation is the momentum term, set 0.0 by default. + centered(bool): If True, gradients are normalized by the estimated variance of + the gradient; if False, by the uncentered second moment. Setting this to + True may help with training, but is slightly more expensive in terms of + computation and memory. Defaults to False. Raises: ValueError: If learning_rate, rho, epsilon, momentum are None. @@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer): _momentum_acc_str = "momentum" _mean_square_acc_str = "mean_square" + _mean_grad_acc_str = "mean_grad" def __init__(self, learning_rate, rho=0.95, epsilon=1.0e-6, momentum=0.0, + centered=False, **kwargs): super(RMSPropOptimizer, self).__init__( learning_rate=learning_rate, **kwargs) @@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer): self._rho = rho self._epsilon = epsilon self._momentum = momentum + self._centered = centered def _create_accumulators(self, block, parameters): if not isinstance(block, framework.Block): @@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer): for p in parameters: self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p) + self._add_accumulator(self._mean_grad_acc_str, p) def _append_optimize_op(self, block, param_and_grad): if not isinstance(block, framework.Block): @@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer): param_and_grad[0]) mean_square_acc = self._get_accumulator(self._mean_square_acc_str, param_and_grad[0]) + mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str, + param_and_grad[0]) rmsprop_op = block.append_op( type=self.type, inputs={ @@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer): "Grad": param_and_grad[1], "Moment": momentum_acc, "MeanSquare": mean_square_acc, + "MeanGrad": mean_grad_acc, "LearningRate": self._create_param_lr(param_and_grad), }, outputs={ "ParamOut": param_and_grad[0], "MomentOut": momentum_acc, - "MeanSquareOut": mean_square_acc + "MeanSquareOut": mean_square_acc, + "MeanGradOut": mean_grad_acc }, attrs={ "epsilon": self._epsilon, "decay": self._rho, - "momentum": self._momentum + "momentum": self._momentum, + "centered": self._centered }) return rmsprop_op diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py index f6017a455df7e8bd197ef2563a759f843b5e7c73..e1368a3392a9cab3e82eff0a73eb225a52aa03bf 100644 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py @@ -47,14 +47,14 @@ def train_program(): loss = fluid.layers.square_error_cost(input=y_predict, label=y) avg_loss = fluid.layers.mean(loss) - return avg_loss + return [avg_loss, y_predict] def optimizer_func(): return fluid.optimizer.SGD(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, params_dirname, inference_model_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( @@ -74,6 +74,8 @@ def train(use_cuda, train_program, params_dirname): ''' if 
params_dirname is not None: trainer.save_params(params_dirname) + trainer.save_inference_model(inference_model_dirname, + ['x'], [1]) trainer.stop() trainer.train( @@ -99,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results[0]) +def infer_by_saved_model(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension should be 2-D and the second dim is 13 + # The input data should be >= 0 + batch_size = 10 + + test_reader = paddle.batch( + paddle.dataset.uci_housing.test(), batch_size=batch_size) + + test_data = next(test_reader()) + test_feat = numpy.array( + [data[0] for data in test_data]).astype("float32") + test_label = numpy.array( + [data[1] for data in test_data]).astype("float32") + + assert feed_target_names[0] == 'x' + results = exe.run(inference_program, + feed={feed_target_names[0]: numpy.array(test_feat)}, + fetch_list=fetch_targets) + print("infer shape: ", results[0].shape) + print("infer results: ", results[0]) + print("ground truth: ", test_label) + + def main(use_cuda): if use_cuda and not fluid.core.is_compiled_with_cuda(): return # Directory for saving the trained model - params_dirname = "fit_a_line.inference.model" + params_dirname = "fit_a_line.model" + inference_model_dirname = "fit_a_line.inference_model" - train(use_cuda, train_program, params_dirname) + train(use_cuda, train_program, params_dirname, inference_model_dirname) infer(use_cuda, inference_program, params_dirname) + infer_by_saved_model(use_cuda, inference_model_dirname) class TestFitALine(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 7abfa0a4be0dec9fe251704e22dfef1f932e7c5b..e3db316698398ff693157d583ad1410d10dcf81d 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -36,6 +36,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid import core from test_dist_base import TestDistRunnerBase, runtime_main +import paddle.compat as cpt from paddle.compat import long_type import hashlib @@ -315,8 +316,9 @@ def pad_batch_data(insts, """ return_list = [] max_len = max(len(inst) for inst in insts) - num_token = reduce(lambda x, y: x + y, - [len(inst) for inst in insts]) if return_num_token else 0 + num_token = six.moves.reduce( + lambda x, y: x + y, + [len(inst) for inst in insts]) if return_num_token else 0 # Any token included in dict can be used to pad, since the paddings' loss # will be masked out by weights and make no effect on parameter gradients. 
inst_data = np.array( @@ -328,7 +330,7 @@ def pad_batch_data(insts, return_list += [inst_weight.astype("float32").reshape([-1, 1])] else: # position data inst_pos = np.array([ - range(1, len(inst) + 1) + [0] * (max_len - len(inst)) + list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst)) for inst in insts ]) return_list += [inst_pos.astype("int64").reshape([-1, 1])] @@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx, return_num_token=True) data_input_dict = dict( - zip(data_input_names, [ - src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, - trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight - ])) + list( + zip(data_input_names, [ + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ]))) return data_input_dict, np.asarray([num_token], dtype="float32") @@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, np.log(TrainTaskConfig.label_smooth_eps / ( ModelHyperParams.trg_vocab_size - 1) + 1e-20)) init = False - for pass_id in xrange(TrainTaskConfig.pass_num): + for pass_id in six.moves.xrange(TrainTaskConfig.pass_num): pass_start_time = time.time() for batch_id, data in enumerate(train_data()): if batch_id >= 5: @@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, ModelHyperParams.eos_idx, ModelHyperParams.n_head, ModelHyperParams.d_model) total_num_token += num_token - feed_kv_pairs = data_input_dict.items() + feed_kv_pairs = list(data_input_dict.items()) if TrainTaskConfig.local: - feed_kv_pairs += { + feed_kv_pairs += list({ lr_scheduler.learning_rate.name: lr_rate - }.items() + }.items()) feed_list.append(dict(feed_kv_pairs)) if not init: @@ -873,6 +876,7 @@ class DataReader(object): f = tarfile.open(fpaths[0], "r") for line in f.extractfile(tar_fname): + line = cpt.to_text(line) fields = line.strip("\n").split(self._field_delimiter) if (not self._only_src and len(fields) == 2) or ( self._only_src and len(fields) == 1): @@ -882,8 +886,9 @@ class DataReader(object): if not os.path.isfile(fpath): raise IOError("Invalid file: %s" % fpath) - with open(fpath, "r") as f: + with open(fpath, "rb") as f: for line in f: + line = cpt.to_text(line) fields = line.strip("\n").split(self._field_delimiter) if (not self._only_src and len(fields) == 2) or ( self._only_src and len(fields) == 1): @@ -892,8 +897,9 @@ class DataReader(object): @staticmethod def load_dict(dict_path, reverse=False): word_dict = {} - with open(dict_path, "r") as fdict: + with open(dict_path, "rb") as fdict: for idx, line in enumerate(fdict): + line = cpt.to_text(line) if reverse: word_dict[idx] = line.strip("\n") else: @@ -1034,7 +1040,7 @@ def multi_head_attention(queries, # size of the input as the output dimension size. 
return layers.reshape( x=trans_x, - shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])) + shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))) def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): """ diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 20f1a37a426e9697048d636bf738c9056213e5f6..56a242b996f67aa4b9c858ab8aaeb1c1cd3bcf60 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -249,7 +249,7 @@ class OpTest(unittest.TestCase): outs, _ = self._calc_output(place) return outs - def _calc_output(self, place, parallel=False): + def _calc_output(self, place, parallel=False, no_check_set=None): program = Program() block = program.global_block() @@ -273,6 +273,8 @@ class OpTest(unittest.TestCase): # if not, fill the fetch_list by the user configured outputs in test. if len(fetch_list) == 0: for var_name, var in six.iteritems(outputs): + if no_check_set is not None and var_name in no_check_set: + continue if isinstance(var, list): for v in var: fetch_list.append(v) @@ -291,11 +293,17 @@ class OpTest(unittest.TestCase): return_numpy=False) return outs, fetch_list - def check_output_with_place(self, place, atol): - outs, fetch_list = self._calc_output(place) + def check_output_with_place(self, + place, + atol, + no_check_set=None, + equal_nan=False): + outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) for out_name, out_dup in Operator.get_op_outputs(self.op_type): if out_name not in self.outputs: continue + if no_check_set is not None and out_name in no_check_set: + continue def find_actual(target_name, fetch_list): found = [ @@ -321,7 +329,7 @@ class OpTest(unittest.TestCase): if isinstance(expect, tuple) else expect self.assertTrue( np.allclose( - actual_t, expect_t, atol=atol), + actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + sub_out_name + ") has diff at " + str(place)) if isinstance(expect, tuple): @@ -337,7 +345,7 @@ class OpTest(unittest.TestCase): expect_t = expect[0] if isinstance(expect, tuple) else expect self.assertTrue( np.allclose( - actual_t, expect_t, atol=atol), + actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t)) @@ -360,10 +368,10 @@ class OpTest(unittest.TestCase): places.append(core.CUDAPlace(0)) return places - def check_output(self, atol=1e-5): + def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False): places = self._get_places() for place in places: - self.check_output_with_place(place, atol) + self.check_output_with_place(place, atol, no_check_set, equal_nan) def check_output_customized(self, checker): places = self._get_places() diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py index 5393a17e674a3cad6d705a1ff7a45320e644af94..1de4a9d016a177944253d12094722d3a05614be2 100644 --- a/python/paddle/fluid/tests/unittests/test_auc_op.py +++ b/python/paddle/fluid/tests/unittests/test_auc_op.py @@ -26,18 +26,15 @@ class TestAucOp(OpTest): pred = np.random.random((128, 2)).astype("float32") labels = np.random.randint(0, 2, (128, 1)) num_thresholds = 200 - tp = np.zeros((num_thresholds, )).astype("int64") - tn = np.zeros((num_thresholds, )).astype("int64") - fp = np.zeros((num_thresholds, )).astype("int64") - fn = np.zeros((num_thresholds, )).astype("int64") + + stat_pos = 
np.zeros((num_thresholds + 1, )).astype("int64") + stat_neg = np.zeros((num_thresholds + 1, )).astype("int64") self.inputs = { 'Predict': pred, 'Label': labels, - 'TP': tp, - 'TN': tn, - 'FP': fp, - 'FN': fn + "StatPos": stat_pos, + "StatNeg": stat_neg } self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} @@ -47,11 +44,10 @@ class TestAucOp(OpTest): python_auc.update(pred, labels) self.outputs = { - 'AUC': python_auc.eval(), - 'TPOut': python_auc.tp_list, - 'FNOut': python_auc.fn_list, - 'TNOut': python_auc.tn_list, - 'FPOut': python_auc.fp_list + 'AUC': np.array(python_auc.eval()), + 'BatchAUC': np.array(python_auc.eval()), + 'StatPosOut': np.array(python_auc._stat_pos), + 'StatNegOut': np.array(python_auc._stat_neg) } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 58875a1dd19fd91f6f2bed928397ee7f73302dff..c0f5da5a1ae43847dff6348ea5f3e3bfd5e89ab9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -55,6 +55,7 @@ class TestDistRunnerBase(object): pserver_prog = t.get_pserver_program(args.current_endpoint) startup_prog = t.get_startup_program(args.current_endpoint, pserver_prog) + place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) @@ -147,6 +148,8 @@ def runtime_main(test_class): import paddle.compat as cpt +import socket +from contextlib import closing class TestDistBase(unittest.TestCase): @@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase): def setUp(self): self._trainers = 2 self._pservers = 2 - self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) self._python_interp = "python" self._sync_mode = True self._mem_opt = False self._use_reduce = False self._setup_config() + def _find_free_port(self): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + def start_pserver(self, model_file, check_error_log): ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py index 17b01e03124e8007c51107b414c628d4bfc49c79..effa2a148eef8b0047b12c676803abb2871e8118 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_op.py +++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py @@ -22,14 +22,17 @@ from op_test import OpTest class TestFlattenOp(OpTest): def setUp(self): - self.op_type = "flatten" + self.op_type = "flatten2" self.init_test_case() self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } def test_check_output(self): - self.check_output() + self.check_output(no_check_set=["XShape"]) def test_check_grad(self): self.check_grad(["X"], "Out") diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py index 1f1eb37667e304351a6a85edde09e7da32cf1630..4767e9433ea74d5da83867d646f2a63c9a092668 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py @@ -58,6 +58,7 @@ class TestFusionLSTMOp(OpTest): self.act_cell = 'tanh' self.act_cand = 'tanh' self.use_peepholes = False + self.use_seq = False self.set_conf() T = sum(self.lod[0]) @@ -107,6 +108,7 @@ class TestFusionLSTMOp(OpTest): } self.attrs = { 'use_peepholes': self.use_peepholes, + 'use_seq': self.use_seq, 'is_reverse': self.is_reverse, 'gate_activation': self.act_gate, 'cell_activation': self.act_cell, @@ -159,5 +161,68 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp): self.D = 16 +class TestFusionLSTMOpPeepholes(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + + +class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.has_initial_state = True + + +class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.is_reverse = True + + +class TestFusionLSTMOpPoopholesBS1(TestFusionLSTMOp): + def set_conf(self): + self.use_peepholes = True + self.lod = [[3]] + self.D = 16 + + +class TestFusionLSTMOpSeqInit(TestFusionLSTMOp): + def set_conf(self): + self.use_seq = True + self.has_initial_state = True + + +class TestFusionLSTMOpSeqReverse(TestFusionLSTMOp): + def set_conf(self): + self.use_seq = True + self.is_reverse = True + + +class TestFusionLSTMOpSeqInitReverse(TestFusionLSTMOp): + def set_conf(self): + self.use_seq = True + self.has_initial_state = True + self.is_reverse = True + + +class TestFusionLSTMOpSeqPeepholes(TestFusionLSTMOp): + def set_conf(self): + self.use_seq = True + self.use_peepholes = True + + +class TestFusionLSTMOpSeqPeepholesInit(TestFusionLSTMOp): + def set_conf(self): + self.use_seq = True + self.use_peepholes = True + self.has_initial_state = True + + +class TestFusionLSTMOpSeqPeepholesReverse(TestFusionLSTMOp): + def set_conf(self): + self.use_seq = True + self.use_peepholes = True + self.is_reverse = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 372ef748b2e704fd3858c382e048e51448ed3bd5..a49c5d9b43ae1bffa7cb57764db497f68030b151 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase): assert not math.isnan(np.sum(ret[i])) and \ not math.isinf(np.sum(ret[i])) + @unittest.skip(reason="CI timeout") def test_fetch_op(self): tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16) tst_reader_iter = tst_reader() @@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase): if batch_id == 2: break + @unittest.skip(reason="CI timeout") def test_feed_op(self): os.environ['CPU_NUM'] = str(4) if core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index 1e3e40d54a78045c8d8fdd9a3a3715107d1e7a80..48a6b0577b6787d2e1231fdcbe6d2c1bb46414ed 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import six from op_test import OpTest @@ -62,17 +63,20 @@ class PReluTest(OpTest): # TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues -# class TestCase1(PReluTest): -# def 
initTestCase(self): -# self.attrs = {'mode': "all"} +if six.PY2: -# class TestCase2(PReluTest): -# def initTestCase(self): -# self.attrs = {'mode': "channel"} + class TestCase1(PReluTest): + def initTestCase(self): + self.attrs = {'mode': "all"} + + class TestCase2(PReluTest): + def initTestCase(self): + self.attrs = {'mode': "channel"} + + class TestCase3(PReluTest): + def initTestCase(self): + self.attrs = {'mode': "element"} -# class TestCase3(PReluTest): -# def initTestCase(self): -# self.attrs = {'mode': "element"} if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 1de35dc35b0176b77eb2d9b25cd6ee4e645e56c3..0557593657e2e480a509902a07f25723b2c710b0 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -22,106 +22,39 @@ from op_test import OpTest class TestReshapeOp(OpTest): def setUp(self): - ori_shape = (2, 25) - new_shape = (5, 10) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - -class TestReshapeOpDimInfer1(OpTest): - def setUp(self): - ori_shape = (5, 10) - new_shape = (5, -1, 5) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - -class TestReshapeOpDimInfer2(OpTest): - def setUp(self): - ori_shape = (2, 2, 6) - new_shape = (2, 0, 3, -1) - infered_shape = (2, 2, 3, -1) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - -class TestReshapeOpInplace(OpTest): - def setUp(self): - ori_shape = (2, 25) - new_shape = (5, 10) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Out") - - -class TestReshapeOpDimInferInplace1(OpTest): - def setUp(self): - ori_shape = (5, 10) - new_shape = (5, -1, 5) + self.init_data() + self.op_type = "reshape2" + self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} + self.attrs = {"shape": self.new_shape} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} + def init_data(self): + self.ori_shape = (2, 25) + self.new_shape = (5, 10) + self.infered_shape = (5, 10) def test_check_output(self): - self.check_output() + self.check_output(no_check_set=['XShape']) def test_check_grad(self): self.check_grad(["X"], "Out") 
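The rewritten operator tests above all lean on the new `no_check_set` argument of `OpTest.check_output`: `XShape` has to be declared as an output, but its numeric content is irrelevant (only its shape and LoD matter at runtime), so the tests fill it with a random placeholder and exclude it from fetching and comparison. A minimal, purely illustrative subclass showing the pattern (the class name is hypothetical):

```python
import numpy as np
from op_test import OpTest


class TestTranspose2Sketch(OpTest):
    def setUp(self):
        self.op_type = "transpose2"
        x = np.random.random((3, 4)).astype("float32")
        self.inputs = {"X": x}
        self.attrs = {"axis": [1, 0]}
        self.outputs = {
            "Out": x.transpose([1, 0]),
            # Placeholder only; never compared thanks to no_check_set below.
            "XShape": np.random.random((3, 4)).astype("float32"),
        }

    def test_check_output(self):
        self.check_output(no_check_set=["XShape"])
```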
-class TestReshapeOpDimInferInplace2(OpTest): - def setUp(self): - ori_shape = (2, 2, 6) - new_shape = (2, 0, 3, -1) - infered_shape = (2, 2, 3, -1) - - self.op_type = "reshape" - self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} +class TestReshapeOpDimInfer1(TestReshapeOp): + def init_data(self): + self.ori_shape = (5, 10) + self.new_shape = (5, -1, 5) + self.infered_shape = (5, -1, 5) - def test_check_output(self): - self.check_output() - def test_check_grad(self): - self.check_grad(["X"], "Out") +class TestReshapeOpDimInfer2(TestReshapeOp): + def init_data(self): + self.ori_shape = (2, 2, 6) + self.new_shape = (2, 0, 3, -1) + self.infered_shape = (2, 2, 3, -1) class TestReshapeOpWithInputShape(OpTest): @@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest): new_shape = (0, -1, 5) actual_shape = (2, 3, 5) - self.op_type = "reshape" + self.op_type = "reshape2" self.inputs = { "X": np.random.random(ori_shape).astype("float32"), "Shape": np.array( actual_shape, dtype="int32") } self.attrs = {"shape": new_shape} - self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)} + self.outputs = { + "Out": self.inputs["X"].reshape(actual_shape), + 'XShape': np.random.random(ori_shape).astype("float32") + } def test_check_output(self): - self.check_output() + self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(["X"], "Out") + self.check_grad(["X"], "Out", sum_outputs=["Out"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 3d4623c74d9a307b12ab6d72ad0b4d2dae938720..70848e4e2239e2be160bb0c1a28a5aecd01a87dc 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -15,90 +15,164 @@ from __future__ import print_function import unittest + import numpy as np -from op_test import OpTest - - -class TestRmspropOp1(OpTest): - ''' Test RMSProp with explicit inputs - ''' - - def setUp(self): - self.op_type = "rmsprop" - - param = np.random.random((123, 321)).astype("float32") - mean_square = np.random.random((123, 321)).astype("float32") - learning_rate = np.array([0.01]).astype("float32") - grad = np.random.random((123, 321)).astype("float32") - moment = np.zeros((123, 321)).astype("float32") - - epsilon = 1e-6 - decay = 0.9 - momentum = 0.0 - - self.inputs = { - 'Param': param, - 'MeanSquare': mean_square, - 'LearningRate': learning_rate, - 'Grad': grad, - 'Moment': moment, - } - - self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum} - - ms_out = decay * mean_square + (1 - decay) * grad * grad - moment_out = momentum * moment + \ - learning_rate * grad / np.sqrt(ms_out + epsilon) - param_out = param - moment_out - - self.outputs = { - 'ParamOut': param_out, - 'MomentOut': moment_out, - 'MeanSquareOut': ms_out - } - - def test_check_output(self): - self.check_output() - - -class TestRmspropOp2(OpTest): - '''Test RMSProp with default values for attributes - ''' - - def setUp(self): - self.op_type = "rmsprop" - - param = np.random.random((123, 321)).astype("float32") - mean_square = np.random.random((123, 321)).astype("float32") - learning_rate = np.array([0.01]).astype("float32") - grad = np.random.random((123, 321)).astype("float32") - moment = np.zeros((123, 321)).astype("float32") - - epsilon = 1.0e-10 - decay = 0.9 - momentum = 0.0 - - 
self.inputs = {
-            'Param': param,
-            'MeanSquare': mean_square,
-            'LearningRate': learning_rate,
-            'Grad': grad,
-            'Moment': moment,
-        }
-
-        ms_out = decay * mean_square + (1 - decay) * grad * grad
-        moment_out = momentum * moment + \
-            learning_rate * grad / np.sqrt(ms_out + epsilon)
-        param_out = param - moment_out
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'MomentOut': moment_out,
-            'MeanSquareOut': ms_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+class TestBase(unittest.TestCase):
+    def setup(self, centered, epsilon=1e-6):
+        np.random.seed(5)  # fix seed
+
+        self.param_name = "param"
+        self.param = np.random.random((123, 321)).astype("float32")
+
+        self.mean_square_name = "mean_square"
+        self.mean_square = np.random.random((123, 321)).astype("float32")
+
+        self.mean_grad_name = "mean_grad"
+        self.mean_grad = np.random.random((123, 321)).astype("float32")
+
+        self.lr_name = "lr"
+        self.learning_rate = np.array([0.01]).astype("float32")
+
+        self.grad_name = "grad"
+        self.grad = np.random.random((123, 321)).astype("float32")
+
+        self.moment_name = "moment"
+        self.moment = np.zeros((123, 321)).astype("float32")
+
+        self.epsilon = epsilon
+        self.decay = 0.9
+        self.momentum = 0.0
+        self.centered = centered
+
+        self.ms_out = self.decay * self.mean_square + (1 - self.decay
+                                                       ) * self.grad * self.grad
+        if centered:
+            self.mg_out = self.decay * self.mean_grad + (1 - self.decay
+                                                         ) * self.grad
+            self.moment_out = self.momentum * self.moment + \
+                self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon)
+        else:
+            self.moment_out = self.momentum * self.moment + \
+                self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon)
+
+        self.param_out = self.param - self.moment_out
+
+    def check(self,
+              actual_t,
+              expect_t,
+              place,
+              out_name,
+              atol=1e-5,
+              equal_nan=False):
+        self.assertTrue(
+            np.allclose(
+                actual_t, expect_t, atol=atol, equal_nan=equal_nan),
+            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " +
+            str(expect_t) + "\n" + "But Got " + str(actual_t))
+
+
+class TestRmspropOp(TestBase):
+    def check_with_place(self, place, centered, epsilon):
+        self.setup(centered, epsilon)
+        scope = core.Scope()
+
+        # create and initialize Param Variable
+        param = scope.var(self.param_name).get_tensor()
+        param.set(self.param, place)
+
+        mean_square = scope.var(self.mean_square_name).get_tensor()
+        mean_square.set(self.mean_square, place)
+
+        lr = scope.var(self.lr_name).get_tensor()
+        lr.set(self.learning_rate, place)
+
+        grad = scope.var(self.grad_name).get_tensor()
+        grad.set(self.grad, place)
+
+        moment = scope.var(self.moment_name).get_tensor()
+        moment.set(self.moment, place)
+
+        # create and run rmsprop operator
+
+        if self.centered:
+            mean_grad = scope.var(self.mean_grad_name).get_tensor()
+            mean_grad.set(self.mean_grad, place)
+
+            rmsprop_op = Operator(
+                "rmsprop",
+                Param=self.param_name,
+                Grad=self.grad_name,
+                MeanSquare=self.mean_square_name,
+                MeanGrad=self.mean_grad_name,
+                Moment=self.moment_name,
+                LearningRate=self.lr_name,
+                ParamOut=self.param_name,
+                MeanSquareOut=self.mean_square_name,
+                MomentOut=self.moment_name,
+                MeanGradOut=self.mean_grad_name,
+                epsilon=self.epsilon,
+                decay=self.decay,
+                momentum=self.momentum,
+                centered=True)
+        else:
+            rmsprop_op = Operator(
+                "rmsprop",
+                Param=self.param_name,
+                Grad=self.grad_name,
+                MeanSquare=self.mean_square_name,
+                Moment=self.moment_name,
+                LearningRate=self.lr_name,
ParamOut=self.param_name, + MeanSquareOut=self.mean_square_name, + MomentOut=self.moment_name, + epsilon=self.epsilon, + decay=self.decay, + momentum=self.momentum, + centered=False) + + rmsprop_op.run(scope, place) + + atol = 1e-5 + equal_nan = False + + if self.centered: + atol = 1e-3 + equal_nan = True + + self.check( + np.array(mean_square), self.ms_out, place, self.mean_square_name) + self.check( + np.array(moment), + self.moment_out, + place, + self.moment_name, + atol=atol, + equal_nan=equal_nan) + self.check( + np.array(param), + self.param_out, + place, + self.param_name, + atol=atol, + equal_nan=equal_nan) + + if self.centered: + self.check( + np.array(mean_grad), self.mg_out, place, self.mean_grad_name) + + def test_rmsprop(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place, False, 1e-6) + self.check_with_place(place, False, 1e-10) + self.check_with_place(place, True, 1e-6) + self.check_with_place(place, True, 1e-10) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py index 2be8e24a0fae6945351eb767ac924d7ca70848ab..204a4bb40196bd1fc2f5861aa31cf9560ea4d349 100644 --- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py @@ -23,14 +23,17 @@ from op_test import OpTest # Correct: General. class TestSqueezeOp(OpTest): def setUp(self): - self.op_type = "squeeze" + self.op_type = "squeeze2" self.init_test_case() self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype("float32") + } def test_check_output(self): - self.check_output() + self.check_output(no_check_set=['XShape']) def test_check_grad(self): self.check_grad(["X"], "Out") diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 0853f80b82030679d140f7fabdd42557c2374599..c30da2389d50d3b6bdf1f911aaed6ed71f274153 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -22,16 +22,19 @@ from op_test import OpTest class TestTransposeOp(OpTest): def setUp(self): self.initTestCase() - self.op_type = "transpose" + self.op_type = "transpose2" self.inputs = {'X': np.random.random(self.shape).astype("float32")} self.attrs = {'axis': list(self.axis)} - self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + self.outputs = { + 'XShape': np.random.random(self.shape).astype("float32"), + 'Out': self.inputs['X'].transpose(self.axis) + } def test_check_output(self): - self.check_output() + self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', sum_outputs=['Out']) def initTestCase(self): self.shape = (3, 4) diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py index a324438ba5a3c3b57fd956bd11189ef7d50267e2..14dd2bb06f9a18d0b15a4aee4e9e6bfdf8c41206 100644 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -24,13 +24,16 @@ from op_test import OpTest class 
TestUnsqueezeOp(OpTest):
     def setUp(self):
         self.init_test_case()
-        self.op_type = "unsqueeze"
+        self.op_type = "unsqueeze2"
         self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
         self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.ori_shape).astype("float32")
+        }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=["XShape"])
 
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index d094647afe1900809fc32cae93f777765f72c675..30cdfe4ad2c9892184862b70ff49417ce5a08516 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -431,6 +431,28 @@ class Trainer(object):
             exe = executor.Executor(self.place)
             io.save_persistables(exe, dirname=param_path)
 
+    def save_inference_model(self, param_path, feeded_var_names,
+                             target_var_indexes):
+        """
+        Save the model for C++ inference into :code:`param_path`.
+
+        Args:
+            param_path(str): The path to save parameters.
+            feeded_var_names(list(str)): The names of the variables that
+                need to be fed before running the program.
+            target_var_indexes(list(int)): The indexes of the target variables
+                to return from the outputs of trainer.train_func.
+        Returns:
+            None
+        """
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            target_vars = [
+                self.train_func_outputs[index] for index in target_var_indexes
+            ]
+            io.save_inference_model(param_path, feeded_var_names, target_vars,
+                                    exe)
+
     @contextlib.contextmanager
     def _prog_and_scope_guard(self):
         with framework.program_guard(
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
index f0fafaa84a73d641ff6ceb74def6addaea759516..a83aa0f11eed9bfc1674d8d75dcfacc297f056b0 100644
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -153,7 +153,7 @@ def block_to_code(block, block_idx):
     indent += 1
 
     # sort all vars
-    all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0])
+    all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
     for var in all_vars:
         print("{}{}".format(
             get_indent_space(indent), variable_to_code(var[1])))
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 8a330e0dee7eda02d0858446778363f2235a3d73..d4d218d547a394a56c040ade2a9ba703b691b86b 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -300,7 +300,7 @@ class DistributeTranspiler(object):
             input_deps = grad_name_to_send_dummy_out.values()
             program.global_block().append_op(
                 type="send_barrier",
-                inputs={"X": input_deps},
+                inputs={"X": list(input_deps)},
                 outputs={"Out": send_barrier_out},
                 attrs={
                     "endpoints": pserver_endpoints,
@@ -401,7 +401,7 @@ class DistributeTranspiler(object):
 
         Args:
             recv_vars (list): Variable list to recv for current trainer_id
-            eplist (list): A list of strings indicating 
+            eplist (list): A list of strings indicating
 
         Returns:
             Program: trainer side startup program.
@@ -455,7 +455,7 @@ class DistributeTranspiler(object):
             if len(splited_var) <= 1:
                 continue
             # NOTE: if enable memory optimization, origin vars maybe removed.
-            if startup_program.global_block().vars.has_key(varname):
+            if varname in startup_program.global_block().vars:
                 orig_param = startup_program.global_block().vars[varname]
             else:
                 origin_param_var = self.origin_program.global_block().vars[
@@ -690,7 +690,7 @@ class DistributeTranspiler(object):
 
         Args:
             endpoint (str): current pserver endpoint.
-        
+
         Returns:
             tuple: (main_program, startup_program), of type "Program"
         """
@@ -713,7 +713,7 @@ class DistributeTranspiler(object):
             endpoint (str): current pserver endpoint.
            pserver_program (Program): deprecated, call get_pserver_program first.
             startup_program (Program): deprecated, should pass startup_program
-                when initalizing 
+                when initializing
 
         Returns:
             Program: parameter server side startup program.
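The `Trainer.save_inference_model` method added to python/paddle/fluid/trainer.py above selects target variables by their index into the outputs of `train_func` and then delegates to `fluid.io.save_inference_model`. A minimal usage sketch follows; the trainer object, the feed variable name "img", and the output directory are illustrative assumptions, not part of this patch:

    # Sketch only: assumes `trainer` is a fluid.Trainer whose train_func
    # returns [avg_cost, prediction]; the names below are hypothetical.
    trainer.save_inference_model(
        param_path="./inference_model",  # directory to write the model to (assumed)
        feeded_var_names=["img"],        # feed variable names used by train_func (assumed)
        target_var_indexes=[1])          # pick `prediction` from the train_func outputs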