提交 0a07cb4b 编写于 作者: D Dang Qingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into quantize_transpiler_update

......@@ -151,6 +151,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
if data == None:
break
if iters == args.iterations:
reader_generator.close()
break
if iters == args.skip_batch_num:
start_time = time.time()
......@@ -252,6 +253,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
if data == None:
break
if iters == args.iterations:
reader_generator.close()
break
if args.profile and pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
......
......@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
if (NOT WIN32) # windows msvc2015 support c++11 natively.
# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
endif(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
# in cuda9, suppress cuda warning on eigen
list(APPEND CUDA_NVCC_FLAGS "-w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
......@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
else(NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.")
endif()
endif(NOT WIN32)
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
......@@ -44,7 +44,7 @@ ExternalProject_Add(
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
......
服务器端部署 - Anakin
#####################
Anakin - 服务器端加速引擎
#######################
使用文档
......
服务器端部署 - 原生引擎
#######################
.. toctree::
:maxdepth: 2
build_and_install_lib_cn.rst
native_infer.rst
......@@ -10,7 +10,6 @@
.. toctree::
:maxdepth: 2
deploy/index_native.rst
deploy/index_anakin.rst
deploy/index_mobile.rst
development/contribute_to_paddle.md
......
*.pyc
train.log
output
data/cifar-10-batches-py/
data/cifar-10-python.tar.gz
data/*.txt
data/*.list
data/mean.meta
......@@ -21,7 +21,7 @@
图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。
<p align="center">
<img src="image/dog_cat.png " width="350" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/dog_cat.png?raw=true" width="350" ><br/>
图1. 通用图像分类展示
</p>
......@@ -30,7 +30,7 @@
<p align="center">
<img src="image/flowers.png" width="400" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/flowers.png?raw=true" width="400" ><br/>
图2. 细粒度图像分类展示
</p>
......@@ -38,7 +38,7 @@
一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。
<p align="center">
<img src="image/variations.png" width="550" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/variations.png?raw=true" width="550" ><br/>
图3. 扰动图片展示[22]
</p>
......@@ -61,7 +61,7 @@
Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。
<p align="center">
<img src="image/ilsvrc.png" width="500" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/ilsvrc.png?raw=true" width="500" ><br/>
图4. ILSVRC图像分类Top-5错误率
</p>
......@@ -70,7 +70,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。
<p align="center">
<img src="image/lenet.png"><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/lenet.png?raw=true"><br/>
图5. CNN网络示例[20]
</p>
......@@ -89,7 +89,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。
<p align="center">
<img src="image/vgg16.png" width="750" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/vgg16.png?raw=true" width="750" ><br/>
图6. 基于ImageNet的VGG16模型
</p>
......@@ -106,7 +106,7 @@ NIN模型主要有两个特点:
Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。
<p align="center">
<img src="image/inception.png" width="800" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/inception.png?raw=ture" width="800" ><br/>
图7. Inception模块
</p>
......@@ -115,7 +115,7 @@ GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没
GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。
<p align="center">
<img src="image/googlenet.jpeg" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/googlenet.jpeg?raw=true" ><br/>
图8. GoogleNet[12]
</p>
......@@ -130,14 +130,14 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。
<p align="center">
<img src="image/resnet_block.jpg" width="400"><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/resnet_block.jpg?raw=true" width="400"><br/>
图9. 残差模块
</p>
图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。
<p align="center">
<img src="image/resnet.png"><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/resnet.png?raw=true"><br/>
图10. 基于ImageNet的ResNet模型
</p>
......@@ -149,7 +149,7 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10](<https://www.cs.toronto.edu/~kriz/cifar.html>)数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。
<p align="center">
<img src="image/cifar.png" width="350"><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/cifar.png?raw=true" width="350"><br/>
图11. CIFAR10数据集[21]
</p>
......@@ -377,7 +377,7 @@ test_reader = paddle.batch(
`event_handler_plot`可以用来利用回调数据来打点画图:
<p align="center">
<img src="image/train_and_test.png" width="350"><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/train_and_test.png?raw=true" width="350"><br/>
图12. 训练结果
</p>
......@@ -469,7 +469,7 @@ Test with Pass 0, Loss 1.1, Acc 0.6
图13是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。
<p align="center">
<img src="image/plot.png" width="400" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/plot.png?raw=true" width="400" ><br/>
图13. CIFAR10数据集上VGG模型的分类错误率
</p>
......
......@@ -6,7 +6,7 @@
.. todo::
概述
.. toctree::
:maxdepth: 2
......
data/train.list
data/test.*
data/conll05st-release.tar.gz
data/conll05st-release
data/predicate_dict
data/label_dict
data/word_dict
data/emb
data/feature
output
predict.res
train.log
......@@ -21,7 +21,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。
<div align="center">
<img src="image/dependency_parsing.png" width = "80%" align=center /><br>
<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/dependency_parsing.png?raw=true" width = "80%" align=center /><br>
图1. 依存句法分析句法树示例
</div>
......@@ -30,7 +30,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
我们继续以上面的这句话为例,图1展示了BIO表示方法。
<div align="center">
<img src="image/bio_example.png" width = "90%" align=center /><br>
<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/bio_example.png?raw=true" width = "90%" align=center /><br>
图2. BIO标注方法示例
</div>
......@@ -53,7 +53,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
图3是最终得到的栈式循环神经网络结构示意图。
<p align="center">
<img src="./image/stacked_lstm.png" width = "40%" align=center><br>
<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/stacked_lstm.png?raw=true" width = "40%" align=center><br>
图3. 基于LSTM的栈式循环神经网络结构示意图
</p>
......@@ -64,7 +64,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。
<p align="center">
<img src="./image/bidirectional_stacked_lstm.png" width = "60%" align=center><br>
<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/bidirectional_stacked_lstm.png?raw=true" width = "60%" align=center><br>
图4. 基于LSTM的双向循环神经网络结构示意图
</p>
......@@ -79,7 +79,7 @@ CRF是一种概率化结构模型,可以看作是一个概率无向图模型
序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。
<p align="center">
<img src="./image/linear_chain_crf.png" width = "35%" align=center><br>
<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/linear_chain_crf.png?raw=true" width = "35%" align=center><br>
图5. 序列标注任务中使用的线性链条件随机场
</p>
......@@ -123,7 +123,7 @@ $$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\pr
4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注;
<div align="center">
<img src="image/db_lstm_network.png" width = "60%" align=center /><br>
<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/db_lstm_network.png?raw=true" width = "60%" align=center /><br>
图6. SRL任务上的深层双向LSTM模型
</div>
......
data/wmt14
data/pre-wmt14
pretrained/wmt14_model
gen.log
gen_result
train.log
dataprovider_copy_1.py
*.pyc
multi-bleu.perl
......@@ -11,10 +11,10 @@
为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。
近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。
![nmt](./image/nmt.png)
<p align="center">
<div align="center">
<img src="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/nmt.png?raw=true" width = "400" align=center/><br/>
图1. 基于神经网络的机器翻译系统
</p>
</div>
本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。
......@@ -45,18 +45,20 @@
具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵(`$W_1, W_3$`),隐层到隐层自己的权重矩阵(`$W_2,W_5$`),前向隐层和后向隐层到输出层的权重矩阵(`$W_4, W_6$`)。注意,该网络的前向隐层和后向隐层之间没有连接。
![bi_rnn](./image/bi_rnn.png)
<p align="center">
图3. 按时间步展开的双向循环神经网络
</p>
<div align="center">
<img src = "https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/bi_rnn.png?raw=true" width="400"><br/>
图2. 按时间步展开的双向循环神经网络
</div>
### 编码器-解码器框架
编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。
![encoder_decoder](./image/encoder_decoder.png)
<p align="center">
图4. 编码器-解码器框架
</p>
<div align="center">
<img src ="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/encoder_decoder.png?raw=true" width="400"><br/>
图3. 编码器-解码器框架
</div>
#### 编码器
......@@ -69,16 +71,14 @@
3. 用RNN编码源语言词序列:这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`,其中`$h_0$`是一个全零的向量,`$\varnothing _\theta$`是一个非线性激活函数,最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码,或使用时间维上的池化(pooling)结果。
第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词,并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的,后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词,得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`,通过拼接两个GRU的结果得到它的隐层状态,即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`
![encoder_attention](./image/encoder_attention.png)
<p align="center">
图5. 使用双向GRU的编码器
</p>
<div align="center">
<img src="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/encoder_attention.png?raw=true" width="400"><br/>
图4. 使用双向GRU的编码器
</div>
#### 解码器
机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是:
1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$``$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下:
$$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$
其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$``$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记`<s>`,表示解码开始;`$z_i$``$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。
......@@ -100,7 +100,6 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。
使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是:
1. 每一个时刻,根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$``$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`
2.`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`
......
......@@ -37,7 +37,7 @@ Prediction Score is 4.25
YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示:
<p align="center">
<img src="image/YouTube_Overview.png" width="70%" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/YouTube_Overview.png?raw=true" width="70%" ><br/>
图1. YouTube 推荐系统结构
</p>
......@@ -48,7 +48,7 @@ YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐
首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。
<p align="center">
<img src="image/Deep_candidate_generation_model_architecture.png" width="70%" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/Deep_candidate_generation_model_architecture.png?raw=true" width="70%" ><br/>
图2. 候选生成网络结构
</p>
......@@ -73,7 +73,7 @@ $$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$
卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小结我们以如图3所示的网络进行讲解:
<p align="center">
<img src="image/text_cnn.png" width = "80%" align="center"/><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/text_cnn.png?raw=true" width = "80%" align="center"/><br/>
图3. 卷积神经网络文本分类模型
</p>
......@@ -107,7 +107,7 @@ $$\hat c=max(c)$$
<p align="center">
<img src="image/rec_regression_network.png" width="90%" ><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/rec_regression_network.png?raw=true" width="90%" ><br/>
图4. 融合推荐模型
</p>
......
data/aclImdb
data/imdb
data/pre-imdb
data/mosesdecoder-master
*.log
model_output
dataprovider_copy_1.py
model.list
*.pyc
.DS_Store
......@@ -37,7 +37,7 @@
循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。
<p align="center">
<img src="image/rnn.png" width = "60%" align="center"/><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/rnn.png?raw=true" width = "60%" align="center"/><br/>
图1. 循环神经网络按时间展开的示意图
</p>
......@@ -66,7 +66,7 @@ $$ h_t = o_t\odot tanh(c_t) $$
其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图2所示:
<p align="center">
<img src="image/lstm.png" width = "65%" align="center"/><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/lstm.png?raw=true" width = "65%" align="center"/><br/>
图2. 时刻$t$的LSTM [7]
</p>
......@@ -83,7 +83,7 @@ $$ h_t=Recrurent(x_t,h_{t-1})$$
如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。
<p align="center">
<img src="image/stacked_lstm.jpg" width=450><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/stacked_lstm.jpg?raw=true" width=450><br/>
图3. 栈式双向LSTM用于文本分类
</p>
......
data/train.list
data/test.list
data/simple-examples*
......@@ -34,7 +34,7 @@ $$X = USV^T$$
本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。
<p align="center">
<img src = "image/2d_similarity.png" width=400><br/>
<img src = "https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/2d_similarity.png?raw=true" width=400><br/>
图1. 词向量的二维投影
</p>
......@@ -90,7 +90,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率,$R(\theta)$表示参数正则项。
<p align="center">
<img src="image/nnlm.png" width=500><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/nnlm.png?raw=true" width=500><br/>
图2. N-gram神经网络模型
</p>
......@@ -122,7 +122,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示:
<p align="center">
<img src="image/cbow.png" width=250><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/cbow.png?raw=true" width=250><br/>
图3. CBOW模型
</p>
......@@ -137,7 +137,7 @@ $$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。
<p align="center">
<img src="image/skipgram.png" width=250><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/skipgram.png?raw=true" width=250><br/>
图4. Skip-gram模型
</p>
......@@ -194,7 +194,7 @@ dream that one day <e>
本配置的模型结构如下图所示:
<p align="center">
<img src="image/ngram.png" width=400><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/ngram.png?raw=true" width=400><br/>
图5. 模型配置中的N-gram神经网络模型
</p>
......
......@@ -15,7 +15,7 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldo
## 效果展示
我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。
<p align="center">
<img src = "image/predictions.png" width=400><br/>
<img src = "https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/predictions.png?raw=true" width=400><br/>
图1. 预测值 V.S. 真实值
</p>
......@@ -40,13 +40,9 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
### 训练过程
定义好模型结构之后,我们要通过以下几个步骤进行模型训练
1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。
2. 网络正向传播计算网络输出和损失函数。
3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。
4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。
## 数据集
......@@ -84,7 +80,7 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。
<p align="center">
<img src = "image/ranges.png" width=550><br/>
<img src = "https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/ranges.png?raw=true" width=550><br/>
图2. 各维属性的取值范围
</p>
......@@ -199,10 +195,12 @@ step = 0
def event_handler_plot(event):
global step
if isinstance(event, fluid.EndStepEvent):
if event.step % 10 == 0: # record the test cost every 10 seconds
if step % 10 == 0: # record a train cost every 10 batches
plot_cost.append(train_title, step, event.metrics[0])
if step % 100 == 0: # record a test cost every 100 batches
test_metrics = trainer.test(
reader=test_reader, feed_order=feed_order)
plot_cost.append(test_title, step, test_metrics[0])
plot_cost.plot()
......@@ -210,12 +208,13 @@ def event_handler_plot(event):
# If the accuracy is good enough, we can stop the training.
print('loss is less than 10.0, stop')
trainer.stop()
# We can save the trained parameters for the inferences later
if params_dirname is not None:
trainer.save_params(params_dirname)
step += 1
if isinstance(event, fluid.EndEpochEvent):
if event.epoch % 10 == 0:
# We can save the trained parameters for the inferences later
if params_dirname is not None:
trainer.save_params(params_dirname)
```
### 开始训练
......@@ -231,11 +230,10 @@ trainer.train(
event_handler=event_handler_plot,
feed_order=feed_order)
```
<p align="center">
<img src = "image/train_and_test1.png" width=400><br/>
图3. 训练结果
</p>
<div align="center">
<img src="https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/train_and_test.png?raw=true" width="400"><br/>
图3 训练结果
</div>
## 预测
......@@ -262,18 +260,18 @@ inferencer = fluid.Inferencer(
batch_size = 10
test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size)
test_data = test_reader().next()
test_feat = numpy.array([data[0] for data in test_data]).astype("float32")
test_label = numpy.array([data[1] for data in test_data]).astype("float32")
test_x = numpy.array([data[0] for data in test_data]).astype("float32")
test_y = numpy.array([data[1] for data in test_data]).astype("float32")
results = inferencer.infer({'x': test_feat})
results = inferencer.infer({'x': test_x})
print("infer results: (House Price)")
for k in range(0, batch_size-1):
print("%d. %f" % (k, results[0][k]))
for idx, val in enumerate(results[0]):
print("%d: %.2f" % (idx, val))
print("\nground truth:")
for k in range(0, batch_size-1):
print("%d. %f" % (k, test_label[k]))
for idx, val in enumerate(test_y):
print("%d: %.2f" % (idx, val))
```
## 总结
......
......@@ -6,8 +6,8 @@
当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。
<p align="center">
<img src="image/mnist_example_image.png" width="400"><br/>
图1. MNIST图片示例
<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/mnist_example_image.png?raw=true" width="400"><br/>
图1. MNIST图片示例
</p>
MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。
......@@ -40,12 +40,12 @@ $$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$
在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy loss),公式如下:
$$ L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
$$ L_{cross-entropy}(label, y) = -\sum_i label_ilog(y_i) $$
图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
<p align="center">
<img src="image/softmax_regression.png" width=400><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/softmax_regression.png?raw=true" width=400><br/>
图2. softmax回归网络结构图<br/>
</p>
......@@ -54,16 +54,14 @@ $$ L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
Softmax回归模型采用了最简单的两层神经网络,即只有输入层和输出层,因此其拟合能力有限。为了达到更好的识别效果,我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]
1. 经过第一个隐藏层,可以得到 $ H_1 = \phi(W_1X + b_1) $,其中$\phi$代表激活函数,常见的有sigmoid、tanh或ReLU等函数。
2. 经过第二个隐藏层,可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。
3. 最后,再经过输出层,得到的$Y=\text{softmax}(W_3H_2 + b_3)$,即为最后的分类结果向量。
图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
<p align="center">
<img src="image/mlp.png" width=500><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/mlp.png?raw=true" width=500><br/>
图3. 多层感知器网络结构图<br/>
</p>
......@@ -72,7 +70,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。
<p align="center">
<img src="image/cnn.png"><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/cnn.png?raw=true" width="400"><br/>
图4. LeNet-5卷积神经网络结构<br/>
</p>
......@@ -81,7 +79,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单的讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。
<p align="center">
<img src="image/conv_layer.png" width='750'><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/conv_layer.png?raw=true" width='750'><br/>
图5. 卷积层图片<br/>
</p>
......@@ -98,7 +96,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
#### 池化层
<p align="center">
<img src="image/max_pooling.png" width="400px"><br/>
<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/max_pooling.png?raw=true" width="400px"><br/>
图6. 池化层图片<br/>
</p>
......@@ -106,8 +104,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。
### 常见激活函数介绍
### 常见激活函数介绍
- sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $
- tanh激活函数: $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $
......@@ -136,20 +133,18 @@ PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mni
我们建议使用 Fluid API,因为它更容易学起来。
下面是快速的 Fluid API 概述。
1. `inference_program`:指定如何从数据输入中获得预测的函数。
这是指定网络流的地方。
2. `train_program`:指定如何从 `inference_program``标签值`中获取 `loss` 的函数。
1. `train_program`:指定如何从 `inference_program``标签值`中获取 `loss` 的函数。
这是指定损失计算的地方。
3. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。
1. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。
4. `Trainer`:PaddlePaddle Trainer 管理由 `train_program``optimizer` 指定的训练过程。
1. `Trainer`:PaddlePaddle Trainer 管理由 `train_program``optimizer` 指定的训练过程。
通过 `event_handler` 回调函数,用户可以监控培训的进展。
5. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
1. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
然后,它可以推断数据和返回预测。
在这个演示中,我们将深入了解它们。
......@@ -240,6 +235,7 @@ def train_program():
acc = fluid.layers.accuracy(input=predict, label=label)
return [avg_cost, acc]
```
#### Optimizer Function 配置
......@@ -255,9 +251,9 @@ def optimizer_program():
下一步,我们开始训练过程。`paddle.dataset.movielens.train()``paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python yield generator。
下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B 。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。
下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。
`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader 。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。
`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。
```python
train_reader = paddle.batch(
......@@ -280,7 +276,6 @@ place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_program)
```
#### Event Handler 配置
......@@ -315,11 +310,10 @@ def event_handler(event):
`event_handler_plot` 可以用来在训练过程中画图如下:
<p align="center">
<img src="image/train_and_test2.png" width="400"><br/>
图7. 训练结果
</p>
<div align="center">
<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/train_and_test.png?raw=true" width="400"><br/>
图7 训练结果
</div>
```python
......
############
模型预测部署
############
PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线
.. toctree::
:maxdepth: 2
build_and_install_lib_cn.rst
native_infer.rst
......@@ -4,11 +4,13 @@ Paddle 预测 API
为了更简单方便的预测部署,Fluid 提供了一套高层 API
用来隐藏底层不同的优化实现。
`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
包括
- 头文件 ``paddle_inference_api.h`` 定义了所有的接口
- 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
- 库文件 ``libpaddle_inference_api.so`` 或
``libpaddle_inference_api.a``
编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
......@@ -95,7 +97,8 @@ engine
CHECK(predictor->Run(slots, &outputs));
// 获取 outputs ...
编译时,联编 ``libpaddle_fluid.a/.so`` 即可。
编译时,联编 ``libpaddle_fluid.a/.so`` 和
``libpaddle_inference_api.a/.so`` 便可。
详细代码参考
------------
......
......@@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式:
:maxdepth: 2
feeding_data
use_recordio_reader
Python Reader
#############
......
.. _user_guide_use_recordio_as_train_data:
############################
使用RecordIO文件作为训练数据
############################
相比于 :ref:`user_guide_use_numpy_array_as_train_data`,
:ref:`user_guide_use_recordio_as_train_data` 的性能更好;
但是用户需要先将训练数据集转换成RecordIO文件格式,再使用
:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝,
使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
将训练数据转换成RecordIO文件格式
################################
:code:`fluid.recordio_writer` 中,每个记录都是一个
:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
的所有特征。例如对于图像分类来说,这个数组可以包含图片和分类标签。
用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 可以将
:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
:ref:`user_guide_reader` 转换成多个RecordIO文件。
具体使用方法为:
.. code-block:: python
import paddle.fluid as fluid
import numpy
def reader_creator():
def __impl__():
for i in range(1000):
yield [
numpy.random.random(size=[3,224,224], dtype="float32"),
numpy.random.random(size=[1], dtype="int64")
]
return __impl__
img = fluid.layers.data(name="image", shape=[3, 224, 224])
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
BATCH_SIZE = 32
reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
fluid.recordio_writer.convert_reader_to_recordio_file(
"train.recordio", feeder=feeder, reader_creator=reader)
其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
:ref:`_api_fluid_data_feeder_DataFeeder`
是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
:ref:`user_guide_reader` 。
上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件,
其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整,
用户可以将每一个Record的样本数设置成1。并参考
:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
配置神经网络, 打开RecordIO文件
##############################
RecordIO文件转换好之后,用户可以使用 :code:`fluid.layers.open_files()`
打开文件,并使用 :code:`fluid.layers.read_file` 读取文件内容。
简单使用方法如下:
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(
filenames=["train.recordio"],
shape=[[3, 224, 224], [1]],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
pass_num=100
)
image, label = fluid.layers.read_file(file_obj)
其中如果设置了 :code:`pass_num` ,那么当所有数据读完后,会重新读取数据,
直到读取了 :code:`pass_num` 遍。
进阶使用
########
使用 :code:`fluid.layers.double_buffer()`
------------------------------------------
:code:`Double buffer` 使用双缓冲技术,将训练数据从内存中复制到显存中。配置双缓冲
需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如:
.. code-block:: python
import paddle.fliud as fluid
file_obj = fluid.layers.open_files(...)
file_obj = fluid.layers.double_buffer(file_obj)
image, label = fluid.layers.read_file(file_obj)
双缓冲技术可以参考
`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_ 。
配置数据增强
------------
使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(...)
preprocessor = fluid.layers.Preprocessor(reader=data_file)
with preprocessor.block():
image, label = preprocessor.inputs()
image = image / 2
label = label + 1
preprocessor.outputs(image, label)
如上代码所示,使用 :code:`Preprocessor` 定义了一个数据增强模块,并在
:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
:code:`preprocessor.outputs()` 标记预处理后的输出。
.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
使用Op组batch
-------------
使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(...)
file_obj = fluid.layers.batch(file_obj, batch_size=32)
img, label = fluid.layers.read_file(file_obj)
需要注意的是,如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据,
那么这几个样本直接组成一个批量数据进行训练。
读入数据的shuffle
-----------------
使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(...)
file_obj = fliud.layers.shuffle(file_obj, buffer_size=8192)
img, label = fliud.layers.read_file(file_obj)
需要注意的是:
1. :code:`shuffle` 实现方法是:
先读入 :code:`buffer_size` 条样本,再随机的选出样本进行训练。
2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存,需要确定训练过程中内存
足够支持缓存 :code:`buffer_size` 条数据。
......@@ -15,4 +15,5 @@
howto/training/index
howto/debug/index
howto/evaluation/index
howto/inference/index
models/index.rst
......@@ -170,6 +170,7 @@ paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], vara
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
......
.tensor_util.cu
.data_type_transform.cu
\ No newline at end of file
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
function(windows_symbolic TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
endif()
add_custom_command(OUTPUT .${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
endforeach()
endfunction()
add_subdirectory(ir)
if (NOT WIN32)
add_subdirectory(details)
......@@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
if (WIN32)
windows_symbolic(tensor_util SRCS tensor_util.cu)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
endif()
......@@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function)
if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
if (WIN32)
windows_symbolic(hidden_file SRCS data_type_transform.cu)
nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
add_dependencies(data_type_transform hidden_file)
else()
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
endif(WIN32)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
......
......@@ -46,7 +46,7 @@ struct CastDataLayout {
const std::vector<int> axis_;
template <typename T>
void operator()() {
void apply() {
auto place = ctx_->GetPlace();
if (platform::is_cpu_place(place)) {
......
......@@ -26,75 +26,40 @@ namespace framework {
extern proto::VarType::Type ToDataType(std::type_index type);
extern std::type_index ToTypeIndex(proto::VarType::Type type);
#if !defined(_WIN32)
template <typename Visitor>
inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
switch (type) {
case proto::VarType::FP16:
visitor.template operator()<platform::float16>();
visitor.template apply<platform::float16>();
break;
case proto::VarType::FP32:
visitor.template operator()<float>();
visitor.template apply<float>();
break;
case proto::VarType::FP64:
visitor.template operator()<double>();
visitor.template apply<double>();
break;
case proto::VarType::INT32:
visitor.template operator()<int>();
visitor.template apply<int>();
break;
case proto::VarType::INT64:
visitor.template operator()<int64_t>();
visitor.template apply<int64_t>();
break;
case proto::VarType::BOOL:
visitor.template operator()<bool>();
visitor.template apply<bool>();
break;
case proto::VarType::UINT8:
visitor.template operator()<uint8_t>();
visitor.template apply<uint8_t>();
break;
case proto::VarType::INT16:
visitor.template operator()<int16_t>();
visitor.template apply<int16_t>();
break;
case proto::VarType::INT8:
visitor.template operator()<int8_t>();
visitor.template apply<int8_t>();
break;
default:
PADDLE_THROW("Not supported %d", type);
}
}
#else
// the msvc compiler do not implement two-stage name lookup correctly.
template <typename Visitor>
inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
switch (type) {
case proto::VarType::FP16:
visitor.operator()<platform::float16>();
break;
case proto::VarType::FP32:
visitor.operator()<float>();
break;
case proto::VarType::FP64:
visitor.operator()<double>();
break;
case proto::VarType::INT32:
visitor.operator()<int>();
break;
case proto::VarType::INT64:
visitor.operator()<int64_t>();
break;
case proto::VarType::BOOL:
visitor.operator()<bool>();
break;
case proto::VarType::UINT8:
visitor.operator()<uint8_t>();
break;
case proto::VarType::INT16:
visitor.operator()<int16_t>();
break;
default:
PADDLE_THROW("Not supported %d", type);
}
}
#endif // _WIN32
extern std::string DataTypeToString(const proto::VarType::Type type);
extern size_t SizeOfType(std::type_index type);
......
......@@ -37,7 +37,7 @@ struct CastDataType {
const platform::DeviceContext* ctx_;
template <typename OutType>
void operator()() {
void apply() {
auto* in_begin = in_.data<InType>();
auto* in_end = in_begin + in_.numel();
auto* out_begin = out_->mutable_data<OutType>(in_.place());
......
......@@ -31,7 +31,7 @@ struct ReduceLoDTensor {
: src_tensors_(src), dst_tensor_(*dst) {}
template <typename T>
void operator()() const {
void apply() const {
PADDLE_ENFORCE(!src_tensors_.empty());
auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
......
......@@ -13,6 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -216,11 +219,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
float* out_data = out->mutable_data<float>(platform::CPUPlace());
std::array<const float*, 4> tensors(
{W_forget_w0.data<float>(), W_input_w0.data<float>(),
W_output_w0.data<float>(), W_cell_w0.data<float>()});
{{W_forget_w0.data<float>(), W_input_w0.data<float>(),
W_output_w0.data<float>(), W_cell_w0.data<float>()}});
std::array<const float*, 4> tensors1(
{W_forget_w1.data<float>(), W_input_w1.data<float>(),
W_output_w1.data<float>(), W_cell_w1.data<float>()});
{{W_forget_w1.data<float>(), W_input_w1.data<float>(),
W_output_w1.data<float>(), W_cell_w1.data<float>()}});
for (int row = 0; row < D; row++) {
for (int col = 0; col < 4; col++) {
......@@ -243,8 +246,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out) {
std::array<const float*, 4> tensors(
{B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
B_cell.data<float>()});
{{B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
B_cell.data<float>()}});
PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
int D = B_forget.dims()[0];
......
......@@ -13,37 +13,38 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
std::unordered_set<int> fused_ops({// first lstm
13, 15, 16,
// second lstm
23, 25, 26});
pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); },
"any_node");
std::string GenNodeName(const std::string& prefix, const std::string& name) {
return prefix + "/" + name;
}
std::unordered_set<Node*> marked_nodes;
void BuildPattern(PDPattern* pattern, const std::string& name_scope,
bool with_fc_bias) {
PDNode* x = pattern->NewNode(name_scope, "x")
->assert_is_op_input("mul")
->assert_var_not_persistable();
auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
patterns::LSTM(pattern, name_scope, fc_out);
// LOG(INFO) << "\n" << pattern->DotString();
}
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
bool with_fc_bias) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node"));
marked_nodes.insert(id);
};
gpd(graph.get(), handler);
BuildPattern(pattern, name_scope, with_fc_bias);
// Create New OpDesc
auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
int bias, int hidden, int cell, int xx) {
int bias, int hidden, int cell, int xx, int fc_bias) {
#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
GET_NODE(input);
GET_NODE(weight_x);
......@@ -61,23 +62,64 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h);
SET_IN(Bias, bias);
#undef GET_NODE
#undef SET_IN
if (with_fc_bias) {
// Add FC-bias with LSTM-bias and create a new weight
PADDLE_ENFORCE(scope);
const std::string& new_bias_var = name_scope + "_bias.new";
auto* bias_var = scope->Var(new_bias_var);
PADDLE_ENFORCE(bias_var);
auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
auto* lstm_bias_var = scope->FindVar(bias_n->Name());
PADDLE_ENFORCE(lstm_bias_var);
const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
bias_tensor->Resize(lstm_bias_tensor.dims());
GET_NODE(fc_bias);
auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
auto* data = bias_tensor->mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < bias_tensor->numel(); i++) {
data[i] =
fc_bias_tensor.data<float>()[i] + lstm_bias_tensor.data<float>()[i];
}
op_desc.SetInput("Bias", {new_bias_var});
}
VLOG(4) << "hidden_n: " << hidden_n->Name();
VLOG(4) << "cell: " << cell_n->Name();
VLOG(4) << "xx: " << xx_n->Name();
#undef GET_NODE
op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetOutput("Cell", {cell_n->Name()});
op_desc.SetOutput("XX", {xx_n->Name()});
op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"});
op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"});
op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", false);
op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr
op_desc.SetAttr("use_seq", true);
#define TMP_NAME(x) "at.new.tmp." #x
#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
OP_SET_OUT(BatchedCell);
OP_SET_OUT(BatchedHidden);
OP_SET_OUT(ReorderedH0);
OP_SET_OUT(ReorderedC0);
#undef OP_SET_OUT
auto* op = graph->CreateOpNode(&op_desc);
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get<Scope*>(kParamScopeAttr);
#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable<LoDTensor>()
TMP_NEW(BatchedCell);
TMP_NEW(BatchedHidden);
TMP_NEW(ReorderedH0);
TMP_NEW(ReorderedC0);
#undef TMP_NEW
#undef TMP_NAME
#define LINK_TO(a, b) \
a->outputs.push_back(b); \
......@@ -89,33 +131,70 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
LINK_TO(op, hidden_n);
#undef LINK_TO
return op;
};
int fusion_count{0};
auto fc_no_bias_handler = [&](
const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
#define GET_NODE(name__) \
std::string name__##key = name_scope + "/" + #name__; \
auto* name__##n = pattern->RetrieveNode(name__##key); \
PADDLE_ENFORCE(name__##n); \
PADDLE_ENFORCE(subgraph.count(name__##n)); \
Node* name__##_n = subgraph.at(name__##n); \
int name__ __attribute__((unused)) = name__##_n->id();
GET_NODE(x);
GET_NODE(w);
GET_NODE(mul);
GET_NODE(fc_out);
GET_NODE(Weight);
GET_NODE(lstm);
GET_NODE(Bias);
GET_NODE(Hidden);
GET_NODE(Cell);
if (with_fc_bias) {
GET_NODE(fc_bias);
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
} else {
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
}
#undef GET_NODE
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
GraphSafeRemoveNodes(graph, marked_nodes);
++fusion_count;
};
lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
lstm_creator(26, 12, 24, 28, 27, 32, 31, 29);
gpd(graph, fc_no_bias_handler);
// remove all the nodes
return fusion_count;
}
for (auto* node : marked_nodes) {
graph->RemoveNode(const_cast<Node*>(node));
}
std::unique_ptr<ir::Graph> MulLstmFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
for (auto* node : graph->Nodes()) {
for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (marked_nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it);
} else
it++;
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (marked_nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it);
} else
it++;
}
}
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
false /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
true /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
......@@ -123,4 +202,5 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
} // namespace framework
} // namespace paddle
REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
......@@ -12,20 +12,34 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class FCLstmFusePass : public Pass {
// The MulLstmFusePass and MulLstmFusePass will fuse to the same FusionLstm op.
// Just FC without bias
class FCLstmFusePass : public FusePassBase {
public:
virtual ~FCLstmFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_lstm_fuse"};
};
class MulLstmFusePass : public FusePassBase {
public:
virtual ~MulLstmFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_nobias_lstm_fuse"};
};
} // namespace ir
......
......@@ -167,7 +167,6 @@ class Graph {
std::map<std::string, std::function<void(void)>> attr_dels_;
std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
std::unordered_set<ir::Node *> node_set_;
int node_count_{0};
};
bool IsControlDepVar(const ir::Node &var);
......
......@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -71,7 +72,11 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
void GraphPatternDetector::operator()(Graph* graph,
GraphPatternDetector::handle_t handler) {
if (!MarkPDNodesInGraph(*graph)) return;
if (!MarkPDNodesInGraph(*graph)) {
LOG(INFO) << "Mark failed";
return;
}
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
......@@ -87,7 +92,7 @@ void GraphPatternDetector::operator()(Graph* graph,
}
bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
VLOG(4) << "mark pdnodes in graph";
VLOG(3) << "mark pdnodes in graph";
if (graph.Nodes().empty()) return false;
for (auto& node : GraphTraits::DFS(graph)) {
......@@ -107,6 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
}
}
VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
return !pdnodes2nodes_.empty();
}
......@@ -357,7 +363,9 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type,
assert_is_op_input(op_type);
asserts_.emplace_back([=](Node* x) {
for (auto* op : x->outputs) {
if (IsNthInput(x, op, argument, nth)) return true;
if (op->IsOp() && op->Op()->Type() == op_type &&
IsNthInput(x, op, argument, nth))
return true;
}
return false;
});
......@@ -368,7 +376,9 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type,
assert_is_var();
asserts_.emplace_back([=](Node* x) {
for (auto* op : x->inputs) {
if (IsNthOutput(x, op, argument, nth)) return true;
if (op->IsOp() && op->Op()->Type() == op_type &&
IsNthOutput(x, op, argument, nth))
return true;
}
return false;
});
......@@ -412,6 +422,12 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) {
});
return this;
}
PDNode* PDNode::assert_is_op_output(const std::string& op_type,
const std::string& argument) {
assert_is_var();
assert_is_op_nth_output(op_type, argument, 0);
return this;
}
PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
assert_is_var();
asserts_.emplace_back([=](Node* x) {
......@@ -424,6 +440,12 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
});
return this;
}
PDNode* PDNode::assert_is_op_input(const std::string& op_type,
const std::string& argument) {
assert_is_var();
assert_is_op_nth_input(op_type, argument, 0);
return this;
}
PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) {
assert_is_op(op_type);
asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; });
......@@ -439,6 +461,128 @@ PDNode* PDNode::assert_more(PDNode::teller_t&& teller) {
return this;
}
bool VarLinksToOp(Node* node, const std::string& op_type) {
for (auto* out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth) {
PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp());
if (op->Op()->Input(argument).size() <= nth) return false;
return var->Name() == op->Op()->Input(argument)[nth];
}
bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth) {
PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp());
if (op->Op()->Output(argument).size() <= nth) return false;
return var->Name() == op->Op()->Output(argument)[nth];
}
void GraphSafeRemoveNodes(Graph* graph,
const std::unordered_set<const Node*>& nodes) {
for (auto* node : nodes) {
graph->RemoveNode(const_cast<Node*>(node));
}
for (auto* node : graph->Nodes()) {
for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it);
} else
it++;
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it);
} else
it++;
}
}
}
bool VarLinksFromOp(Node* node, const std::string& op_type) {
for (auto* out : node->inputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
PDNode* x, bool with_bias) {
// Create Operators
PDNode* elementwise_add_op{nullptr};
auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
if (with_bias) {
elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
->assert_is_op("elementwise_add");
}
// Create variables
// w
auto* mul_weight_var = pattern->NewNode(name_scope, "w")
->AsInput()
->assert_is_persistable_var()
->assert_is_op_nth_input("mul", "Y", 0);
PDNode* mul_out_var{nullptr};
if (with_bias) {
// intermediate variable, will be removed in the IR after fuse.
mul_out_var = pattern->NewNode(name_scope, "mul_out")
->AsIntermediate()
->assert_is_only_output_of_op("mul")
->assert_is_op_input("elementwise_add");
}
PDNode *bias{nullptr}, *fc_out{nullptr};
if (with_bias) {
// bias
bias = pattern->NewNode(name_scope, "fc_bias")
->assert_is_op_input("elementwise_add")
->AsInput();
// output
fc_out = pattern->NewNode(name_scope, "fc_out")
->AsOutput()
->assert_is_op_output("elementwise_add");
} else {
fc_out = pattern->NewNode(name_scope, "fc_out")
->AsOutput()
->assert_is_op_output("mul");
}
if (with_bias) {
mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
} else {
mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
}
return fc_out;
}
PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
PDNode* x) {
x->assert_is_op_input("lstm", "Input");
auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
#define NEW_NODE(arg__, io__) \
auto* arg__ = pattern->NewNode(name_scope, #arg__) \
->assert_is_op_##io__("lstm", #arg__);
// Currently, the H0 and C0 are optional
// TODO(Superjomn) upgrade the fuse framework to support optional.
// NEW_NODE(H0, input);
// NEW_NODE(C0, input);
NEW_NODE(Weight, input);
NEW_NODE(Bias, input);
NEW_NODE(Hidden, output);
NEW_NODE(Cell, output);
NEW_NODE(BatchGate, output);
NEW_NODE(BatchCellPreAct, output);
lstm_op->LinksFrom({x, Weight, Bias});
lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
return Hidden;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -95,7 +95,11 @@ struct PDNode {
PDNode* assert_var_not_persistable();
PDNode* assert_is_persistable_var();
PDNode* assert_is_op_output(const std::string& op_type);
PDNode* assert_is_op_output(const std::string& op_type,
const std::string& argument);
PDNode* assert_is_op_input(const std::string& op_type);
PDNode* assert_is_op_input(const std::string& op_type,
const std::string& argument);
PDNode* assert_is_op_nth_input(const std::string& op_type,
const std::string& argument, int nth);
PDNode* assert_is_op_nth_output(const std::string& op_type,
......@@ -167,6 +171,9 @@ class PDPattern {
PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
PDNode* NewNode(const std::string& name = NewID());
PDNode* NewNode(const std::string& prefix, const std::string& name) {
return NewNode(prefix + "/" + name);
}
PDNode* RetrieveNode(const std::string& id) const;
const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
......@@ -257,64 +264,36 @@ class GraphPatternDetector {
// some helper methods.
// Op's input.
static bool VarLinksToOp(Node* node, const std::string& op_type) {
for (auto* out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
// Op's output.
static bool VarLinksFromOp(Node* node, const std::string& op_type) {
for (auto* out : node->inputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
// Tell if a var links to an Op
bool VarLinksToOp(Node* node, const std::string& op_type);
// Tell if an op links to a var
bool VarLinksFromOp(Node* node, const std::string& op_type);
// Check whether a var node is a op node's nth input.
static bool IsNthInput(Node* var, Node* op, const std::string& argument,
size_t nth) {
PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp());
if (op->inputs.size() <= nth) return false;
return var->Name() == op->Op()->Input(argument)[nth];
}
static bool IsNthOutput(Node* var, Node* op, const std::string& argument,
size_t nth) {
PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp());
if (op->inputs.size() <= nth) return false;
return var->Name() == op->Op()->Output(argument)[nth];
}
static void GraphSafeRemoveNodes(Graph* graph,
const std::unordered_set<const Node*>& nodes) {
for (auto* node : nodes) {
graph->RemoveNode(const_cast<Node*>(node));
}
bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);
for (auto* node : graph->Nodes()) {
for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it);
} else
it++;
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it);
} else
it++;
}
}
}
// Tell whether a var node is a op node's nth output.
bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
// Graph safely remove some nodes, will automatically clean up the edges.
void GraphSafeRemoveNodes(Graph* graph,
const std::unordered_set<const Node*>& nodes);
// Some pre-defined patterns those can be reused in multiple passes.
namespace patterns {
// FC with bias
// op: mul + elementwise_add
// named nodes:
// mul, elementwise_add
// w, mul_out, bias, fc_out
PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,
bool with_bias);
PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
} // namespace patterns
} // namespace ir
} // namespace framework
......
......@@ -42,6 +42,13 @@ class GraphVizPass : public Pass {
marked_nodes_t ConsumeMarkedNodes(Graph* graph) const;
};
static GraphVizPass::marked_nodes_t& GetMarkedNodes(Graph* graph) {
if (!graph->Has(kGraphvizMarkedNodeAttr)) {
graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
}
return graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -56,5 +56,76 @@ struct RWLock {
};
#endif
class RWLockGuard {
public:
enum Status { kUnLock, kWRLock, kRDLock };
RWLockGuard(RWLock* rw_lock, Status init_status)
: lock_(rw_lock), status_(Status::kUnLock) {
switch (init_status) {
case Status::kRDLock: {
RDLock();
break;
}
case Status::kWRLock: {
WRLock();
break;
}
case Status::kUnLock: {
break;
}
}
}
void WRLock() {
switch (status_) {
case Status::kUnLock: {
lock_->WRLock();
status_ = Status::kWRLock;
break;
}
case Status::kWRLock: {
break;
}
case Status::kRDLock: {
PADDLE_THROW(
"Please unlock read lock first before invoking write lock.");
break;
}
}
}
void RDLock() {
switch (status_) {
case Status::kUnLock: {
lock_->RDLock();
status_ = Status::kRDLock;
break;
}
case Status::kRDLock: {
break;
}
case Status::kWRLock: {
PADDLE_THROW(
"Please unlock write lock first before invoking read lock.");
break;
}
}
}
void UnLock() {
if (status_ != Status::kUnLock) {
lock_->UNLock();
status_ = Status::kUnLock;
}
}
~RWLockGuard() { UnLock(); }
private:
RWLock* lock_;
Status status_;
};
} // namespace framework
} // namespace paddle
......@@ -49,7 +49,7 @@ struct TensorCopyVisitor {
size_(size) {}
template <typename T>
void operator()() const {
void apply() const {
// TODO(Yancey1989): support other place
platform::CPUPlace cpu;
memory::Copy(cpu, dst_->mutable_data<T>(cpu) + dst_offset_, cpu,
......
......@@ -149,7 +149,7 @@ struct AnyDTypeVisitor {
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template <typename T>
void operator()() const {
void apply() const {
auto t = EigenVector<T>::Flatten(tensor_);
auto o = EigenScalar<bool>::From(*out_);
// return any of predicate_(t) is true.
......@@ -302,7 +302,7 @@ struct DeserializedDataFunctor {
: buf_(buf), tensor_(tensor), place_(place) {}
template <typename T>
void operator()() {
void apply() {
*buf_ = tensor_->mutable_data<T>(place_);
}
......
......@@ -17,9 +17,9 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
# paddle_fluid_origin exclude inference api interface
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
if(NOT APPLE)
#if(APPLE)
add_subdirectory(api)
endif()
#endif()
# Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
......
......@@ -25,9 +25,8 @@ function (inference_analysis_test TARGET)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS EXTRA_DEPS)
set(multiValueArgs SRCS ARGS EXTRA_DEPS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(mem_opt "")
if(WITH_GPU)
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
......@@ -35,28 +34,25 @@ function (inference_analysis_test TARGET)
cc_test(${TARGET}
SRCS "${analysis_test_SRCS}"
DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS}
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING)
endfunction(inference_analysis_test)
set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
set(DITU_RNN_MODEL ${DITU_INSTALL_DIR}/model)
set(DITU_RNN_DATA ${DITU_INSTALL_DIR}/data.txt)
function (inference_download_and_uncompress target url gz_filename)
function (inference_download_and_uncompress install_dir url gz_filename)
message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
execute_process(COMMAND bash -c "mkdir -p ${DITU_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && wget -q ${url}")
execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && tar xzf ${gz_filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}")
message(STATUS "finish downloading ${gz_filename}")
endfunction(inference_download_and_uncompress)
if (NOT EXISTS ${DITU_INSTALL_DIR})
inference_download_and_uncompress(ditu_rnn_model ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
inference_download_and_uncompress(ditu_rnn_data ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
......@@ -73,8 +69,7 @@ inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
attention_lstm_fuse_pass
paddle_inference_api
pass
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
--infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
--infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
......@@ -87,3 +82,29 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING)
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING)
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api
ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt)
......@@ -109,6 +109,7 @@ void Analyzer::Run(Argument* argument) {
"infer_clean_graph_pass", "graph_viz_pass", //
"attention_lstm_fuse_pass", "graph_viz_pass", //
"fc_lstm_fuse_pass", "graph_viz_pass", //
"mul_lstm_fuse_pass", "graph_viz_pass", //
"seq_concat_fc_fuse_pass", "graph_viz_pass", //
"fc_fuse_pass", "graph_viz_pass" //
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path for LAC");
DEFINE_string(infer_data, "", "data file for LAC");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle {
namespace inference {
namespace analysis {
struct DataRecord {
std::vector<int64_t> data;
std::vector<size_t> lod;
// for dataset and nextbatch
size_t batch_iter{0};
std::vector<std::vector<size_t>> batched_lods;
std::vector<std::vector<int64_t>> batched_datas;
std::vector<std::vector<int64_t>> datasets;
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1) {
Load(path);
Prepare(batch_size);
batch_iter = 0;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
datasets.resize(0);
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ';', &data);
std::vector<int64_t> words_ids;
split_to_int64(data[1], ' ', &words_ids);
datasets.emplace_back(words_ids);
}
}
void Prepare(int bs) {
if (bs == 1) {
batched_datas = datasets;
for (auto one_sentence : datasets) {
batched_lods.push_back({0, one_sentence.size()});
}
} else {
std::vector<int64_t> one_batch;
std::vector<size_t> lod{0};
int bs_id = 0;
for (auto one_sentence : datasets) {
bs_id++;
one_batch.insert(one_batch.end(), one_sentence.begin(),
one_sentence.end());
lod.push_back(lod.back() + one_sentence.size());
if (bs_id == bs) {
bs_id = 0;
batched_datas.push_back(one_batch);
batched_lods.push_back(lod);
one_batch.clear();
one_batch.resize(0);
lod.clear();
lod.resize(0);
lod.push_back(0);
}
}
if (one_batch.size() != 0) {
batched_datas.push_back(one_batch);
batched_lods.push_back(lod);
}
}
}
DataRecord NextBatch() {
DataRecord data;
data.data = batched_datas[batch_iter];
data.lod = batched_lods[batch_iter];
batch_iter++;
if (batch_iter >= batched_datas.size()) {
batch_iter = 0;
}
return data;
}
};
void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
auto one_batch = data->NextBatch();
PaddleTensor input_tensor;
input_tensor.name = "word";
input_tensor.shape.assign({static_cast<int>(one_batch.data.size()), 1});
input_tensor.lod.assign({one_batch.lod});
input_tensor.dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&input_tensor, {one_batch.data});
PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
input_slots->assign({input_tensor});
}
static void PrintTime(const double latency, const int bs, const int repeat) {
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
<< ", avg latency: " << latency / repeat << "ms";
LOG(INFO) << "=====================================";
}
void BenchAllData(const std::string &model_path, const std::string &data_file,
const int batch_size, const int repeat) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
GetOneBatch(&input_slots, &data, batch_size);
for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots);
}
Timer timer;
double sum = 0;
for (int i = 0; i < repeat; i++) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
}
PrintTime(sum, batch_size, repeat);
}
const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
void TestLACPrediction(const std::string &model_path,
const std::string &data_file, const int batch_size,
const int repeat, bool test_all_data) {
if (test_all_data) {
BenchAllData(model_path, data_file, batch_size, repeat);
return;
}
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size);
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots);
}
Timer timer;
timer.tic();
for (int i = 0; i < repeat; i++) {
predictor->Run(input_slots, &outputs_slots);
}
PrintTime(timer.toc(), batch_size, repeat);
EXPECT_EQ(outputs_slots.size(), 1UL);
auto &out = outputs_slots[0];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
PADDLE_ENFORCE_GT(size, 0);
EXPECT_GE(size, batch1_size);
int64_t *pdata = static_cast<int64_t *>(out.data.data());
for (size_t i = 0; i < batch1_size; ++i) {
EXPECT_EQ(pdata[i], lac_ref_data[i]);
}
}
TEST(Analyzer_LAC, native) {
LOG(INFO) << "LAC with native";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
namespace paddle {
namespace inference {
struct DataRecord {
std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
std::vector<std::vector<int64_t>> rnn_word_datas, rnn_mention_datas;
std::vector<size_t> lod; // two inputs have the same lod info.
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= word_data_all.size()) {
data.word_data_all.assign(word_data_all.begin() + batch_iter,
word_data_all.begin() + batch_end);
data.mention_data_all.assign(mention_data_all.begin() + batch_iter,
mention_data_all.begin() + batch_end);
// Prepare LoDs
data.lod.push_back(0);
CHECK(!data.word_data_all.empty());
CHECK(!data.mention_data_all.empty());
CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size());
for (size_t j = 0; j < data.word_data_all.size(); j++) {
data.rnn_word_datas.push_back(data.word_data_all[j]);
data.rnn_mention_datas.push_back(data.mention_data_all[j]);
// calculate lod
data.lod.push_back(data.lod.back() + data.word_data_all[j].size());
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ';', &data);
// load word data
std::vector<int64_t> word_data;
split_to_int64(data[1], ' ', &word_data);
// load mention data
std::vector<int64_t> mention_data;
split_to_int64(data[3], ' ', &mention_data);
word_data_all.push_back(std::move(word_data));
mention_data_all.push_back(std::move(mention_data));
}
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor lod_word_tensor, lod_mention_tensor;
lod_word_tensor.name = "word";
lod_mention_tensor.name = "mention";
auto one_batch = data->NextBatch();
int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size
lod_word_tensor.shape.assign({size, 1});
lod_word_tensor.lod.assign({one_batch.lod});
lod_mention_tensor.shape.assign({size, 1});
lod_mention_tensor.lod.assign({one_batch.lod});
// assign data
TensorAssignData<int64_t>(&lod_word_tensor, one_batch.rnn_word_datas);
TensorAssignData<int64_t>(&lod_mention_tensor, one_batch.rnn_mention_datas);
// Set inputs.
input_slots->assign({lod_word_tensor, lod_mention_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::INT64;
}
}
// the first inference result
const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
48, 39, 38, 16, 25};
void TestChineseNERPrediction() {
NativeConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
// Prepare inputs.
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < FLAGS_repeat; i++) {
predictor->Run(input_slots, &outputs);
}
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << FLAGS_batch_size
<< ", repeat: " << FLAGS_repeat
<< ", latency: " << timer.toc() / FLAGS_repeat << "ms";
LOG(INFO) << "=====================================";
PADDLE_ENFORCE(outputs.size(), 1UL);
auto &out = outputs[0];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
PADDLE_ENFORCE_GT(size, 0);
int64_t *result = static_cast<int64_t *>(out.data.data());
for (size_t i = 0; i < std::min(11UL, size); i++) {
PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
}
}
// Directly infer with the original model.
TEST(Analyzer, Chinese_ner) { TestChineseNERPrediction(); }
} // namespace inference
} // namespace paddle
......@@ -34,7 +34,7 @@ namespace paddle {
namespace inference {
namespace analysis {
using namespace framework;
using namespace framework; // NOLINT
TEST(Analyzer, analysis_without_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
......@@ -201,13 +201,13 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
minute_tensor.lod.assign({one_batch.lod3});
// clang-format on
// assign data
TensorAssignData(&lod_attention_tensor,
std::vector<std::vector<float>>({{0, 0}}));
TensorAssignData<float>(&lod_attention_tensor,
std::vector<std::vector<float>>({{0, 0}}));
std::vector<float> tmp_zeros(batch_size * 15, 0.);
TensorAssignData(&init_zero_tensor, {tmp_zeros});
TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data);
TensorAssignData(&week_tensor, one_batch.rnn_week_datas);
TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas);
TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
// Set inputs.
auto init_zero_tensor1 = init_zero_tensor;
init_zero_tensor1.name = "hidden_init";
......@@ -312,8 +312,8 @@ void TestDituRNNPrediction(const std::string &model_path,
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
for (size_t j = 0; j < size; j++) {
EXPECT_NEAR(data[j], base_data[j], 1e-3);
}
}
......@@ -329,6 +329,7 @@ void TestDituRNNPrediction(const std::string &model_path,
ASSERT_TRUE(fuse_statis.count("fc"));
EXPECT_EQ(fuse_statis.at("fc"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 1);
}
}
......
......@@ -67,7 +67,7 @@ struct Argument {
PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]",
key);
attrs_[key] = data;
attr_deleters_[key] = [data, key, this]() {
attr_deleters_[key] = [data, key]() {
VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
VLOG(3) << "argument delete attr: " << key;
delete data;
......
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sys/time.h>
#include <algorithm>
#include <map>
#include <set>
......@@ -23,32 +22,14 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid");
namespace paddle {
namespace {
// Timer for timer
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
using paddle::inference::Timer;
template <class T>
std::string num2str(T a) {
......@@ -69,7 +50,7 @@ void NativePaddlePredictor::PrepareFeedFetch() {
feed_names_[op->Output("Out")[0]] = idx;
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
if (fetchs_.size() <= idx) {
if (fetchs_.size() <= static_cast<size_t>(idx)) {
fetchs_.resize(idx + 1);
}
fetchs_[idx] = op;
......@@ -80,7 +61,7 @@ void NativePaddlePredictor::PrepareFeedFetch() {
bool NativePaddlePredictor::Init(
std::shared_ptr<framework::Scope> parent_scope) {
VLOG(3) << "Predictor::init()";
#if !defined(_WIN32)
if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'";
......@@ -89,6 +70,7 @@ bool NativePaddlePredictor::Init(
: platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device);
}
#endif
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
......@@ -133,10 +115,12 @@ bool NativePaddlePredictor::Init(
}
NativePaddlePredictor::~NativePaddlePredictor() {
#if !defined(_WIN32)
if (FLAGS_profile) {
platform::DisableProfiler(platform::EventSortingKey::kTotal,
"./profile.log");
}
#endif
if (sub_scope_) {
scope_->DeleteScope(sub_scope_);
}
......@@ -179,8 +163,13 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
LOG(ERROR) << "fail to call Init";
return nullptr;
}
#ifdef __clang__
// fix clang compile error
return cls;
#else
// fix manylinux compile error.
return std::move(cls);
#endif
}
bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
......@@ -222,6 +211,62 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
}
return true;
}
template <typename T>
void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
PaddleTensor *output) {
std::vector<int> shape;
auto dims_i = fetch.dims();
auto lod = fetch.lod();
const T *output_ptr = fetch.data<T>();
auto num = fetch.numel();
std::vector<T> data;
if (0 == lod.size()) {
std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
for (int j = 0; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
} else {
// for batch detection
// image[0] -> output[0] shape {145, 6}
// image[1] -> output[1] shape {176, 6}
// then,
// the batch output shape {321, 6}
// the lod {{0, 145, 321}}
// so we should append output[0] to {176, 6}
size_t max_dim = 0;
for (size_t j = 1; j < lod[0].size(); j++) {
max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
}
size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
if (max_dim > 0) {
data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
}
for (size_t j = 1; j < lod[0].size(); j++) {
size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim;
if (end > start) {
std::copy(output_ptr + start, output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim);
}
}
shape.push_back(lod[0].size() - 1);
shape.push_back(max_dim);
for (int j = 1; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
}
output->shape = shape;
auto &buffer = output->data;
if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
buffer.Resize(sizeof(T) * data.size());
}
std::memcpy(buffer.data(), data.data(), buffer.length());
// copy LoD
for (const auto &level : fetch.lod()) {
output->lod.emplace_back(level);
}
}
bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::Scope *scope) {
......@@ -229,70 +274,20 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
outputs->resize(fetchs_.size());
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
PADDLE_ENFORCE(idx == i);
framework::LoDTensor &output =
PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch =
framework::GetFetchVariable(*scope, "fetch", idx);
// TODO(panyx0718): Support fetch of other types.
if (output.type() != typeid(float)) {
LOG(ERROR) << "only support fetching float now.";
return false;
}
std::vector<int> shape;
auto dims_i = output.dims();
auto lod = output.lod();
const float *output_ptr = output.data<float>();
// const int64_t* output_ptr = fetchs[i].data<int64_t>();
auto num = output.numel();
std::vector<float> data;
if (0 == lod.size()) {
std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
for (int j = 0; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
auto type = fetch.type();
auto output = &(outputs->at(i));
if (type == typeid(float)) {
GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32;
} else if (type == typeid(int64_t)) {
GetFetchOne<int64_t>(fetch, output);
output->dtype = PaddleDType::INT64;
} else {
// for batch detection
// image[0] -> output[0] shape {145, 6}
// image[1] -> output[1] shape {176, 6}
// then,
// the batch output shape {321, 6}
// the lod {{0, 145, 321}}
// so we should append output[0] to {176, 6}
size_t max_dim = 0;
for (size_t j = 1; j < lod[0].size(); j++) {
max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
}
size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
if (max_dim > 0) {
data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
}
for (size_t j = 1; j < lod[0].size(); j++) {
size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim;
if (end > start) {
std::copy(output_ptr + start, output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim);
}
}
shape.push_back(lod[0].size() - 1);
shape.push_back(max_dim);
for (int j = 1; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
}
outputs->at(i).shape = shape;
auto &buffer = outputs->at(i).data;
if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
buffer.Resize(sizeof(float) * data.size());
}
std::memcpy(buffer.data(), data.data(), buffer.length());
// copy LoD
for (const auto &level : output.lod()) {
outputs->at(i).lod.emplace_back(level);
LOG(ERROR) << "unknown type, only support float32 and int64 now.";
}
outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy.
}
return true;
}
......@@ -323,7 +318,12 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
return nullptr;
}
#ifdef __clang__
// fix clang compile error
return predictor;
#else
return std::move(predictor);
#endif
}
} // namespace paddle
......@@ -51,7 +51,9 @@ class NativePaddlePredictor : public PaddlePredictor {
framework::Scope *scope);
bool GetFetch(std::vector<PaddleTensor> *output_data,
framework::Scope *scope);
template <typename T>
void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data);
void PrepareFeedFetch();
NativeConfig config_;
......
......@@ -3,6 +3,11 @@ cmake_minimum_required(VERSION 3.0)
project(cpp_inference_demo CXX C)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
else()
set(CMAKE_STATIC_LIBRARY_PREFIX "")
endif()
if(NOT DEFINED PADDLE_LIB)
message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
......@@ -32,44 +37,56 @@ endif(NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3")
if (NOT WIN32)
link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
endif(NOT WIN32)
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
link_directories("${PADDLE_LIB}/paddle/fluid/inference")
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL)
include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
if(EXISTS ${MKLDNN_PATH})
include_directories("${MKLDNN_PATH}/include")
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
endif()
else()
set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
endif()
# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
if(WITH_STATIC_LIB)
set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
else()
set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
glog gflags protobuf snappystream snappy z
${EXTERNAL_LIB})
else()
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
${EXTERNAL_LIB})
endif(NOT WIN32)
if(WITH_GPU)
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
target_link_libraries(${DEMO_NAME} ${DEPS})
......@@ -20,30 +20,11 @@
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
namespace paddle {
namespace inference {
// Timer for timer
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) {
pieces->clear();
......@@ -68,6 +49,13 @@ static void split_to_float(const std::string &str, char sep,
std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs),
[](const std::string &v) { return std::stof(v); });
}
static void split_to_int64(const std::string &str, char sep,
std::vector<int64_t> *is) {
std::vector<std::string> pieces;
split(str, sep, &pieces);
std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is),
[](const std::string &v) { return std::stoi(v); });
}
template <typename T>
std::string to_string(const std::vector<T> &vec) {
std::stringstream ss;
......@@ -84,14 +72,18 @@ template <>
std::string to_string<std::vector<std::vector<float>>>(
const std::vector<std::vector<std::vector<float>>> &vec);
// clang-format off
static void TensorAssignData(PaddleTensor *tensor, const std::vector<std::vector<float>> &data) {
template <typename T>
static void TensorAssignData(PaddleTensor *tensor,
const std::vector<std::vector<T>> &data) {
// Assign buffer
int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; });
tensor->data.Resize(sizeof(float) * dim);
int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1,
[](int a, int b) { return a * b; });
tensor->data.Resize(sizeof(T) * dim);
int c = 0;
for (const auto &f : data) {
for (float v : f) { static_cast<float *>(tensor->data.data())[c++] = v; }
for (T v : f) {
static_cast<T *>(tensor->data.data())[c++] = v;
}
}
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono> // NOLINT
namespace paddle {
namespace inference {
// Timer for timer
class Timer {
public:
std::chrono::high_resolution_clock::time_point start;
std::chrono::high_resolution_clock::time_point startu;
void tic() { start = std::chrono::high_resolution_clock::now(); }
double toc() {
startu = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_span =
std::chrono::duration_cast<std::chrono::duration<double>>(startu -
start);
double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
return used_time_ms;
}
};
} // namespace inference
} // namespace paddle
......@@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = static_cast<T>(1) /
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
dx.device(d) = dout * ((beta * out) + temp2);
auto temp2 = temp1 * (static_cast<T>(1) - (static_cast<T>(beta) * out));
dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2);
}
};
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/attention_lstm_op.h"
#include <sys/time.h>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
......
......@@ -74,7 +74,7 @@ struct BeamSearchDecodeFunctor {
}
template <typename T>
void operator()() const;
void apply() const;
bool tensor_on_gpu_;
size_t beam_size_;
......@@ -88,7 +88,7 @@ struct BeamSearchDecodeFunctor {
};
template <typename T>
void BeamSearchDecodeFunctor::operator()() const {
void BeamSearchDecodeFunctor::apply() const {
BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
// Check if the tensor is on GPU. If so, use the CPU copy instead
if (tensor_on_gpu_) {
......@@ -101,7 +101,7 @@ void BeamSearchDecodeFunctor::operator()() const {
}
template <>
void BeamSearchDecodeFunctor::operator()<bool>() const {
void BeamSearchDecodeFunctor::apply<bool>() const {
PADDLE_THROW("beam search decode op does not support bool!");
}
......
......@@ -37,7 +37,7 @@ struct CastOpFunctor {
: in_(in), out_(out), ctx_(ctx) {}
template <typename OutT>
void operator()() const {
void apply() const {
auto* in_begin = in_->data<InT>();
auto numel = in_->numel();
auto* in_end = in_begin + numel;
......
......@@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
output_channels / groups * output_height * output_width * output_depth;
int group_offset_filter = filter->numel() / groups;
// ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (user_workspace_size > 0) {
......@@ -159,20 +158,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
// Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_filter_desc, filter_data + i * group_offset_filter,
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
&beta, cudnn_output_desc, output_data + i * group_offset_out));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_filter_desc, filter_data + i * group_offset_filter,
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
&beta, cudnn_output_desc, output_data + i * group_offset_out));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
......@@ -314,11 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_filter_desc, filter_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
if (input_grad) {
......@@ -326,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset input_grad.
for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc,
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data + i * group_offset_in));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc,
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc,
data_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_input_desc, input_grad_data + i * group_offset_in));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
}
// ------------------- cudnn conv backward filter ---------------------
......@@ -339,16 +335,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset filter_grad.
for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
cudnn_conv_desc, filter_algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_filter_desc,
filter_grad_data + i * group_offset_filter));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc,
input_data + i * group_offset_in, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc,
filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_filter_desc, filter_grad_data + i * group_offset_filter));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
......
......@@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
conv_desc.descriptor<T>(paddings, strides, dilations);
// ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
if (user_workspace_size > 0) {
......@@ -100,25 +99,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv transpose forward ---------------------
int input_offset = input->numel() / input->dims()[0] / groups;
int output_offset = output->numel() / output->dims()[0] / groups;
int filter_offset = filter->numel() / groups;
T alpha = 1.0f, beta = 0.0f;
for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_output_desc, output_data + output_offset * g));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_output_desc, output_data + output_offset * g));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
......@@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
std::max(workspace_size_in_bytes, bwd_filter_ws_size);
}
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
int input_offset = input->numel() / input->dims()[0] / groups;
......@@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad.
for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc,
output_grad_data + output_grad_offset * g, cudnn_filter_desc,
filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data + input_offset * g));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc,
output_grad_data + output_grad_offset * g, cudnn_filter_desc,
filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data + input_offset * g));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
}
......@@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter
for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc,
output_grad_data + output_grad_offset * g, cudnn_input_desc,
input_data + input_offset * g, cudnn_conv_desc, filter_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
filter_grad_data + filter_offset * g));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc,
output_grad_data + output_grad_offset * g, cudnn_input_desc,
input_data + input_offset * g, cudnn_conv_desc, filter_algo,
cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_filter_desc, filter_grad_data + filter_offset * g));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
......
......@@ -33,7 +33,7 @@ struct AppendProposalsFunctor {
: out_(out), offset_(offset), to_add_(to_add) {}
template <typename T>
void operator()() const {
void apply() const {
auto *out_data = out_->data<T>();
auto *to_add_data = to_add_->data<T>();
memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T));
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <algorithm>
#include <iterator>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
......@@ -94,8 +95,11 @@ class RowwiseTransformIterator;
template <typename T, typename DeviceContext>
class MidWiseTransformIterator;
// NOTE(dzhwinter): ptrdiff_t in iterator is deperecated in c++17
template <typename T>
class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
class RowwiseTransformIterator<T, platform::CPUDeviceContext>
: public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
T *, T &> {
public:
RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
......@@ -126,7 +130,9 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
};
template <typename T>
class MidWiseTransformIterator<T, platform::CPUDeviceContext> {
class MidWiseTransformIterator<T, platform::CPUDeviceContext>
: public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
T *, T &> {
public:
MidWiseTransformIterator(const T *ptr, int n, int post)
: ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
......@@ -479,8 +485,13 @@ void ElemwiseGradComputeNoBroadcast(
const framework::Tensor &dout, int axis, framework::Tensor *dx,
framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) {
size_t N = static_cast<size_t>(framework::product(x_dim));
#if !defined(_WIN32)
platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(), N);
#else
platform::ForRange<DeviceContext> for_range(
ctx.device_context<DeviceContext>(), N);
#endif // !_WIN32
for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
......@@ -633,13 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext &ctx,
template <typename Functor, typename DeviceContext, typename T,
typename OutType = T>
void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
const framework::Tensor *x,
const framework::Tensor *y, int axis, Functor func,
framework::Tensor *z) {
TransformFunctor<Functor, T, DeviceContext, OutType> functor(
x, y, z, ctx.template device_context<DeviceContext>(), func);
auto x_dims = x->dims();
auto y_dims_untrimed = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
......
......@@ -25,7 +25,7 @@ struct FillOpVisitor {
: tensor_(tensor), value_(value) {}
template <typename T>
void operator()() const {
void apply() const {
platform::CPUPlace cpu;
auto *data = tensor_->mutable_data<T>(cpu);
std::transform(value_.data(), value_.data() + tensor_->numel(), data,
......
......@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_gru_op.h"
#include <cstring> // for memcpy
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"Input(WeightH) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
"Output(BatchedGate) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
"Output(BatchResetHiddenPrev) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Output(BatchedHidden) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
"Output(BatchedOut) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of GRU should not be null.");
......@@ -83,12 +80,16 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
}
framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedHidden", out_dims);
ctx->SetOutputDim("BatchResetHiddenPrev", out_dims);
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims);
ctx->ShareLoD("X", "Hidden");
int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1];
} else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
}
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX");
}
......@@ -115,22 +116,29 @@ void FusionGRUOpMaker::Make() {
"(Tensor) The FC weight with shape (M x 3D),"
"where M is the dim size of x, D is the hidden size. ");
AddInput("WeightH",
"(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. ");
"(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
"This weight is not exactly D x 3D as: {W_update, W_reset, W_state}"
"Acutally they are D x 2D and D x D two part weights."
"{W_update, W_reset; W_state}"
"{D x (D + D); D x D}");
AddInput("Bias",
"(Tensor, optional) (1 x 3D)."
"Almost same as GRUOp."
"Note: if have FC bias it should be added on this bias.")
.AsDispensable();
AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.")
.AsIntermediate();
AddOutput("XX",
"(LoDTensor) the result after X * WeightX (size is T x 4D)"
"(LoDTensor) the result after X * WeightX (size is T x 3D)"
" or batched_X (size is T x M), this will be automatically chosen,"
" where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input.")
.AsIntermediate();
AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate();
AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x 3D) Same as GRUOp.")
AddOutput("BatchedInput",
"(LoDTensor) This is the batched result of input X"
"or the batched result after fc, shape (T x 3D)")
.AsIntermediate();
AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.")
AddOutput("BatchedOut", "(LoDTensor) (T X D) save batched hidden.")
.AsIntermediate();
AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp");
AddAttr<std::string>("activation",
......@@ -146,6 +154,10 @@ void FusionGRUOpMaker::Make() {
"(bool, defalut: False) "
"whether to compute reversed GRU.")
.SetDefault(false);
AddAttr<bool>("use_seq",
"(bool, defalut: True) "
"whether to use seq mode to compute GRU.")
.SetDefault(true);
AddComment(R"DOC(
The Fusion complete GRU Operator.
This operator fuse the fully-connected operator into GRU,
......@@ -153,172 +165,261 @@ more details can refer to GRU op.
)DOC");
}
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
template <typename DeviceContext, typename T>
template <typename T>
class FusionGRUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
if (ctx.Attr<bool>("use_seq")) {
SeqCompute(ctx);
} else {
BatchCompute(ctx);
}
}
#define INIT_VEC_FUNC \
std::function<void(const int, const T *, T *)> act_gate, act_state; \
std::function<void(const int, const T*, const T*, const T*, T*)> cross; \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_state_str = ctx.Attr<std::string>("activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_state = act_functor(act_state_str); \
cross = math::vec_cross<T, platform::jit::avx>; \
} else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_state = act_functor(act_state_str); \
cross = math::vec_cross<T, platform::jit::isa_any>; \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse");
#define INIT_BASE_SIZES \
auto x_dims = x->dims(); /* T x M*/ \
auto wh_dims = wh->dims(); /* D x 3D*/ \
const int total_T = x_dims[0]; \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D3 = wh_dims[1]; \
const int D2 = D * 2;
void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
auto* bias = ctx.Input<Tensor>("Bias");
auto* h0 = ctx.Input<Tensor>("H0");
auto* xx = ctx.Output<LoDTensor>("XX");
auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
auto* batch_reset_hidden_prev =
ctx.Output<LoDTensor>("BatchResetHiddenPrev");
auto* batch_hidden = ctx.Output<LoDTensor>("BatchedHidden");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
bool is_reverse = ctx.Attr<bool>("is_reverse");
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
auto x_lod = x->lod();
const int N = x_lod[0].size() - 1;
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
const T* wh_state_data = wh_data + D * D2;
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
batch_reset_hidden_prev->mutable_data<T>(ctx.GetPlace());
batch_hidden->mutable_data<T>(ctx.GetPlace());
hidden_out->mutable_data<T>(ctx.GetPlace());
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
xx_data,
bias ? bias->data<T>() : nullptr);
int xx_offset = D3;
int gate_offset = D;
if (is_reverse) {
const int offset = (total_T - 1) * D;
xx_data = xx_data + offset * 3;
hidden_out_data = hidden_out_data + offset;
xx_offset = -D3;
gate_offset = -D;
}
auto move_step = [&]() {
xx_data = xx_data + xx_offset;
hidden_out_data = hidden_out_data + gate_offset;
};
for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i;
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
const T* prev_hidden_data = nullptr;
int tstart = 0;
if (h0_data) {
prev_hidden_data = h0_data + bid * D;
} else {
// W: {W_update, W_reset; W_state}
// update gate
act_gate(D, xx_data, xx_data);
// state gate
act_state(D, xx_data + D2, xx_data + D2);
// out = a*b
blas.VMUL(D, xx_data, xx_data + D2, hidden_out_data);
// save prev
prev_hidden_data = hidden_out_data;
tstart = 1;
move_step();
}
for (int step = tstart; step < seq_len; ++step) {
// gemm prev * (Wu + Wr)
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
D3);
act_gate(D2, xx_data, xx_data);
// rt = rt*ht_1 inplace result
blas.VMUL(D, prev_hidden_data, xx_data + D, hidden_out_data);
// gemm rt * Ws
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
xx_data + D2, D3);
act_state(D, xx_data + D2, xx_data + D2);
// out = zt*ht~ + (1-zt)*ht_1
cross(D, xx_data, xx_data + D2, prev_hidden_data, hidden_out_data);
// save prev
prev_hidden_data = hidden_out_data;
move_step();
}
}
}
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
if (x->lod()[0].size() == 2) {
SeqCompute(ctx);
return;
}
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
auto* batched_out = ctx.Output<LoDTensor>("BatchedOut");
const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
auto x_dims = x->dims();
auto wx_dims = wx->dims();
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* batched_input_data = batched_input->mutable_data<T>(ctx.GetPlace());
T* batched_out_data = batched_out->mutable_data<T>(ctx.GetPlace());
hidden_out->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
if (x_dims[1] > wx_dims[1]) {
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
x_data, wx_data, xx_data,
bias ? bias->data<T>() : NULL);
to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
if (M > D3) {
math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
xx_data,
bias ? bias->data<T>() : nullptr);
to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
} else {
to_batch(dev_ctx, *x, xx, true, is_reverse);
batched_gate->set_lod(xx->lod());
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
xx_data, wx_data, batched_gate_data,
bias ? bias->data<T>() : NULL);
batched_input->set_lod(xx->lod());
math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, xx_data, wx_data,
batched_input_data,
bias ? bias->data<T>() : nullptr);
}
int frame_size = static_cast<int>(wx_dims[1] / 3);
math::GRUMetaValue<T> gru_value;
gru_value.gate_weight = const_cast<T*>(wh_data);
gru_value.state_weight =
const_cast<T*>(wh_data + 2 * frame_size * frame_size);
Tensor ordered_h0;
framework::Vector<size_t> order(batched_gate->lod()[2]);
auto batched_lod = batched_input->lod();
const auto& seq_order = batched_lod[2];
const int max_bs = seq_order.size();
reordered_h0->Resize({max_bs, D});
int tstart = 0;
T* prev_hidden_data = nullptr;
if (h0) {
ReorderInitState<DeviceContext, T>(
ctx.template device_context<DeviceContext>(), *h0, order, &ordered_h0,
true);
gru_value.prev_out_value = ordered_h0.data<T>();
// reorder h0
T* reordered_h0_data = reordered_h0->mutable_data<T>(ctx.GetPlace());
const T* h0_data = h0->data<T>();
prev_hidden_data = reordered_h0_data;
size_t sz = sizeof(T) * D;
for (int i = 0; i < max_bs; ++i) {
std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
reordered_h0_data += D;
}
} else {
gru_value.prev_out_value = nullptr;
// compute without h0
T* cur_in_data = batched_input_data;
T* cur_out_data = batched_out_data;
// W: {W_update, W_reset; W_state}
for (int i = 0; i < max_bs; ++i) {
// update gate
act_gate(D, cur_in_data, cur_in_data);
// state gate
act_state(D, cur_in_data + D2, cur_in_data + D2);
// out = a*b
blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data);
// add offset
cur_in_data += D3;
cur_out_data += D;
}
tstart = 1;
prev_hidden_data = batched_out_data;
}
auto batch_starts = batched_gate->lod()[0];
size_t seq_len = batch_starts.size() - 1;
auto active_node =
math::detail::GetActivationType(ctx.Attr<std::string>("activation"));
auto active_gate = math::detail::GetActivationType(
ctx.Attr<std::string>("gate_activation"));
#ifdef PADDLE_WITH_MKLML
// use MKL packed to speedup GEMM
if (FLAGS_paddle_num_threads >= 4) {
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
frame_size * 2 /*width of weight*/,
frame_size /*height of height*/);
PADDLE_ENFORCE(packed_gate);
blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
packed_gate);
T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
frame_size /*width of weight*/,
frame_size /*height of height*/);
PADDLE_ENFORCE(packed_state);
blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
frame_size, T(1.0), gru_value.state_weight, frame_size,
packed_state);
for (size_t n = 0; n < seq_len; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batched_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.output_value = hidden_t.data<T>();
gru_value.gate_value = gate_t.data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
if (gru_value.prev_out_value) {
blas.GEMM_COMPUTE(
CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
frame_size, gru_value.prev_out_value, frame_size, packed_gate,
frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
}
math::detail::forward_reset_output(
math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
cur_batch_size, active_gate);
if (gru_value.prev_out_value) {
blas.GEMM_COMPUTE(
CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
gru_value.reset_output_value, frame_size, packed_state,
frame_size, T(1), gru_value.gate_value + frame_size * 2,
frame_size * 3);
}
math::detail::forward_final_output(
math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
cur_batch_size, active_node);
gru_value.prev_out_value = gru_value.output_value;
// Then start from next
const T* wh_state_data = wh_data + D * D2;
const auto& batch_starts = batched_lod[0];
const int max_seq_len = batch_starts.size() - 1;
batched_input_data = batched_input_data + tstart * max_bs * D3;
batched_out_data = batched_out_data + tstart * max_bs * D;
for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step];
// gemm prev * (Wu + Wr)
blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D2, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D2, static_cast<T>(1),
batched_input_data, D3);
T* cur_batched_data = batched_input_data;
T* cur_out_data = batched_out_data;
T* cur_prev_hidden_data = prev_hidden_data;
for (int i = 0; i < cur_bs; ++i) {
act_gate(D2, cur_batched_data, cur_batched_data);
// rt = rt*ht_1 inplace result
blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D, cur_out_data);
cur_batched_data += D3;
cur_prev_hidden_data += D;
cur_out_data += D;
}
blas.GEMM_FREE(packed_gate);
blas.GEMM_FREE(packed_state);
} else {
#endif
for (size_t n = 0; n < seq_len; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batched_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.output_value = hidden_t.data<T>();
gru_value.gate_value = gate_t.data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
math::GRUUnitFunctor<DeviceContext, T>::compute(
dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
active_gate);
gru_value.prev_out_value = gru_value.output_value;
cur_batched_data = batched_input_data;
cur_out_data = batched_out_data;
blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D, D, static_cast<T>(1),
cur_out_data, D, wh_state_data, D, static_cast<T>(1),
cur_batched_data + D2, D3);
cur_prev_hidden_data = prev_hidden_data;
for (int i = 0; i < cur_bs; ++i) {
// ht~ = act_state(...)
act_state(D, cur_batched_data + D2, cur_batched_data + D2);
// out = zt*ht~ + (1-zt)*ht_1
cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data,
cur_out_data);
cur_batched_data += D3;
cur_prev_hidden_data += D;
cur_out_data += D;
}
#ifdef PADDLE_WITH_MKLML
prev_hidden_data = batched_out_data;
batched_out_data = cur_out_data;
batched_input_data = cur_batched_data;
}
#endif
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batch_hidden->set_lod(batched_gate->lod());
to_seq(dev_ctx, *batch_hidden, hidden_out);
batched_out->set_lod(batched_lod);
to_seq(dev_ctx, *batched_out, hidden_out);
}
#undef INIT_VEC_FUNC
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
};
} // namespace operators
......@@ -327,6 +428,5 @@ class FusionGRUKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(
fusion_gru, ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, float>,
ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel<float>,
ops::FusionGRUKernel<double>);
......@@ -16,14 +16,10 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_bool(seq_mode, true, "Use sequence mode");
namespace paddle {
namespace operators {
......@@ -42,10 +38,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
"Output(Hidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
"Output(BatchedGate) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
"Output(BatchedGate) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Output(BatchedHidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
"Output(BatchedCell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
"Output(ReorderedC0) of LSTM should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
......@@ -97,13 +99,14 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("Cell", out_dims);
ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchCellPreAct", out_dims);
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedHidden", out_dims);
ctx->SetOutputDim("BatchedCell", out_dims);
ctx->ShareLoD("X", "Hidden");
ctx->ShareLoD("X", "Cell");
int xx_width;
if (FLAGS_seq_mode) {
if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1];
} else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
......@@ -169,9 +172,11 @@ void FusionLSTMOpMaker::Make() {
" where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input.")
.AsIntermediate();
AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate();
AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).")
.AsIntermediate();
AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate();
AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate();
AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
AddAttr<bool>("use_peepholes",
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections.")
......@@ -180,6 +185,10 @@ void FusionLSTMOpMaker::Make() {
"(bool, defalut: False) "
"whether to compute reversed LSTM.")
.SetDefault(false);
AddAttr<bool>("use_seq",
"(bool, defalut: True) "
"whether to use seq mode to compute.")
.SetDefault(true);
AddAttr<std::string>("gate_activation",
"(string, default: sigmoid)"
"The activation for input gate, forget gate and output "
......@@ -203,64 +212,60 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
)DOC");
}
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
// TODO(TJ): check mem copy perf
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
template <typename T>
class FuisonLSTMKernel : public framework::OpKernel<T> {
public:
#define INIT_VEC_FUNC \
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
} else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* x = ctx.Input<LoDTensor>("X"); \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* c0 = ctx.Input<Tensor>("C0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
bool is_reverse = ctx.Attr<bool>("is_reverse");
#define INIT_BASE_SIZES \
auto x_dims = x->dims(); /* T x M*/ \
auto wh_dims = wh->dims(); /* D x 4D*/ \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const int D3 = D * 3; \
const int D4 = wh_dims[1];
void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* h0 = ctx.Input<Tensor>("H0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
auto* bias = ctx.Input<Tensor>("Bias");
auto* xx = ctx.Output<LoDTensor>("XX");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
auto* cell_out = ctx.Output<LoDTensor>("Cell");
bool is_reverse = ctx.Attr<bool>("is_reverse");
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
}
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
auto x_lod = x->lod();
auto x_dims = x->dims(); // T x M
auto wh_dims = wh->dims(); // D x 4D
const int total_T = x_dims[0];
const int N = x_lod[0].size() - 1; // batch size
const int M = x_dims[1]; // x frame size
const int D = wh_dims[0];
const int D2 = D * 2;
const int D3 = D * 3;
const int D4 = wh_dims[1];
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : NULL;
const T* c0_data = c0 ? c0->data<T>() : NULL;
const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* c0_data = c0 ? c0->data<T>() : nullptr;
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
......@@ -290,12 +295,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i;
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
const T* prev_cell_data = NULL;
const T* prev_hidden_data = NULL;
const T* prev_c_data = nullptr;
const T* prev_h_data = nullptr;
int tstart = 0;
if (h0_data) {
prev_hidden_data = h0_data + bid * D;
prev_cell_data = c0_data + bid * D;
prev_h_data = h0_data + bid * D;
prev_c_data = c0_data + bid * D;
} else {
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
......@@ -307,23 +312,22 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
prev_h_data = hidden_out_data;
prev_c_data = cell_out_data;
tstart = 1;
move_step();
}
for (int step = tstart; step < seq_len; ++step) {
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D4, static_cast<T>(1), xx_data,
D4);
prev_h_data, D, wh_data, D4, static_cast<T>(1), xx_data, D4);
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// a = forget * prev_cell
blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2);
// b = input * tilde
blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
......@@ -336,8 +340,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
prev_h_data = hidden_out_data;
prev_c_data = cell_out_data;
move_step();
}
......@@ -346,143 +350,155 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden_t0 = ctx.Input<Tensor>("H0");
auto* cell_t0 = ctx.Input<Tensor>("C0");
auto* xx = ctx.Output<LoDTensor>("XX");
auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
auto* cell_out = ctx.Output<LoDTensor>("Cell");
bool is_reverse = ctx.Attr<bool>("is_reverse");
INIT_BASE_INPUT_OUTPUT
if (x->lod()[0].size() == 2) {
SeqCompute(ctx);
return;
}
INIT_BASE_SIZES
INIT_VEC_FUNC
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
hidden_out->mutable_data<T>(ctx.GetPlace());
cell_out->mutable_data<T>(ctx.GetPlace());
auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>();
auto x_dims = x->dims();
auto wx_dims = wx->dims();
const T* wh_data = wh->data<T>();
auto place = ctx.GetPlace();
T* xx_data = xx->mutable_data<T>(place);
T* batched_input_data = batched_input->mutable_data<T>(place);
T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
T* batched_h_out_data = batched_h_out->mutable_data<T>(place);
hidden_out->mutable_data<T>(place);
cell_out->mutable_data<T>(place);
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
if (x_dims[1] > wx_dims[1]) {
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
x_data, wx_data, xx_data,
bias->data<T>());
to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
if (M > D4) {
math::FCCompute<DeviceContext, T>(blas, x_dims[0], D4, M, x_data, wx_data,
xx_data, bias->data<T>());
to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
} else {
to_batch(dev_ctx, *x, xx, true, is_reverse);
batched_gate->set_lod(xx->lod());
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
xx_data, wx_data, batched_gate_data,
batched_input->set_lod(xx->lod());
math::FCCompute<DeviceContext, T>(blas, x_dims[0], D4, M, xx_data,
wx_data, batched_input_data,
bias->data<T>());
}
int frame_size = static_cast<int>(wx_dims[1] / 4);
framework::DDim out_dims({x_dims[0], frame_size});
math::LstmMetaValue<T> lstm_value;
// no peephole
lstm_value.check_ig = nullptr;
lstm_value.check_fg = nullptr;
lstm_value.check_og = nullptr;
lstm_value.prev_state_value = nullptr;
Tensor ordered_c0;
framework::Vector<size_t> order(batched_gate->lod()[2]);
if (cell_t0) {
// Since the batch computing for LSTM reorders the input sequence
// according to their length. The initialized cell state also needs
// to reorder.
ReorderInitState<DeviceContext, T>(dev_ctx, *cell_t0, order, &ordered_c0,
true);
lstm_value.prev_state_value = ordered_c0.data<T>();
auto batched_lod = batched_input->lod();
const auto& seq_order = batched_lod[2];
const int max_bs = seq_order.size();
reordered_h0->Resize({max_bs, D});
reordered_c0->Resize({max_bs, D});
int tstart = 0;
T* prev_h_data = nullptr;
T* prev_c_data = nullptr;
if (h0) {
// reorder h0, c0
T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
const T* h0_data = h0->data<T>();
const T* c0_data = c0->data<T>();
prev_h_data = reordered_h0_data;
prev_c_data = reordered_c0_data;
size_t sz = sizeof(T) * D;
for (int i = 0; i < max_bs; ++i) {
std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz);
reordered_h0_data += D;
reordered_c0_data += D;
}
} else {
// compute without h0, c0
T* cur_in_data = batched_input_data;
T* cur_h_out_data = batched_h_out_data;
T* cur_c_out_data = batched_c_out_data;
// W_ch, W_ih, W_fh, W_oh
for (int i = 0; i < max_bs; ++i) {
act_gate(D3, cur_in_data + D, cur_in_data + D);
act_cand(D, cur_in_data, cur_in_data);
// cell out= input*tilde
blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cur_c_out_data, cur_in_data + D2);
blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
// add offset
cur_in_data += D4;
cur_c_out_data += D;
cur_h_out_data += D;
}
tstart = 1;
prev_h_data = batched_h_out_data;
prev_c_data = batched_c_out_data;
}
// Then start from next
const auto& batch_starts = batched_lod[0];
const int max_seq_len = batch_starts.size() - 1;
const int offset = tstart * max_bs * D;
batched_input_data = batched_input_data + offset * 4;
batched_h_out_data = batched_h_out_data + offset;
batched_c_out_data = batched_c_out_data + offset;
for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step];
blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast<T>(1),
prev_h_data, D, wh_data, D4, static_cast<T>(1),
batched_input_data, D4);
T* cur_in_data = batched_input_data;
T* cur_prev_c_data = prev_c_data;
T* cur_c_out_data = batched_c_out_data;
T* cur_h_out_data = batched_h_out_data;
for (int i = 0; i < cur_bs; ++i) {
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, cur_in_data + D, cur_in_data + D);
act_cand(D, cur_in_data, cur_in_data);
// a = forget * prev_cell
blas.VMUL(D, cur_in_data + D2, cur_prev_c_data, cur_in_data + D2);
// b = input * tilde
blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D);
// cell out= a+b
blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cur_c_out_data, cur_in_data + D2);
blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
// Use the local variable as here.
LoDTensor batch_hidden, batch_cell;
auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
batch_cell.mutable_data<T>(out_dims, ctx.GetPlace());
batch_cell_pre_act->mutable_data<T>(out_dims, ctx.GetPlace());
auto batch_starts = batched_gate->lod()[0];
size_t max_seq_len = batch_starts.size() - 1;
auto gate_act = math::detail::GetActivationType(
ctx.Attr<std::string>("gate_activation"));
auto cell_act = math::detail::GetActivationType(
ctx.Attr<std::string>("cell_activation"));
auto cand_act = math::detail::GetActivationType(
ctx.Attr<std::string>("candidate_activation"));
for (size_t n = 0; n < max_seq_len; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
Tensor gate_t = batched_gate->Slice(bstart, bend);
Tensor out_t = batch_hidden.Slice(bstart, bend);
Tensor cell_t = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
int cur_batch_size = bend - bstart;
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size;
auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
// TODO(TJ): use gemm directly
blas.MatMul(pre_hidden_t, false, *wh, false, static_cast<T>(1.0),
&gate_t, static_cast<T>(1.0));
} else if (hidden_t0) {
// TODO(TJ): move h0 outside for
// If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skiped.
// If n == 0 and there is initialized hidden state, calculate W_h * H0.
// Since the batch computing for LSTM reorders the input sequence
// according to their length. The initialized hidden state also needs
// to reorder.
Tensor ordered_h0;
ReorderInitState<DeviceContext, T>(dev_ctx, *hidden_t0, order,
&ordered_h0, true);
// TODO(TJ): use gemm directly
blas.MatMul(ordered_h0, false, *wh, false, static_cast<T>(1.0), &gate_t,
static_cast<T>(1.0));
cur_in_data += D4;
cur_prev_c_data += D;
cur_c_out_data += D;
cur_h_out_data += D;
}
lstm_value.gate_value = gate_t.data<T>();
lstm_value.output_value = out_t.data<T>();
lstm_value.state_value = cell_t.data<T>();
lstm_value.state_active_value = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<DeviceContext, T>::compute(
dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act,
cand_act);
lstm_value.prev_state_value = lstm_value.state_value;
prev_c_data = batched_c_out_data;
prev_h_data = batched_h_out_data;
batched_c_out_data = cur_c_out_data;
batched_h_out_data = cur_h_out_data;
batched_input_data = cur_in_data;
}
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batch_hidden.set_lod(batched_gate->lod());
// restore the output hidden in LoDTensor from the batch hidden
to_seq(dev_ctx, batch_hidden, hidden_out);
batch_cell.set_lod(batched_gate->lod());
// restore the output cell state in LoDTensor from the batch cell
to_seq(dev_ctx, batch_cell, cell_out);
batched_h_out->set_lod(batched_lod);
to_seq(dev_ctx, *batched_h_out, hidden_out);
batched_c_out->set_lod(batched_lod);
to_seq(dev_ctx, *batched_c_out, cell_out);
}
void Compute(const framework::ExecutionContext& ctx) const override {
if (FLAGS_seq_mode) {
if (ctx.Attr<bool>("use_seq")) {
SeqCompute(ctx);
} else {
BatchCompute(ctx);
}
}
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_VEC_FUNC
};
} // namespace operators
......
......@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> {
gate_data, frame_size * 3);
// calculate activited gate
Eigen::array<int, 2> extents({{batch_size, frame_size}});
Eigen::array<int, 2> u_offsets({{0, 0}});
Eigen::array<int, 2> extents = {batch_size, frame_size};
Eigen::array<int, 2> u_offsets = {0, 0};
ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(u_offsets, extents), g.slice(u_offsets, extents));
auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets({{0, frame_size}});
Eigen::array<int, 2> r_offsets = {0, frame_size};
ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(r_offsets, extents), g.slice(r_offsets, extents));
auto r = g.slice(r_offsets, extents); // reset gate
......@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
weight_data + frame_size * frame_size * 2, frame_size, 1,
gate_data + frame_size * 2, frame_size * 3);
Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
Eigen::array<int, 2> c_offsets = {0, frame_size * 2};
ActCompute(context.Attr<int>("activation"), place,
g.slice(c_offsets, extents), g.slice(c_offsets, extents));
auto c = g.slice(c_offsets, extents); // output candidate
......@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
Eigen::array<int, 2> extents({{batch_size, frame_size}});
Eigen::array<int, 2> u_offsets({{0, 0}});
Eigen::array<int, 2> extents = {batch_size, frame_size};
Eigen::array<int, 2> u_offsets = {0, 0};
auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets({{0, frame_size}});
Eigen::array<int, 2> r_offsets = {0, frame_size};
auto r = g.slice(r_offsets, extents); // reset gate
Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
Eigen::array<int, 2> c_offsets = {0, frame_size * 2};
auto c = g.slice(c_offsets, extents); // output candidate
// backward for unactivated update gate
......
......@@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel<T> {
auto dist = framework::EigenVector<T>::Flatten(*dist_t);
out.device(dev) =
static_cast<T>(1 - epsilon) * in +
epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
static_cast<T>(epsilon) *
dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
} else {
out.device(dev) = static_cast<T>(1 - epsilon) * in +
static_cast<T>(epsilon / label_dim);
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace operators {
......
if (NOT WIN32)
add_subdirectory(detail)
endif(NOT WIN32)
function(math_library TARGET)
# math_library is a function to create math library.
......@@ -38,9 +40,13 @@ math_library(context_project DEPS im2col math_function)
math_library(cross_entropy)
math_library(cos_sim_functor)
math_library(depthwise_conv)
math_library(gru_compute DEPS activation_functions math_function)
math_library(im2col)
if (NOT WIN32) # windows do not support avx functions yet.
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
endif (NOT WIN32)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
math_library(math_function DEPS blas)
math_library(maxouting)
......@@ -51,7 +57,9 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
if (NOT WIN32)
math_library(matrix_bit_code)
endif (NOT WIN32)
math_library(unpooling)
math_library(vol2col)
......
......@@ -132,6 +132,121 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
vec_scal<float, platform::jit::avx2>(n, a, x, y);
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = a - x[i];
}
}
template <>
inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
const float* x, float* y) {
#ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK;
if (n < block) {
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_sub_ps(bias, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
y[i] = a - x[i];
}
#else
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
const float* x, float* y) {
vec_bias_sub<float, platform::jit::avx>(n, a, x, y);
}
template <>
inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
}
// out = x*y + (1-x)*z
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
for (int i = 0; i < n; ++i) {
out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
}
}
template <>
inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
const float* y, const float* z,
float* out) {
#ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK;
if (n < block) {
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(1.f);
__m256 tmpx, tmpy, tmpz;
for (i = 0; i < end; i += block) {
tmpx = _mm256_loadu_ps(x + i);
tmpy = _mm256_loadu_ps(y + i);
tmpz = _mm256_loadu_ps(z + i);
tmpy = _mm256_mul_ps(tmpx, tmpy);
tmpx = _mm256_sub_ps(bias, tmpx);
tmpz = _mm256_mul_ps(tmpx, tmpz);
tmpz = _mm256_add_ps(tmpy, tmpz);
_mm256_storeu_ps(out + i, tmpz);
}
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
}
#else
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
#endif
}
template <>
inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
const float* y,
const float* z, float* out) {
vec_cross<float, platform::jit::avx>(n, x, y, z, out);
}
template <>
inline void vec_cross<float, platform::jit::avx512_common>(
const int n, const float* x, const float* y, const float* z, float* out) {
// TODO(TJ): enable me
vec_cross<float, platform::jit::avx>(n, x, y, z, out);
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
......
......@@ -55,7 +55,7 @@ struct TensorSetConstantCPU {
TensorSetConstantCPU(framework::Tensor* tensor, float value)
: tensor_(tensor), value_(value) {}
template <typename T>
void operator()() const {
void apply() const {
auto cpu = platform::CPUPlace();
auto* begin = tensor_->mutable_data<T>(cpu);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
......
......@@ -52,7 +52,7 @@ struct TensorSetConstantGPU {
: context_(context), tensor_(tensor), value_(value) {}
template <typename T>
void operator()() const {
void apply() const {
SetConstant<platform::CUDADeviceContext, T> functor;
functor(reinterpret_cast<const platform::CUDADeviceContext&>(context_),
tensor_, static_cast<T>(value_));
......
......@@ -17,6 +17,11 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#endif // _WIN32
namespace paddle {
namespace operators {
namespace math {
......@@ -55,12 +60,38 @@ namespace math {
* FindLastSet(x) = 1 + \floor*{\log_{2}x}
* \f]
*/
#if !defined(_WIN32)
inline constexpr size_t FindLastSet(size_t x) {
return std::is_same<size_t, unsigned int>::value
? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
: (std::is_same<size_t, unsigned long>::value // NOLINT
? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
: (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
#else
// windows don't have built-in clz, ctz function
template <typename T>
inline int ctz(const T& value) {
DWORD trailing_zero = 0;
if (_BitScanForward(&trailing_zero, value)) {
return static_cast<int>(trailing_zero);
} else {
return static_cast<int>(0);
}
}
template <typename T>
inline int clz(const T& value) {
DWORD leadning_zero = 0;
if (_BitScanReverse(&leadning_zero, value)) {
return static_cast<int>(sizeof(T) * 8 - leadning_zero);
} else {
return static_cast<int>(0);
}
}
inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
#endif // !_WIN32
}
struct SimpleCode {
......
......@@ -16,13 +16,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
template <typename DeviceContext, typename T>
class MaxOutFunctor {
public:
......
......@@ -18,15 +18,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX \
__FLT_MAX__ // TODO(zcd) :It might need to be placed in another file, but I'm
// still wondering where to put it.
/*
* \brief Extracting simple operations from pooling.
* Both MaxPool and AvgPool need "initial", "compute" and "finalize"
......
......@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
// Calculate the start position of each batch.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// num_batch = 5,
// max_seqlen = 5,
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
......@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
// where 1 is the second sequence,
// 0 is the first sequence,
// 2 is the third sequence.
// The num_batch represents batch size after rearranging the
// The max_seqlen represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence.
paddle::framework::LoD batch_lods;
......@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
batch_lods.emplace_back(std::vector<size_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor
int num_batch = seq_info[0].length;
batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
int max_seqlen = seq_info[0].length;
batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
// batch_lods[2] is the sort order for the input LoDTensor.
......@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
size_t* batch_starts = batch_lods[0].data();
size_t* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0;
for (int n = 0; n < num_batch; n++) {
for (int n = 0; n < max_seqlen; n++) {
auto batch_id = static_cast<int>(batch_starts[n]);
for (size_t i = 0; i < seq_info.size(); ++i) {
int seq_len = seq_info[i].length;
......
......@@ -41,7 +41,7 @@ struct OneHotOpCUDAFunctor {
: in_(in), out_(out), depth_(depth), ctx_(ctx) {}
template <typename OutT>
void operator()() const {
void apply() const {
auto* p_in_data = in_->data<InT>();
auto numel = in_->numel();
auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
......
......@@ -31,7 +31,7 @@ struct OneHotOpFunctor {
: in_(in), out_(out), depth_(depth), ctx_(ctx) {}
template <typename OutT>
void operator()() const {
void apply() const {
auto* p_in_data = in_->data<InT>();
auto numel = in_->numel();
auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using framework::Tensor;
template <typename T>
void Pad2DConstNCHW(const T* in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left, T value,
T* out_data) {
for (int n = 0; n < num; ++n) {
for (int c = 0; c < channels; ++c) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
out_data[out_h * out_width + out_w] =
(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
? value
: in_data[in_h * in_width + in_w];
}
}
in_data += in_height * in_width;
out_data += out_height * out_width;
}
}
}
template <typename T>
void Pad2DConstNHWC(const T* in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left, T value,
T* out_data) {
for (int n = 0; n < num; ++n) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
const int out_index = (out_h * out_width + out_w) * channels;
if (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) {
for (int c = 0; c < channels; ++c) {
out_data[out_index + c] = value;
}
} else {
const int in_index = (in_h * in_width + in_w) * channels;
for (int c = 0; c < channels; ++c) {
out_data[out_index + c] = in_data[in_index + c];
}
}
}
}
in_data += in_height * in_width * channels;
out_data += out_height * out_width * channels;
}
}
template <typename T>
void Pad2DReflectNCHW(const T* in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left, T* out_data) {
for (int n = 0; n < num; ++n) {
for (int c = 0; c < channels; ++c) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
in_h = std::max(in_h, -in_h); // reflect by 0
in_h =
std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height
in_w = std::max(in_w, -in_w); // reflect by 0
in_w =
std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width
out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w];
}
}
in_data += in_height * in_width;
out_data += out_height * out_width;
}
}
}
template <typename T>
void Pad2DReflectNHWC(const T* in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left, T* out_data) {
for (int n = 0; n < num; ++n) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
const int out_index = (out_h * out_width + out_w) * channels;
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
in_h = std::max(in_h, -in_h);
in_h = std::min(in_h, 2 * in_height - in_h - 2);
in_w = std::max(in_w, -in_w);
in_w = std::min(in_w, 2 * in_width - in_w - 2);
const int in_index = (in_h * in_width + in_w) * channels;
for (int c = 0; c < channels; ++c) {
out_data[out_index + c] = in_data[in_index + c];
}
}
}
in_data += in_height * in_width * channels;
out_data += out_height * out_width * channels;
}
}
template <typename T>
void Pad2DEdgeNCHW(const T* in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width, const int pad_top,
const int pad_left, T* out_data) {
for (int n = 0; n < num; ++n) {
for (int c = 0; c < channels; ++c) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w];
}
}
in_data += in_height * in_width;
out_data += out_height * out_width;
}
}
}
template <typename T>
void Pad2DEdgeNHWC(const T* in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width, const int pad_top,
const int pad_left, T* out_data) {
for (int n = 0; n < num; ++n) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
const int out_index = (out_h * out_width + out_w) * channels;
int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
const int in_index = (in_h * in_width + in_w) * channels;
for (int c = 0; c < channels; ++c) {
out_data[out_index + c] = in_data[in_index + c];
}
}
}
in_data += in_height * in_width * channels;
out_data += out_height * out_width * channels;
}
}
template <typename T>
void Pad2DGradConstNCHW(T* d_in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
for (int n = 0; n < num; ++n) {
for (int c = 0; c < channels; ++c) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
if (!(in_h < 0 || in_w < 0 || in_h >= in_height ||
in_w >= in_width)) {
d_in_data[in_h * in_width + in_w] =
d_out_data[out_h * out_width + out_w];
}
}
}
d_in_data += in_height * in_width;
d_out_data += out_height * out_width;
}
}
}
template <typename T>
void Pad2DGradConstNHWC(T* d_in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
for (int n = 0; n < num; ++n) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
const int out_index = (out_h * out_width + out_w) * channels;
if (!(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)) {
const int in_index = (in_h * in_width + in_w) * channels;
for (int c = 0; c < channels; ++c) {
d_in_data[in_index + c] = d_out_data[out_index + c];
}
}
}
}
d_in_data += in_height * in_width * channels;
d_out_data += out_height * out_width * channels;
}
}
template <typename T>
void Pad2DGradReflectNCHW(T* d_in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
for (int n = 0; n < num; ++n) {
for (int c = 0; c < channels; ++c) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
in_h = std::max(in_h, -in_h); // reflect over 0
in_h = std::min(in_h,
2 * in_height - in_h - 2); // reflect over in_height
in_w = std::max(in_w, -in_w); // reflect over 0
in_w =
std::min(in_w, 2 * in_width - in_w - 2); // reflect over in_width
d_in_data[in_h * in_width + in_w] +=
d_out_data[out_h * out_width + out_w];
}
}
d_in_data += in_height * in_width;
d_out_data += out_height * out_width;
}
}
}
template <typename T>
void Pad2DGradReflectNHWC(T* d_in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
for (int n = 0; n < num; ++n) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
const int out_index = (out_h * out_width + out_w) * channels;
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
in_h = std::max(in_h, -in_h);
in_h = std::min(in_h, 2 * in_height - in_h - 2);
in_w = std::max(in_w, -in_w);
in_w = std::min(in_w, 2 * in_width - in_w - 2);
const int in_index = (in_h * in_width + in_w) * channels;
for (int c = 0; c < channels; ++c) {
d_in_data[in_index + c] += d_out_data[out_index + c];
}
}
}
d_in_data += in_height * in_width * channels;
d_out_data += out_height * out_width * channels;
}
}
template <typename T>
void Pad2DGradEdgeNCHW(T* d_in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
for (int n = 0; n < num; ++n) {
for (int c = 0; c < channels; ++c) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
d_in_data[in_h * in_width + in_w] +=
d_out_data[out_h * out_width + out_w];
}
}
d_in_data += in_height * in_width;
d_out_data += out_height * out_width;
}
}
}
template <typename T>
void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
for (int n = 0; n < num; ++n) {
for (int out_h = 0; out_h < out_height; ++out_h) {
for (int out_w = 0; out_w < out_width; ++out_w) {
const int out_index = (out_h * out_width + out_w) * channels;
int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
const int in_index = (in_h * in_width + in_w) * channels;
for (int c = 0; c < channels; ++c) {
d_in_data[in_index + c] += d_out_data[out_index + c];
}
}
}
d_in_data += in_height * in_width * channels;
d_out_data += out_height * out_width * channels;
}
}
template <typename T>
class Pad2dCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
auto mode = context.Attr<std::string>("mode");
auto data_format = context.Attr<std::string>("data_format");
T value = context.Attr<T>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
auto in_dims = x->dims();
auto out_dims = out->dims();
const T* in_data = x->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
const int pad_top = pads[0];
const int pad_left = pads[2];
const int num = in_dims[0];
if (data_format == "NCHW") {
const int channels = in_dims[1];
const int in_height = in_dims[2];
const int in_width = in_dims[3];
const int out_height = out_dims[2];
const int out_width = out_dims[3];
if (mode == "reflect") {
Pad2DReflectNCHW(in_data, num, channels, in_height, in_width,
out_height, out_width, pad_top, pad_left, out_data);
} else if (mode == "edge") {
Pad2DEdgeNCHW(in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, out_data);
} else {
Pad2DConstNCHW(in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, value, out_data);
}
} else {
const int channels = in_dims[3];
const int in_height = in_dims[1];
const int in_width = in_dims[2];
const int out_height = out_dims[1];
const int out_width = out_dims[2];
if (mode == "reflect") {
Pad2DReflectNHWC(in_data, num, channels, in_height, in_width,
out_height, out_width, pad_top, pad_left, out_data);
} else if (mode == "edge") {
Pad2DEdgeNHWC(in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, out_data);
} else {
Pad2DConstNHWC(in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, value, out_data);
}
}
}
};
template <typename T>
class Pad2dGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
auto mode = context.Attr<std::string>("mode");
auto data_format = context.Attr<std::string>("data_format");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
auto d_in_dims = d_in->dims();
auto d_out_dims = d_out->dims();
const T* d_out_data = d_out->data<T>();
T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
set_zero(context.template device_context<platform::CPUDeviceContext>(),
d_in, static_cast<T>(0));
const int pad_top = pads[0];
const int pad_left = pads[2];
const int num = d_in_dims[0];
if (data_format == "NCHW") {
const int channels = d_in_dims[1];
const int in_height = d_in_dims[2];
const int in_width = d_in_dims[3];
const int out_height = d_out_dims[2];
const int out_width = d_out_dims[3];
if (mode == "reflect") {
Pad2DGradReflectNCHW(d_in_data, num, channels, in_height, in_width,
out_height, out_width, pad_top, pad_left,
d_out_data);
} else if (mode == "edge") {
Pad2DGradEdgeNCHW(d_in_data, num, channels, in_height, in_width,
out_height, out_width, pad_top, pad_left, d_out_data);
} else {
Pad2DGradConstNCHW(d_in_data, num, channels, in_height, in_width,
out_height, out_width, pad_top, pad_left,
d_out_data);
}
} else {
const int channels = d_in_dims[3];
const int in_height = d_in_dims[1];
const int in_width = d_in_dims[2];
const int out_height = d_out_dims[1];
const int out_width = d_out_dims[2];
if (mode == "reflect") {
Pad2DGradReflectNHWC(d_in_data, num, channels, in_height, in_width,
out_height, out_width, pad_top, pad_left,
d_out_data);
} else if (mode == "edge") {
Pad2DGradEdgeNHWC(d_in_data, num, channels, in_height, in_width,
out_height, out_width, pad_top, pad_left, d_out_data);
} else {
Pad2DGradConstNHWC(d_in_data, num, channels, in_height, in_width,
out_height, out_width, pad_top, pad_left,
d_out_data);
}
}
}
};
class Pad2dOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of Pad2dOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of Pad2dOp should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
PADDLE_ENFORCE_EQ(x_dim.size(), 4,
"Size of paddings should be equal to 4.");
std::vector<int64_t> out_dims(x_dim.size());
auto data_format = ctx->Attrs().Get<std::string>("data_format");
out_dims[0] = x_dim[0];
if (data_format == "NCHW") {
out_dims[1] = x_dim[1];
out_dims[2] = x_dim[2] + paddings[0] + paddings[1]; // height
out_dims[3] = x_dim[3] + paddings[2] + paddings[3]; // width
} else { // NHWC
out_dims[3] = x_dim[3];
out_dims[1] = x_dim[1] + paddings[0] + paddings[1];
out_dims[2] = x_dim[2] + paddings[2] + paddings[3];
}
ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
if (out_dims[0] == x_dim[0]) {
// Only pass LoD when the first dimension is equal between
// output and input.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
};
class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input of pad2d op. "
"The input should be a 4-D tensor with formate NCHW or NHWC.");
AddOutput("Out",
"The output of pad2d op. "
"A tensor with the same shape as X.");
AddAttr<std::vector<int>>(
"paddings",
"(vector<int>) "
"A list<int> to describe the padding rules."
"paddings=[0, 1, 2, 3] means "
"padding 0 row to top, 1 row to bottom, 2 columns to left "
"and 3 columns to right. Size of paddings must be 4.");
AddAttr<float>("pad_value",
"(float, default 0.0) "
"The value to fill the padded areas in constant mode.")
.SetDefault(0.0f);
AddAttr<std::string>("mode",
"(float, default constant) "
"Three modes: constant(default), reflect, edge.")
.SetDefault("constant");
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the input data.")
.SetDefault("NCHW");
AddComment(R"DOC(
Pad2d Operator.
Pad 2-d images accordding to 'paddings' and 'mode'.
If mode is 'reflect', paddings[0] and paddings[1] must be no greater
than height-1. And the width dimension has the same condition.
Given that X is a channel of image from input:
X = [[1, 2, 3],
[4, 5, 6]]
Case 0:
paddings = [0, 1, 2, 3],
mode = 'constant'
pad_value = 0
Out = [[0, 0, 1, 2, 3, 0, 0, 0]
[0, 0, 4, 5, 6, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0]]
Case 1:
paddings = [0, 1, 2, 1],
mode = 'reflect'
Out = [[3, 2, 1, 2, 3, 2]
[6, 5, 4, 5, 6, 5]
[3, 2, 1, 2, 3, 2]]
Case 2:
paddings = [0, 1, 2, 1],
mode = 'edge'
Out = [[1, 1, 1, 2, 3, 3]
[4, 4, 4, 5, 6, 6]
[4, 4, 4, 5, 6, 6]]
)DOC");
}
};
class Pad2dOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto x_dims = ctx->GetInputDim("X");
auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
}
}
};
class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* bind = new framework::OpDesc();
bind->SetInput("X", Input("X"));
bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
bind->SetAttrMap(Attrs());
bind->SetType("pad2d_grad");
return std::unique_ptr<framework::OpDesc>(bind);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker,
ops::Pad2dOpGradMaker);
REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad);
REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel<float>);
REGISTER_OP_CPU_KERNEL(pad2d_grad, ops::Pad2dGradCPUKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
using framework::Tensor;
template <typename T>
__global__ void Pad2DConstNCHW(const int nthreads, const T* in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left, T value,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int nc = index / out_width;
const int out_w = index % out_width;
const int out_h = nc % out_height;
nc /= out_height;
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
out_data[index] =
(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
? value
: in_data[(nc * in_height + in_h) * in_width + in_w];
}
}
template <typename T>
__global__ void Pad2DConstNHWC(const int nthreads, const T* in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left, T value,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index / channels;
const int c = index % channels;
const int out_w = n % out_width;
n /= out_width;
const int out_h = n % out_height;
n /= out_height;
const int in_h = out_h - pad_top;
const int in_w = out_w - pad_left;
out_data[index] =
(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
? value
: in_data[((n * in_height + in_h) * in_width + in_w) * channels +
c];
}
}
template <typename T>
__global__ void Pad2DReflectNCHW(const int nthreads, const T* in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int nc = index / out_width;
const int out_w = index % out_width;
const int out_h = nc % out_height;
nc /= out_height;
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
in_h = max(in_h, -in_h); // reflect by 0
in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height
in_w = max(in_w, -in_w); // reflect by 0
in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width
out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w];
}
}
template <typename T>
__global__ void Pad2DReflectNHWC(const int nthreads, const T* in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index / channels;
const int c = index % channels;
const int out_w = n % out_width;
n /= out_width;
const int out_h = n % out_height;
n /= out_height;
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
in_h = max(in_h, -in_h);
in_h = min(in_h, 2 * in_height - in_h - 2);
in_w = max(in_w, -in_w);
in_w = min(in_w, 2 * in_width - in_w - 2);
out_data[index] =
in_data[((n * in_height + in_h) * in_width + in_w) * channels + c];
}
}
template <typename T>
__global__ void Pad2DEdgeNCHW(const int nthreads, const T* in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int nc = index / out_width;
const int out_w = index % out_width;
const int out_h = nc % out_height;
nc /= out_height;
int in_h = min(in_height - 1, max(out_h - pad_top, 0));
int in_w = min(in_width - 1, max(out_w - pad_left, 0));
out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w];
}
}
template <typename T>
__global__ void Pad2DEdgeNHWC(const int nthreads, const T* in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index / channels;
const int c = index % channels;
const int out_w = n % out_width;
n /= out_width;
const int out_h = n % out_height;
n /= out_height;
int in_h = min(in_height - 1, max(out_h - pad_top, 0));
int in_w = min(in_width - 1, max(out_w - pad_left, 0));
out_data[index] =
in_data[((n * in_height + in_h) * in_width + in_w) * channels + c];
}
}
template <typename T>
__global__ void Pad2DGradConstNCHW(const int in_size, T* d_in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(in_index, in_size) {
int nc = in_index / in_width;
const int out_w = in_index % in_width + pad_left;
const int out_h = nc % in_height + pad_top;
nc /= in_height;
d_in_data[in_index] =
d_out_data[(nc * out_height + out_h) * out_width + out_w];
}
}
template <typename T>
__global__ void Pad2DGradConstNHWC(const int in_size, T* d_in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(in_index, in_size) {
int n = in_index / channels;
const int c = in_index % channels;
const int out_w = n % in_width + pad_left;
n /= in_width;
const int out_h = n % in_height + pad_top;
n /= in_height;
d_in_data[in_index] =
d_out_data[((n * out_height + out_h) * out_width + out_w) * channels +
c];
}
}
template <typename T>
__global__ void Pad2DGradReflectNCHW(const int out_size, T* d_in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(out_index, out_size) {
int nc = out_index / out_width;
const int out_w = out_index % out_width;
const int out_h = nc % out_height;
nc /= out_height;
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
in_h = max(in_h, -in_h);
in_w = max(in_w, -in_w);
in_h = min(in_h, 2 * in_height - in_h - 2);
in_w = min(in_w, 2 * in_width - in_w - 2);
atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
d_out_data[out_index]);
}
}
template <typename T>
__global__ void Pad2DGradReflectNHWC(const int out_size, T* d_in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(out_index, out_size) {
const int c = out_index % channels;
int n = out_index / channels;
const int out_w = n % out_width;
n /= out_width;
const int out_h = n % out_height;
n /= out_height;
int in_h = out_h - pad_top;
int in_w = out_w - pad_left;
in_h = max(in_h, -in_h);
in_w = max(in_w, -in_w);
in_h = min(in_h, in_height * 2 - in_h - 2);
in_w = min(in_w, in_width * 2 - in_w - 2);
atomicAdd(
&d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
d_out_data[out_index]);
}
}
template <typename T>
__global__ void Pad2DGradEdgeNCHW(const int out_size, T* d_in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(out_index, out_size) {
int nc = out_index / out_width;
const int out_w = out_index % out_width;
const int out_h = nc % out_height;
nc /= out_height;
const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
d_out_data[out_index]);
}
}
template <typename T>
__global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
const int num, const int channels,
const int in_height, const int in_width,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(out_index, out_size) {
const int c = out_index % channels;
int n = out_index / channels;
const int out_w = n % out_width;
n /= out_width;
const int out_h = n % out_height;
n /= out_height;
const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
atomicAdd(
&d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
d_out_data[out_index]);
}
}
template <typename T>
class Pad2dCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
auto mode = context.Attr<std::string>("mode");
auto data_format = context.Attr<std::string>("data_format");
T value = context.Attr<T>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
auto in_dims = x->dims();
auto out_dims = out->dims();
const T* in_data = x->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
const int pad_top = pads[0];
const int pad_left = pads[2];
const int num = in_dims[0];
auto stream = context.cuda_device_context().stream();
int block = PADDLE_CUDA_NUM_THREADS;
const int out_size = out->numel();
int grid = (out_size + block - 1) / block;
if (data_format == "NCHW") {
const int channels = in_dims[1];
const int in_height = in_dims[2];
const int in_width = in_dims[3];
const int out_height = out_dims[2];
const int out_width = out_dims[3];
if (mode == "reflect") {
Pad2DReflectNCHW<T><<<grid, block, 0, stream>>>(
out_size, in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, out_data);
} else if (mode == "edge") {
Pad2DEdgeNCHW<T><<<grid, block, 0, stream>>>(
out_size, in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, out_data);
} else {
Pad2DConstNCHW<T><<<grid, block, 0, stream>>>(
out_size, in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, value, out_data);
}
} else {
const int channels = in_dims[3];
const int in_height = in_dims[1];
const int in_width = in_dims[2];
const int out_height = out_dims[1];
const int out_width = out_dims[2];
if (mode == "reflect") {
Pad2DReflectNHWC<T><<<grid, block, 0, stream>>>(
out_size, in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, out_data);
} else if (mode == "edge") {
Pad2DEdgeNHWC<T><<<grid, block, 0, stream>>>(
out_size, in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, out_data);
} else {
Pad2DConstNHWC<T><<<grid, block, 0, stream>>>(
out_size, in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, value, out_data);
}
}
}
};
template <typename T>
class Pad2dGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
auto mode = context.Attr<std::string>("mode");
auto data_format = context.Attr<std::string>("data_format");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
auto d_in_dims = d_in->dims();
auto d_out_dims = d_out->dims();
const T* d_out_data = d_out->data<T>();
T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> set_zero;
set_zero(context.template device_context<platform::CUDADeviceContext>(),
d_in, static_cast<T>(0));
const int pad_top = pads[0];
const int pad_left = pads[2];
const int num = d_in_dims[0];
auto stream = context.cuda_device_context().stream();
int block = PADDLE_CUDA_NUM_THREADS;
const int out_size = d_out->numel();
const int in_size = d_in->numel();
int grid = (out_size + block - 1) / block;
if (data_format == "NCHW") {
const int channels = d_in_dims[1];
const int in_height = d_in_dims[2];
const int in_width = d_in_dims[3];
const int out_height = d_out_dims[2];
const int out_width = d_out_dims[3];
if (mode == "reflect") {
Pad2DGradReflectNCHW<T><<<grid, block, 0, stream>>>(
out_size, d_in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, d_out_data);
} else if (mode == "edge") {
Pad2DGradEdgeNCHW<T><<<grid, block, 0, stream>>>(
out_size, d_in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, d_out_data);
} else {
grid = (in_size + block - 1) / block;
Pad2DGradConstNCHW<T><<<grid, block, 0, stream>>>(
in_size, d_in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, d_out_data);
}
} else {
const int channels = d_in_dims[3];
const int in_height = d_in_dims[1];
const int in_width = d_in_dims[2];
const int out_height = d_out_dims[1];
const int out_width = d_out_dims[2];
if (mode == "reflect") {
Pad2DGradReflectNHWC<T><<<grid, block, 0, stream>>>(
out_size, d_in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, d_out_data);
} else if (mode == "edge") {
Pad2DGradEdgeNHWC<T><<<grid, block, 0, stream>>>(
out_size, d_in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, d_out_data);
} else {
grid = (in_size + block - 1) / block;
Pad2DGradConstNHWC<T><<<grid, block, 0, stream>>>(
in_size, d_in_data, num, channels, in_height, in_width, out_height,
out_width, pad_top, pad_left, d_out_data);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(pad2d, ops::Pad2dCUDAKernel<float>);
REGISTER_OP_CUDA_KERNEL(pad2d_grad, ops::Pad2dGradCUDAKernel<float>);
......@@ -38,10 +38,9 @@ class PReluKernel : public framework::OpKernel<T> {
auto dim = x->dims();
int index = 0;
int i = 0;
int temp = 0;
if (mode == "channel") {
int temp = numel / (dim[0] * dim[1]);
for (i = 0; i < numel; i++) {
temp = numel / (dim[0] * dim[1]);
index = (i / temp) % dim[1];
o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
}
......
......@@ -53,7 +53,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
static_cast<T>(context.Attr<float>("min")),
static_cast<T>(context.Attr<float>("max")));
std::vector<T> ids(batch_size);
std::vector<int64_t> ids(batch_size);
for (int i = 0; i < batch_size; ++i) {
T r = dist(engine);
int idx = width - 1;
......@@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
break;
}
}
ids[i] = ins_vector[idx];
ids[i] = int64_t(idx);
}
std::vector<int64_t> out_dim;
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
#include <sstream>
......@@ -23,40 +22,11 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace operators {
// TODO(sidgoyal78): These function are needed by other files (save_op), move
// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
if (mkdir(path, 0755)) {
PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
}
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
class SaveCombineOp : public framework::OperatorBase {
public:
SaveCombineOp(const std::string &type,
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
......@@ -25,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace operators {
......@@ -33,36 +33,6 @@ namespace operators {
// to directory specified.
constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
// TODO(yuyang18): If the functions below are needed by other files, move them
// to paddle::filesystem namespace.
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
if (mkdir(path, 0755)) {
PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
}
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
class SaveOp : public framework::OperatorBase {
public:
SaveOp(const std::string &type, const framework::VariableNameMap &inputs,
......
......@@ -99,7 +99,7 @@ struct SequenceMaskFunctor {
: ctx_(ctx), x_(x), y_(y), limits_(limits), maxlen_(maxlen) {}
template <typename Ty>
void operator()() const {
void apply() const {
auto *y_data = y_->mutable_data<Ty>(ctx_.GetPlace());
platform::ForRange<DeviceContext> for_range(ctx_, limits_);
for_range(SequenceMaskForRangeFunctor<Tx, Ty>(x_, y_data, maxlen_));
......
......@@ -62,7 +62,10 @@ class ShrinkRNNMemoryOp : public ArrayOp {
}
if (dst_num_rows != 0) {
out_tensor.ShareDataWith(x_tensor.Slice(0, height));
out_tensor.mutable_data(place, x_tensor.type());
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,
&out_tensor);
}
}
};
......
......@@ -16,6 +16,9 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/rw_lock.h"
#endif
namespace paddle {
namespace platform {
......@@ -142,7 +145,59 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
mutable unsigned int* semaphore_;
};
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
class CudnnHolder {
public:
CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place)
: workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) {
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
}
cudnnHandle_t cudnn_handle() const { return cudnn_handle_; }
void RunFunc(const std::function<void(void*)>& cudnn_func,
size_t required_workspace_len) {
std::lock_guard<std::mutex> lock(mtx_);
if (required_workspace_len > workspace_len_) {
ReallocateWorkspace(required_workspace_len);
}
cudnn_func(workspace_);
}
~CudnnHolder() {
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
if (workspace_ != nullptr) {
paddle::memory::Free(place_, workspace_);
}
}
private:
void ReallocateWorkspace(size_t required_workspace_len) {
if (required_workspace_len <= workspace_len_) {
return;
}
void* new_workspace = paddle::memory::Alloc(place_, required_workspace_len);
if (workspace_ != nullptr) {
// Maybe someone is using the current workspace
PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
paddle::memory::Free(place_, workspace_);
}
workspace_ = new_workspace;
workspace_len_ = required_workspace_len;
}
cudnnHandle_t cudnn_handle_;
void* workspace_;
size_t workspace_len_;
const cudaStream_t* stream_; // not owned;
const CUDAPlace place_;
std::mutex mtx_;
};
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: place_(place), cudnn_holder_(nullptr) {
SetDeviceId(place_.device);
compute_capability = GetCUDAComputeCapability(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device);
......@@ -154,10 +209,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
if (dynload::HasCUDNN()) {
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
} else {
cudnn_handle_ = nullptr;
cudnn_holder_.reset(new CudnnHolder(&stream_, place));
}
}
......@@ -165,9 +217,6 @@ CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
Wait();
PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
if (cudnn_handle_ != nullptr) {
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
}
eigen_stream_.reset();
eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
......@@ -196,7 +245,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const {
return cublas_handle_;
}
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
return cudnn_holder_->cudnn_handle();
}
void CUDADeviceContext::RunCudnnFuncWithWorkspace(
const std::function<void(void*)>& cudnn_func, size_t workspace_len) const {
cudnn_holder_->RunFunc(cudnn_func, workspace_len);
}
cudaStream_t CUDADeviceContext::stream() const { return stream_; }
......
......@@ -69,6 +69,7 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice;
class CudnnHolder;
class CUDADeviceContext : public DeviceContext {
public:
......@@ -96,6 +97,11 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return cudnn handle in the device context. */
cudnnHandle_t cudnn_handle() const;
/*! \brief Run a cudnn function with the workspace provided by
* CUDADeviceContext */
void RunCudnnFuncWithWorkspace(const std::function<void(void*)>& cudnn_func,
size_t workspace_len) const;
/*! \brief Return cuda stream in the device context. */
cudaStream_t stream() const;
......@@ -111,8 +117,8 @@ class CUDADeviceContext : public DeviceContext {
std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<CudnnHolder> cudnn_holder_;
cudaStream_t stream_;
cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_;
int compute_capability;
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cfloat>
// Disable the copy and assignment operator for a class.
#ifndef DISABLE_COPY_AND_ASSIGN
......@@ -23,3 +24,7 @@ limitations under the License. */
classname& operator=(const classname&) = delete; \
classname& operator=(classname&&) = delete
#endif
#if defined(__FLT_MAX__)
#define FLT_MAX __FLT_MAX__
#endif // __FLT_MAX__
......@@ -14,24 +14,141 @@
#pragma once
#include <cstdio>
#include <stdexcept>
#include <memory>
#include <string>
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#if !defined(_WIN32)
#include <dlfcn.h> // for dladdr
#include <execinfo.h> // for backtrace
#define UNUSED __attribute__((unused))
#include <dlfcn.h> // dladdr
#include <execinfo.h> // backtrace
#include <sys/stat.h>
#include <algorithm> // std::accumulate
#else
#include <Shlwapi.h>
#include <Windows.h>
#include <io.h> // _popen, _pclose
#include <windows.h>
#if defined(_WIN32)
#include <numeric> // std::accumulate in msvc
#endif
// windows version of __attribute__((unused))
#define UNUSED __pragma(warning(suppress : 4100))
static void* dlsym(void* handle, const char* symbol_name) {
#ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
#endif // S_ISDIR
static void *dlsym(void *handle, const char *symbol_name) {
FARPROC found_symbol;
found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
if (found_symbol == NULL) {
throw std::runtime_error(std::string(symbol_name) + " not found.");
}
return reinterpret_cast<void*>(found_symbol);
return reinterpret_cast<void *>(found_symbol);
}
#endif
static void *dlopen(const char *filename, int flag) {
std::string file_name(filename);
file_name.replace(0, file_name.size() - 1, '/', '\\');
HMODULE hModule = LoadLibrary(file_name.c_str());
if (!hModule) {
throw std::runtime_error(file_name + " not found.");
}
return reinterpret_cast<void *>(hModule);
}
#endif // !_WIN32
static void ExecShellCommand(const std::string &cmd, std::string *message) {
char buffer[128];
#if !defined(_WIN32)
std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
#else
std::shared_ptr<FILE> pipe(_popen(cmd.c_str(), "r"), _pclose);
#endif // _WIN32
if (!pipe) {
LOG(ERROR) << "error running command: " << cmd;
return;
}
while (!feof(pipe.get())) {
if (fgets(buffer, 128, pipe.get()) != nullptr) {
*message += buffer;
}
}
}
static bool PathExists(const std::string &path) {
#if !defined(_WIN32)
struct stat statbuf;
if (stat(path.c_str(), &statbuf) != -1) {
if (S_ISDIR(statbuf.st_mode)) {
return true;
}
}
#else
struct _stat statbuf;
if (_stat(path.c_str(), &statbuf) != -1) {
if (S_ISDIR(statbuf.st_mode)) {
return true;
}
}
#endif // !_WIN32
return false;
}
// TODO(yuyang18): If the functions below are needed by other files, move them
// to paddle::filesystem namespace.
#if !defined(_WIN32)
constexpr char kSEP = '/';
#else
constexpr char kSEP = '\\';
#endif // _WIN32
static bool FileExists(const std::string &filepath) {
#if !defined(_WIN32)
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
#else
struct _stat buffer;
return (_stat(filepath.c_str(), &buffer) == 0);
#endif // !_WIN32
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
std::string path_error(path);
path_error += " mkdir failed!";
#if !defined(_WIN32)
if (mkdir(path, 0755)) {
if (errno != EEXIST) {
throw std::runtime_error(path_error);
}
}
#else
CreateDirectory(path, NULL);
auto errorno = GetLastError();
if (errorno != ERROR_ALREADY_EXISTS) {
throw std::runtime_error(path_error);
}
#endif // !_WIN32
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
......@@ -98,10 +98,9 @@ class Inferencer(object):
raise ValueError(
"inputs should be a map of {'input_name': input_var}")
with executor.scope_guard(self.scope):
results = self.exe.run(self.inference_program,
feed=inputs,
fetch_list=[self.predict_var],
with self._prog_and_scope_guard():
results = self.exe.run(feed=inputs,
fetch_list=[self.predict_var.name],
return_numpy=return_numpy)
return results
......
......@@ -109,6 +109,7 @@ __all__ = [
'flatten',
'sequence_mask',
'stack',
'pad2d',
'unstack',
]
......@@ -5614,6 +5615,94 @@ def rank_loss(label, left, right, name=None):
return out
def pad2d(input,
paddings=[0, 0, 0, 0],
mode='constant',
pad_value=0.0,
data_format="NCHW",
name=None):
"""
Pad 2-d images accordding to 'paddings' and 'mode'.
If mode is 'reflect', paddings[0] and paddings[1] must be no greater
than height-1. And the width dimension has the same condition.
Example:
Given that X is a channel of image from input:
X = [[1, 2, 3],
[4, 5, 6]]
Case 0:
paddings = [0, 1, 2, 3],
mode = 'constant'
pad_value = 0
Out = [[0, 0, 1, 2, 3, 0, 0, 0]
[0, 0, 4, 5, 6, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0]]
Case 1:
paddings = [0, 1, 2, 1],
mode = 'reflect'
Out = [[3, 2, 1, 2, 3, 2]
[6, 5, 4, 5, 6, 5]
[3, 2, 1, 2, 3, 2]]
Case 2:
paddings = [0, 1, 2, 1],
mode = 'edge'
Out = [[1, 1, 1, 2, 3, 3]
[4, 4, 4, 5, 6, 6]
[4, 4, 4, 5, 6, 6]]
Args:
input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
paddings (tuple|list): The padding size. If padding is a tuple, it must
contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
Default: padding = [0, 0, 0, 0].
mode (str): Three modes: constant(default), reflect, edge. Default: constant
pad_value (float32): The value to fill the padded areas in constant mode. Default: 0
data_format (str): An optional string from: "NHWC", "NCHW". Specify the data format of
the input data.
Default: "NCHW"
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The tensor variable padded accordding to paddings and mode.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
result = fluid.layers.pad2d(input=data, padding=[1,2,3,4], mode='reflect')
"""
helper = LayerHelper('pad2d', **locals())
dtype = helper.input_dtype(input_param_name='input')
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='pad2d',
inputs={'X': input},
outputs={"Out": out},
attrs={
'paddings': paddings,
'mode': mode,
'pad_value': pad_value,
'data_frmat': data_format
})
return out
def prelu(x, mode, param_attr=None, name=None):
"""
Equation:
......@@ -5628,8 +5717,8 @@ def prelu(x, mode, param_attr=None, name=None):
all: all elements share same weight
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
......
......@@ -16,7 +16,9 @@ from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy
import os
import cifar10_small_test_set
......@@ -89,7 +91,7 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
def train(use_cuda, train_program, parallel, params_dirname):
BATCH_SIZE = 128
EPOCH_NUM = 1
......@@ -116,7 +118,10 @@ def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program, optimizer_func=optimizer_func, place=place)
train_func=train_program,
optimizer_func=optimizer_func,
place=place,
parallel=parallel)
trainer.train(
reader=train_reader,
......@@ -125,10 +130,13 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, params_dirname=None):
def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place)
infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -139,22 +147,34 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results)
def main(use_cuda):
def main(use_cuda, parallel):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "image_classification_resnet.inference.model"
os.environ['CPU_NUM'] = str(4)
train(
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
params_dirname=save_path,
parallel=parallel)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
params_dirname=save_path)
params_dirname=save_path,
parallel=parallel)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
......@@ -16,7 +16,9 @@ from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy
import os
import cifar10_small_test_set
......@@ -68,7 +70,7 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
def train(use_cuda, train_program, parallel, params_dirname):
BATCH_SIZE = 128
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -93,7 +95,10 @@ def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_func)
train_func=train_program,
place=place,
optimizer_func=optimizer_func,
parallel=parallel)
trainer.train(
reader=train_reader,
......@@ -102,10 +107,13 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, params_dirname=None):
def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place)
infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -116,22 +124,31 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
def main(use_cuda, parallel):
save_path = "image_classification_vgg.inference.model"
os.environ['CPU_NUM'] = str(4)
train(
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
params_dirname=save_path,
parallel=parallel)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
params_dirname=save_path)
params_dirname=save_path,
parallel=parallel)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
......@@ -64,14 +64,14 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
def train(use_cuda, train_program, parallel, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program,
place=place,
optimizer_func=optimizer_func,
parallel=True)
parallel=parallel)
def event_handler(event):
if isinstance(event, fluid.EndEpochEvent):
......@@ -108,11 +108,14 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, params_dirname=None):
def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place)
infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -123,20 +126,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results[0])
def main(use_cuda):
def main(use_cuda, parallel):
params_dirname = "recognize_digits_conv.inference.model"
# call train() with is_local argument to run distributed train
os.environ['CPU_NUM'] = str(4)
train(
use_cuda=use_cuda,
train_program=train_program,
params_dirname=params_dirname)
params_dirname=params_dirname,
parallel=parallel)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
params_dirname=params_dirname)
params_dirname=params_dirname,
parallel=parallel)
if __name__ == '__main__':
# for use_cuda in (False, True):
main(use_cuda=core.is_compiled_with_cuda())
for use_cuda in (False, True):
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
......@@ -16,6 +16,7 @@ from __future__ import print_function
import argparse
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle
import sys
import numpy
......@@ -50,11 +51,14 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
def train(use_cuda, train_program, params_dirname, parallel):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_func)
train_func=train_program,
place=place,
optimizer_func=optimizer_func,
parallel=parallel)
def event_handler(event):
if isinstance(event, fluid.EndEpochEvent):
......@@ -86,11 +90,14 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, params_dirname=None):
def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place)
infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -101,20 +108,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results[0])
def main(use_cuda):
def main(use_cuda, parallel):
params_dirname = "recognize_digits_mlp.inference.model"
# call train() with is_local argument to run distributed train
os.environ['CPU_NUM'] = str(4)
train(
use_cuda=use_cuda,
train_program=train_program,
params_dirname=params_dirname)
params_dirname=params_dirname,
parallel=parallel)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
params_dirname=params_dirname)
params_dirname=params_dirname,
parallel=parallel)
if __name__ == '__main__':
# for use_cuda in (False, True):
main(use_cuda=False)
for use_cuda in (False, True):
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
......@@ -438,7 +438,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
# 2 optimize for table adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "adam", "scale", "scale"])
["sum", "scale", "adam", "scale", "scale"])
trainer, _ = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1)
......
......@@ -521,6 +521,20 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out)
print(str(program))
def test_pad2d(self):
program = Program()
with program_guard(program):
input = layers.data(
name="input", shape=[3, 100, 100], dtype="float32")
out = layers.pad2d(
input,
paddings=[1, 2, 3, 4],
mode='reflect',
data_format='NCHW',
name="shape")
self.assertIsNotNone(out)
print(str(program))
def test_prelu(self):
program = Program()
with program_guard(program):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestPad2dOp(OpTest):
def setUp(self):
self.pad_value = 0.0
self.initTestCase()
self.op_type = "pad2d"
self.inputs = {'X': np.random.random(self.shape).astype("float32"), }
self.attrs = {}
self.attrs['paddings'] = np.array(self.paddings).flatten()
self.attrs['pad_value'] = self.pad_value
self.attrs['mode'] = self.mode
self.attrs['data_format'] = self.data_format
if self.data_format == "NCHW":
paddings = [(0, 0), (0, 0), (self.paddings[0], self.paddings[1]),
(self.paddings[2], self.paddings[3])]
else:
paddings = [(0, 0), (self.paddings[0], self.paddings[1]),
(self.paddings[2], self.paddings[3]), (0, 0)]
if self.mode == "constant":
out = np.pad(self.inputs['X'],
paddings,
mode=self.mode,
constant_values=self.pad_value)
else:
out = np.pad(self.inputs['X'], paddings, mode=self.mode)
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X'], 'Out', max_relative_error=0.006)
def initTestCase(self):
self.shape = (2, 3, 4, 4)
self.paddings = [0, 1, 2, 3]
self.mode = "constant"
self.data_format = "NCHW"
self.pad_value = 0.0
class TestCase1(TestPad2dOp):
def initTestCase(self):
self.shape = (2, 3, 4, 4)
self.paddings = [0, 1, 2, 3]
self.mode = "reflect"
self.data_format = "NCHW"
class TestCase2(TestPad2dOp):
def initTestCase(self):
self.shape = (2, 3, 4, 4)
self.paddings = [0, 1, 2, 3]
self.mode = "edge"
self.data_format = "NCHW"
class TestCase3(TestPad2dOp):
def initTestCase(self):
self.shape = (2, 4, 4, 2)
self.paddings = [0, 1, 2, 3]
self.mode = "reflect"
self.data_format = "NHWC"
class TestCase4(TestPad2dOp):
def initTestCase(self):
self.shape = (2, 4, 4, 2)
self.paddings = [0, 1, 2, 3]
self.mode = "edge"
self.data_format = "NHWC"
class TestCase5(TestPad2dOp):
def initTestCase(self):
self.shape = (2, 4, 4, 2)
self.paddings = [0, 1, 2, 3]
self.mode = "constant"
self.pad_value = 1.2
self.data_format = "NHWC"
if __name__ == '__main__':
unittest.main()
......@@ -25,9 +25,9 @@ class TestSamplingIdOp(OpTest):
self.op_type = "sampling_id"
self.use_mkldnn = False
self.init_kernel_type()
self.X = np.random.random((8, 4)).astype('float32')
self.X = np.random.random((100, 10)).astype('float32')
self.inputs = {"X": self.X}
self.Y = np.random.random(8).astype('float32')
self.Y = np.random.random(100).astype('int64')
self.outputs = {'Out': self.Y}
self.attrs = {'max': 1.0, 'min': 0.0, 'seed': 1}
......@@ -36,6 +36,16 @@ class TestSamplingIdOp(OpTest):
y1 = self.out
self.check_output_customized(self.verify_output)
y2 = self.out
# check dtype
assert y1.dtype == np.int64
assert y2.dtype == np.int64
# check output is index ids of inputs
inputs_ids = np.arange(self.X.shape[1])
assert np.isin(y1, inputs_ids).all()
assert np.isin(y2, inputs_ids).all()
self.assertTrue(np.array_equal(y1, y2))
self.assertEqual(len(y1), len(self.Y))
......
......@@ -1390,13 +1390,11 @@ class DistributeTranspiler(object):
inputs={"X": vars2merge},
outputs={"Out": merged_var},
attrs={"use_mkldnn": False})
# TODO(panyx0718): What if it's SELECTED_ROWS.
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
optimize_block.append_op(
type="scale",
inputs={"X": merged_var},
outputs={"Out": merged_var},
attrs={"scale": 1.0 / float(self.trainer_num)})
optimize_block.append_op(
type="scale",
inputs={"X": merged_var},
outputs={"Out": merged_var},
attrs={"scale": 1.0 / float(self.trainer_num)})
return merged_var
def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册